diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 | 
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000 | 
| commit | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch) | |
| tree | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/CodeGen/MachineBlockPlacement.cpp | |
| parent | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff) | |
Notes
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
| -rw-r--r-- | llvm/lib/CodeGen/MachineBlockPlacement.cpp | 298 | 
1 files changed, 259 insertions, 39 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 30b98ec88c24..783d22fafee9 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -346,7 +346,7 @@ class MachineBlockPlacement : public MachineFunctionPass {    const MachineBranchProbabilityInfo *MBPI;    /// A handle to the function-wide block frequency pass. -  std::unique_ptr<BranchFolder::MBFIWrapper> MBFI; +  std::unique_ptr<MBFIWrapper> MBFI;    /// A handle to the loop info.    MachineLoopInfo *MLI; @@ -374,6 +374,9 @@ class MachineBlockPlacement : public MachineFunctionPass {    /// must be done inline.    TailDuplicator TailDup; +  /// Partial tail duplication threshold. +  BlockFrequency DupThreshold; +    /// Allocator and owner of BlockChain structures.    ///    /// We build BlockChains lazily while processing the loop structure of @@ -399,6 +402,10 @@ class MachineBlockPlacement : public MachineFunctionPass {    SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;  #endif +  /// Scale the DupThreshold according to basic block size. +  BlockFrequency scaleThreshold(MachineBasicBlock *BB); +  void initDupThreshold(); +    /// Decrease the UnscheduledPredecessors count for all blocks in chain, and    /// if the count goes to 0, add them to the appropriate work list.    void markChainSuccessors( @@ -421,6 +428,11 @@ class MachineBlockPlacement : public MachineFunctionPass {        const MachineBasicBlock *BB, const MachineBasicBlock *Succ,        const BlockChain &Chain, const BlockFilterSet *BlockFilter,        BranchProbability SuccProb, BranchProbability HotProb); +  bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, +                       BlockFilterSet *BlockFilter); +  void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates, +                               MachineBasicBlock *BB, +                               BlockFilterSet *BlockFilter);    bool repeatedlyTailDuplicateBlock(        MachineBasicBlock *BB, MachineBasicBlock *&LPred,        const MachineBasicBlock *LoopHeaderBB, @@ -1141,6 +1153,11 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(    if (NumDup == 0)      return false; +  // If profile information is available, findDuplicateCandidates can do more +  // precise benefit analysis. +  if (F->getFunction().hasProfileData()) +    return true; +    // This is mainly for function exit BB.    // The integrated tail duplication is really designed for increasing    // fallthrough from predecessors from Succ to its successors. We may need @@ -1169,9 +1186,6 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(    //    // A small number of extra duplication may not hurt too much. We need a better    // heuristic to handle it. -  // -  // FIXME: we should selectively tail duplicate a BB into part of its -  // predecessors.    if ((NumDup > Succ->succ_size()) || !Duplicate)      return false; @@ -1556,7 +1570,7 @@ MachineBlockPlacement::selectBestSuccessor(    // For blocks with CFG violations, we may be able to lay them out anyway with    // tail-duplication. We keep this vector so we can perform the probability    // calculations the minimum number of times. -  SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4> +  SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4>        DupCandidates;    for (MachineBasicBlock *Succ : Successors) {      auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); @@ -1570,7 +1584,7 @@ MachineBlockPlacement::selectBestSuccessor(                                     Chain, BlockFilter)) {        // If tail duplication would make Succ profitable, place it.        if (allowTailDupPlacement() && shouldTailDuplicate(Succ)) -        DupCandidates.push_back(std::make_tuple(SuccProb, Succ)); +        DupCandidates.emplace_back(SuccProb, Succ);        continue;      } @@ -1799,11 +1813,11 @@ void MachineBlockPlacement::buildChain(      // Placement may have changed tail duplication opportunities.      // Check for that now.      if (allowTailDupPlacement() && BestSucc && ShouldTailDup) { -      // If the chosen successor was duplicated into all its predecessors, -      // don't bother laying it out, just go round the loop again with BB as -      // the chain end. -      if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, -                                       BlockFilter, PrevUnplacedBlockIt)) +      repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain, +                                       BlockFilter, PrevUnplacedBlockIt); +      // If the chosen successor was duplicated into BB, don't bother laying +      // it out, just go round the loop again with BB as the chain end. +      if (!BB->isSuccessor(BestSucc))          continue;      } @@ -2082,8 +2096,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,    // In practice this never happens though: there always seems to be a preheader    // that can fallthrough and that is also placed before the header.    bool OptForSize = F->getFunction().hasOptSize() || -                    llvm::shouldOptimizeForSize(L.getHeader(), PSI, -                                                &MBFI->getMBFI()); +                    llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());    if (OptForSize)      return L.getHeader(); @@ -2616,7 +2629,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {  void MachineBlockPlacement::buildCFGChains() {    // Ensure that every BB in the function has an associated chain to simplify    // the assumptions of the remaining algorithm. -  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. +  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.    for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;         ++FI) {      MachineBasicBlock *BB = &*FI; @@ -2626,7 +2639,7 @@ void MachineBlockPlacement::buildCFGChains() {      // the exact fallthrough behavior for.      while (true) {        Cond.clear(); -      MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. +      MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.        if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())          break; @@ -2690,6 +2703,20 @@ void MachineBlockPlacement::buildCFGChains() {      assert(!BadFunc && "Detected problems with the block placement.");    }); +  // Remember original layout ordering, so we can update terminators after +  // reordering to point to the original layout successor. +  SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors( +      F->getNumBlockIDs()); +  { +    MachineBasicBlock *LastMBB = nullptr; +    for (auto &MBB : *F) { +      if (LastMBB != nullptr) +        OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB; +      LastMBB = &MBB; +    } +    OriginalLayoutSuccessors[F->back().getNumber()] = nullptr; +  } +    // Splice the blocks into place.    MachineFunction::iterator InsertPos = F->begin();    LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n"); @@ -2711,7 +2738,7 @@ void MachineBlockPlacement::buildCFGChains() {      // than assert when the branch cannot be analyzed in order to remove this      // boiler plate.      Cond.clear(); -    MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. +    MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.  #ifndef NDEBUG      if (!BlocksWithUnanalyzableExits.count(PrevBB)) { @@ -2747,15 +2774,18 @@ void MachineBlockPlacement::buildCFGChains() {      //     TBB = FBB = nullptr;      //   }      // } -    if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) -      PrevBB->updateTerminator(); +    if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) { +      PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); +    }    }    // Fixup the last block.    Cond.clear(); -  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. -  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) -    F->back().updateTerminator(); +  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch. +  if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) { +    MachineBasicBlock *PrevBB = &F->back(); +    PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]); +  }    BlockWorkList.clear();    EHPadWorkList.clear(); @@ -2763,17 +2793,17 @@ void MachineBlockPlacement::buildCFGChains() {  void MachineBlockPlacement::optimizeBranches() {    BlockChain &FunctionChain = *BlockToChain[&F->front()]; -  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. +  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.    // Now that all the basic blocks in the chain have the proper layout, -  // make a final call to AnalyzeBranch with AllowModify set. +  // make a final call to analyzeBranch with AllowModify set.    // Indeed, the target may be able to optimize the branches in a way we    // cannot because all branches may not be analyzable.    // E.g., the target may be able to remove an unconditional branch to    // a fallthrough when it occurs after predicated terminators.    for (MachineBasicBlock *ChainBB : FunctionChain) {      Cond.clear(); -    MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. +    MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.      if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {        // If PrevBB has a two-way branch, try to re-order the branches        // such that we branch to the successor with higher probability first. @@ -2789,7 +2819,6 @@ void MachineBlockPlacement::optimizeBranches() {          DebugLoc dl; // FIXME: this is nowhere          TII->removeBranch(*ChainBB);          TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl); -        ChainBB->updateTerminator();        }      }    } @@ -2841,7 +2870,7 @@ void MachineBlockPlacement::alignBlocks() {        continue;      // If the global profiles indicates so, don't align it. -    if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) && +    if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&          !TLI->alignLoopsWithOptSize())        continue; @@ -2901,10 +2930,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(    // duplicated into is still small enough to be duplicated again.    // No need to call markBlockSuccessors in this case, as the blocks being    // duplicated from here on are already scheduled. -  // Note that DuplicatedToLPred always implies Removed. -  while (DuplicatedToLPred) { -    assert(Removed && "Block must have been removed to be duplicated into its " -           "layout predecessor."); +  while (DuplicatedToLPred && Removed) {      MachineBasicBlock *DupBB, *DupPred;      // The removal callback causes Chain.end() to be updated when a block is      // removed. On the first pass through the loop, the chain end should be the @@ -2943,8 +2969,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(  ///                          chosen in the given order due to unnatural CFG  ///                          only needed if \p BB is removed and  ///                          \p PrevUnplacedBlockIt pointed to \p BB. -/// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will -///                        only be true if the block was removed. +/// \p DuplicatedToLPred - True if the block was duplicated into LPred.  /// \return  - True if the block was duplicated into all preds and removed.  bool MachineBlockPlacement::maybeTailDuplicateBlock(      MachineBasicBlock *BB, MachineBasicBlock *LPred, @@ -3012,8 +3037,18 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(    SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;    bool IsSimple = TailDup.isSimpleBB(BB); -  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, -                                 &DuplicatedPreds, &RemovalCallbackRef); +  SmallVector<MachineBasicBlock *, 8> CandidatePreds; +  SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr; +  if (F->getFunction().hasProfileData()) { +    // We can do partial duplication with precise profile information. +    findDuplicateCandidates(CandidatePreds, BB, BlockFilter); +    if (CandidatePreds.size() == 0) +      return false; +    if (CandidatePreds.size() < BB->pred_size()) +      CandidatePtr = &CandidatePreds; +  } +  TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds, +                                 &RemovalCallbackRef, CandidatePtr);    // Update UnscheduledPredecessors to reflect tail-duplication.    DuplicatedToLPred = false; @@ -3036,6 +3071,191 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(    return Removed;  } +// Count the number of actual machine instructions. +static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { +  uint64_t InstrCount = 0; +  for (MachineInstr &MI : *MBB) { +    if (!MI.isPHI() && !MI.isMetaInstruction()) +      InstrCount += 1; +  } +  return InstrCount; +} + +// The size cost of duplication is the instruction size of the duplicated block. +// So we should scale the threshold accordingly. But the instruction size is not +// available on all targets, so we use the number of instructions instead. +BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) { +  return DupThreshold.getFrequency() * countMBBInstruction(BB); +} + +// Returns true if BB is Pred's best successor. +bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, +                                            MachineBasicBlock *Pred, +                                            BlockFilterSet *BlockFilter) { +  if (BB == Pred) +    return false; +  if (BlockFilter && !BlockFilter->count(Pred)) +    return false; +  BlockChain *PredChain = BlockToChain[Pred]; +  if (PredChain && (Pred != *std::prev(PredChain->end()))) +    return false; + +  // Find the successor with largest probability excluding BB. +  BranchProbability BestProb = BranchProbability::getZero(); +  for (MachineBasicBlock *Succ : Pred->successors()) +    if (Succ != BB) { +      if (BlockFilter && !BlockFilter->count(Succ)) +        continue; +      BlockChain *SuccChain = BlockToChain[Succ]; +      if (SuccChain && (Succ != *SuccChain->begin())) +        continue; +      BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ); +      if (SuccProb > BestProb) +        BestProb = SuccProb; +    } + +  BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB); +  if (BBProb <= BestProb) +    return false; + +  // Compute the number of reduced taken branches if Pred falls through to BB +  // instead of another successor. Then compare it with threshold. +  BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); +  BlockFrequency Gain = PredFreq * (BBProb - BestProb); +  return Gain > scaleThreshold(BB); +} + +// Find out the predecessors of BB and BB can be beneficially duplicated into +// them. +void MachineBlockPlacement::findDuplicateCandidates( +    SmallVectorImpl<MachineBasicBlock *> &Candidates, +    MachineBasicBlock *BB, +    BlockFilterSet *BlockFilter) { +  MachineBasicBlock *Fallthrough = nullptr; +  BranchProbability DefaultBranchProb = BranchProbability::getZero(); +  BlockFrequency BBDupThreshold(scaleThreshold(BB)); +  SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end()); +  SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end()); + +  // Sort for highest frequency. +  auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) { +    return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B); +  }; +  auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) { +    return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); +  }; +  llvm::stable_sort(Succs, CmpSucc); +  llvm::stable_sort(Preds, CmpPred); + +  auto SuccIt = Succs.begin(); +  if (SuccIt != Succs.end()) { +    DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl(); +  } + +  // For each predecessors of BB, compute the benefit of duplicating BB, +  // if it is larger than the threshold, add it into Candidates. +  // +  // If we have following control flow. +  // +  //     PB1 PB2 PB3 PB4 +  //      \   |  /    /\ +  //       \  | /    /  \ +  //        \ |/    /    \ +  //         BB----/     OB +  //         /\ +  //        /  \ +  //      SB1 SB2 +  // +  // And it can be partially duplicated as +  // +  //   PB2+BB +  //      |  PB1 PB3 PB4 +  //      |   |  /    /\ +  //      |   | /    /  \ +  //      |   |/    /    \ +  //      |  BB----/     OB +  //      |\ /| +  //      | X | +  //      |/ \| +  //     SB2 SB1 +  // +  // The benefit of duplicating into a predecessor is defined as +  //         Orig_taken_branch - Duplicated_taken_branch +  // +  // The Orig_taken_branch is computed with the assumption that predecessor +  // jumps to BB and the most possible successor is laid out after BB. +  // +  // The Duplicated_taken_branch is computed with the assumption that BB is +  // duplicated into PB, and one successor is layout after it (SB1 for PB1 and +  // SB2 for PB2 in our case). If there is no available successor, the combined +  // block jumps to all BB's successor, like PB3 in this example. +  // +  // If a predecessor has multiple successors, so BB can't be duplicated into +  // it. But it can beneficially fall through to BB, and duplicate BB into other +  // predecessors. +  for (MachineBasicBlock *Pred : Preds) { +    BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + +    if (!TailDup.canTailDuplicate(BB, Pred)) { +      // BB can't be duplicated into Pred, but it is possible to be layout +      // below Pred. +      if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) { +        Fallthrough = Pred; +        if (SuccIt != Succs.end()) +          SuccIt++; +      } +      continue; +    } + +    BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb; +    BlockFrequency DupCost; +    if (SuccIt == Succs.end()) { +      // Jump to all successors; +      if (Succs.size() > 0) +        DupCost += PredFreq; +    } else { +      // Fallthrough to *SuccIt, jump to all other successors; +      DupCost += PredFreq; +      DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt); +    } + +    assert(OrigCost >= DupCost); +    OrigCost -= DupCost; +    if (OrigCost > BBDupThreshold) { +      Candidates.push_back(Pred); +      if (SuccIt != Succs.end()) +        SuccIt++; +    } +  } + +  // No predecessors can optimally fallthrough to BB. +  // So we can change one duplication into fallthrough. +  if (!Fallthrough) { +    if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) { +      Candidates[0] = Candidates.back(); +      Candidates.pop_back(); +    } +  } +} + +void MachineBlockPlacement::initDupThreshold() { +  DupThreshold = 0; +  if (!F->getFunction().hasProfileData()) +    return; + +  BlockFrequency MaxFreq = 0; +  for (MachineBasicBlock &MBB : *F) { +    BlockFrequency Freq = MBFI->getBlockFreq(&MBB); +    if (Freq > MaxFreq) +      MaxFreq = Freq; +  } + +  // FIXME: we may use profile count instead of frequency, +  // and need more fine tuning. +  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); +  DupThreshold = MaxFreq * ThresholdProb; +} +  bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {    if (skipFunction(MF.getFunction()))      return false; @@ -3046,7 +3266,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {    F = &MF;    MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); -  MBFI = std::make_unique<BranchFolder::MBFIWrapper>( +  MBFI = std::make_unique<MBFIWrapper>(        getAnalysis<MachineBlockFrequencyInfo>());    MLI = &getAnalysis<MachineLoopInfo>();    TII = MF.getSubtarget().getInstrInfo(); @@ -3054,6 +3274,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {    MPDT = nullptr;    PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); +  initDupThreshold(); +    // Initialize PreferredLoopExit to nullptr here since it may never be set if    // there are no MachineLoops.    PreferredLoopExit = nullptr; @@ -3088,7 +3310,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {      if (OptForSize)        TailDupSize = 1;      bool PreRegAlloc = false; -    TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI, +    TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,                     /* LayoutMode */ true, TailDupSize);      precomputeTriangleChains();    } @@ -3107,9 +3329,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {      BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,                      *MBPI, PSI, TailMergeSize); -    auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>(); -    if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), -                            MMIWP ? &MMIWP->getMMI() : nullptr, MLI, +    if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,                              /*AfterPlacement=*/true)) {        // Redo the layout if tail merging creates/removes/moves blocks.        BlockToChain.clear();  | 
