| author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-04 19:20:19 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-08 19:02:26 +0000 |
| commit | 81ad626541db97eb356e2c1d4a20eb2a26a766ab (patch) | |
| tree | 311b6a8987c32b1e1dcbab65c54cfac3fdb56175 /contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp | |
| parent | 5fff09660e06a66bed6482da9c70df328e16bbb6 (diff) | |
| parent | 145449b1e420787bb99721a429341fa6be3adfb6 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp | 989 |
1 file changed, 989 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp new file mode 100644 index 000000000000..c199b6a6cca8 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp @@ -0,0 +1,989 @@ +//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass converts selects to conditional jumps when profitable. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/ScaledNumber.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SizeOpts.h" +#include <algorithm> +#include <memory> +#include <queue> +#include <stack> +#include <string> + +using namespace llvm; + +#define DEBUG_TYPE "select-optimize" + +STATISTIC(NumSelectOptAnalyzed, + "Number of select groups considered for conversion to branch"); +STATISTIC(NumSelectConvertedExpColdOperand, + "Number of select groups converted due to expensive cold operand"); +STATISTIC(NumSelectConvertedHighPred, + "Number of select groups converted due to high-predictability"); +STATISTIC(NumSelectUnPred, + "Number of select groups not converted due to unpredictability"); +STATISTIC(NumSelectColdBB, + "Number of select groups not converted due to cold basic block"); +STATISTIC(NumSelectConvertedLoop, + "Number of select groups converted due to loop-level analysis"); +STATISTIC(NumSelectsConverted, "Number of selects converted"); + +static cl::opt<unsigned> ColdOperandThreshold( + "cold-operand-threshold", + cl::desc("Maximum frequency of path for an operand to be considered cold."), + cl::init(20), cl::Hidden); + +static cl::opt<unsigned> ColdOperandMaxCostMultiplier( + "cold-operand-max-cost-multiplier", + cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " + "slice of a cold operand to be considered inexpensive."), + cl::init(1), cl::Hidden); + +static cl::opt<unsigned> + GainGradientThreshold("select-opti-loop-gradient-gain-threshold", + cl::desc("Gradient gain threshold (%)."), + cl::init(25), cl::Hidden); + +static cl::opt<unsigned> + GainCycleThreshold("select-opti-loop-cycle-gain-threshold", + cl::desc("Minimum gain per loop (in cycles) threshold."), + cl::init(4), cl::Hidden); + +static cl::opt<unsigned> GainRelativeThreshold( + "select-opti-loop-relative-gain-threshold", + cl::desc( + "Minimum relative gain per loop threshold (1/X). 
Defaults to 12.5%"), + cl::init(8), cl::Hidden); + +static cl::opt<unsigned> MispredictDefaultRate( + "mispredict-default-rate", cl::Hidden, cl::init(25), + cl::desc("Default mispredict rate (initialized to 25%).")); + +static cl::opt<bool> + DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden, + cl::init(false), + cl::desc("Disable loop-level heuristics.")); + +namespace { + +class SelectOptimize : public FunctionPass { + const TargetMachine *TM = nullptr; + const TargetSubtargetInfo *TSI; + const TargetLowering *TLI = nullptr; + const TargetTransformInfo *TTI = nullptr; + const LoopInfo *LI; + DominatorTree *DT; + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + ProfileSummaryInfo *PSI; + OptimizationRemarkEmitter *ORE; + TargetSchedModel TSchedModel; + +public: + static char ID; + + SelectOptimize() : FunctionPass(ID) { + initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + } + +private: + // Select groups consist of consecutive select instructions with the same + // condition. + using SelectGroup = SmallVector<SelectInst *, 2>; + using SelectGroups = SmallVector<SelectGroup, 2>; + + using Scaled64 = ScaledNumber<uint64_t>; + + struct CostInfo { + /// Predicated cost (with selects as conditional moves). + Scaled64 PredCost; + /// Non-predicated cost (with selects converted to branches). + Scaled64 NonPredCost; + }; + + // Converts select instructions of a function to conditional jumps when deemed + // profitable. Returns true if at least one select was converted. + bool optimizeSelects(Function &F); + + // Heuristics for determining which select instructions can be profitably + // conveted to branches. Separate heuristics for selects in inner-most loops + // and the rest of code regions (base heuristics for non-inner-most loop + // regions). + void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); + void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); + + // Converts to branches the select groups that were deemed + // profitable-to-convert. + void convertProfitableSIGroups(SelectGroups &ProfSIGroups); + + // Splits selects of a given basic block into select groups. + void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); + + // Determines for which select groups it is profitable converting to branches + // (base and inner-most-loop heuristics). + void findProfitableSIGroupsBase(SelectGroups &SIGroups, + SelectGroups &ProfSIGroups); + void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, + SelectGroups &ProfSIGroups); + + // Determines if a select group should be converted to a branch (base + // heuristics). + bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI); + + // Returns true if there are expensive instructions in the cold value + // operand's (if any) dependence slice of any of the selects of the given + // group. 
+ bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI); + + // For a given source instruction, collect its backwards dependence slice + // consisting of instructions exclusively computed for producing the operands + // of the source instruction. + void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, + bool ForSinking = false); + + // Returns true if the condition of the select is highly predictable. + bool isSelectHighlyPredictable(const SelectInst *SI); + + // Loop-level checks to determine if a non-predicated version (with branches) + // of the given loop is more profitable than its predicated version. + bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); + + // Computes instruction and loop-critical-path costs for both the predicated + // and non-predicated version of the given loop. + bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, + DenseMap<const Instruction *, CostInfo> &InstCostMap, + CostInfo *LoopCost); + + // Returns a set of all the select instructions in the given select groups. + SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups); + + // Returns the latency cost of a given instruction. + Optional<uint64_t> computeInstCost(const Instruction *I); + + // Returns the misprediction cost of a given select when converted to branch. + Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost); + + // Returns the cost of a branch when the prediction is correct. + Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, + const SelectInst *SI); + + // Returns true if the target architecture supports lowering a given select. + bool isSelectKindSupported(SelectInst *SI); +}; +} // namespace + +char SelectOptimize::ID = 0; + +INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, + false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false, + false) + +FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } + +bool SelectOptimize::runOnFunction(Function &F) { + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + TSI = TM->getSubtargetImpl(F); + TLI = TSI->getTargetLowering(); + + // If none of the select types is supported then skip this pass. + // This is an optimization pass. Legality issues will be handled by + // instruction selection. + if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && + !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && + !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) + return false; + + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BPI.reset(new BranchProbabilityInfo(F, *LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + TSchedModel.init(TSI); + + // When optimizing for size, selects are preferable over branches. 
+ if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get())) + return false; + + return optimizeSelects(F); +} + +bool SelectOptimize::optimizeSelects(Function &F) { + // Determine for which select groups it is profitable converting to branches. + SelectGroups ProfSIGroups; + // Base heuristics apply only to non-loops and outer loops. + optimizeSelectsBase(F, ProfSIGroups); + // Separate heuristics for inner-most loops. + optimizeSelectsInnerLoops(F, ProfSIGroups); + + // Convert to branches the select groups that were deemed + // profitable-to-convert. + convertProfitableSIGroups(ProfSIGroups); + + // Code modified if at least one select group was converted. + return !ProfSIGroups.empty(); +} + +void SelectOptimize::optimizeSelectsBase(Function &F, + SelectGroups &ProfSIGroups) { + // Collect all the select groups. + SelectGroups SIGroups; + for (BasicBlock &BB : F) { + // Base heuristics apply only to non-loops and outer loops. + Loop *L = LI->getLoopFor(&BB); + if (L && L->isInnermost()) + continue; + collectSelectGroups(BB, SIGroups); + } + + // Determine for which select groups it is profitable converting to branches. + findProfitableSIGroupsBase(SIGroups, ProfSIGroups); +} + +void SelectOptimize::optimizeSelectsInnerLoops(Function &F, + SelectGroups &ProfSIGroups) { + SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); + // Need to check size on each iteration as we accumulate child loops. + for (unsigned long i = 0; i < Loops.size(); ++i) + for (Loop *ChildL : Loops[i]->getSubLoops()) + Loops.push_back(ChildL); + + for (Loop *L : Loops) { + if (!L->isInnermost()) + continue; + + SelectGroups SIGroups; + for (BasicBlock *BB : L->getBlocks()) + collectSelectGroups(*BB, SIGroups); + + findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); + } +} + +/// If \p isTrue is true, return the true value of \p SI, otherwise return +/// false value of \p SI. If the true/false value of \p SI is defined by any +/// select instructions in \p Selects, look through the defining select +/// instruction until the true/false value is not defined in \p Selects. +static Value * +getTrueOrFalseValue(SelectInst *SI, bool isTrue, + const SmallPtrSet<const Instruction *, 2> &Selects) { + Value *V = nullptr; + for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI); + DefSI = dyn_cast<SelectInst>(V)) { + assert(DefSI->getCondition() == SI->getCondition() && + "The condition of DefSI does not match with SI"); + V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); + } + assert(V && "Failed to get select true/false value"); + return V; +} + +void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { + for (SelectGroup &ASI : ProfSIGroups) { + // The code transformation here is a modified version of the sinking + // transformation in CodeGenPrepare::optimizeSelectInst with a more + // aggressive strategy of which instructions to sink. + // + // TODO: eliminate the redundancy of logic transforming selects to branches + // by removing CodeGenPrepare::optimizeSelectInst and optimizing here + // selects for all cases (with and without profile information). 
+ + // Transform a sequence like this: + // start: + // %cmp = cmp uge i32 %a, %b + // %sel = select i1 %cmp, i32 %c, i32 %d + // + // Into: + // start: + // %cmp = cmp uge i32 %a, %b + // %cmp.frozen = freeze %cmp + // br i1 %cmp.frozen, label %select.true, label %select.false + // select.true: + // br label %select.end + // select.false: + // br label %select.end + // select.end: + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] + // + // %cmp should be frozen, otherwise it may introduce undefined behavior. + // In addition, we may sink instructions that produce %c or %d into the + // destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + + // Find all the instructions that can be soundly sunk to the true/false + // blocks. These are instructions that are computed solely for producing the + // operands of the select instructions in the group and can be sunk without + // breaking the semantics of the LLVM IR (e.g., cannot sink instructions + // with side effects). + SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; + typedef std::stack<Instruction *>::size_type StackSizeType; + StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; + for (SelectInst *SI : ASI) { + // For each select, compute the sinkable dependence chains of the true and + // false operands. + if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) { + std::stack<Instruction *> TrueSlice; + getExclBackwardsSlice(TI, TrueSlice, true); + maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); + TrueSlices.push_back(TrueSlice); + } + if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) { + std::stack<Instruction *> FalseSlice; + getExclBackwardsSlice(FI, FalseSlice, true); + maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); + FalseSlices.push_back(FalseSlice); + } + } + // In the case of multiple select instructions in the same group, the order + // of non-dependent instructions (instructions of different dependence + // slices) in the true/false blocks appears to affect performance. + // Interleaving the slices seems to experimentally be the optimal approach. + // This interleaving scheduling allows for more ILP (with a natural downside + // of increasing a bit register pressure) compared to a simple ordering of + // one whole chain after another. One would expect that this ordering would + // not matter since the scheduling in the backend of the compiler would + // take care of it, but apparently the scheduler fails to deliver optimal + // ILP with a naive ordering here. + SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; + for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { + for (auto &S : TrueSlices) { + if (!S.empty()) { + TrueSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } + for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { + for (auto &S : FalseSlices) { + if (!S.empty()) { + FalseSlicesInterleaved.push_back(S.top()); + S.pop(); + } + } + } + + // We split the block containing the select(s) into two blocks. 
+ SelectInst *SI = ASI.front(); + SelectInst *LastSI = ASI.back(); + BasicBlock *StartBlock = SI->getParent(); + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); + BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); + // Delete the unconditional branch that was just created by the split. + StartBlock->getTerminator()->eraseFromParent(); + + // Move any debug/pseudo instructions that were in-between the select + // group to the newly-created end block. + SmallVector<Instruction *, 2> DebugPseudoINS; + auto DIt = SI->getIterator(); + while (&*DIt != LastSI) { + if (DIt->isDebugOrPseudoInst()) + DebugPseudoINS.push_back(&*DIt); + DIt++; + } + for (auto DI : DebugPseudoINS) { + DI->moveBefore(&*EndBlock->getFirstInsertionPt()); + } + + // These are the new basic blocks for the conditional branch. + // At least one will become an actual new basic block. + BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; + BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; + if (!TrueSlicesInterleaved.empty()) { + TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + TrueBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *TrueInst : TrueSlicesInterleaved) + TrueInst->moveBefore(TrueBranch); + } + if (!FalseSlicesInterleaved.empty()) { + FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(LastSI->getDebugLoc()); + for (Instruction *FalseInst : FalseSlicesInterleaved) + FalseInst->moveBefore(FalseBranch); + } + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + FalseBranch->setDebugLoc(SI->getDebugLoc()); + } + + // Insert the real conditional branch based on the original condition. + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. + BasicBlock *TT, *FT; + if (TrueBlock == nullptr) { + TT = EndBlock; + FT = FalseBlock; + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + TT = TrueBlock; + FT = EndBlock; + FalseBlock = StartBlock; + } else { + TT = TrueBlock; + FT = FalseBlock; + } + IRBuilder<> IB(SI); + auto *CondFr = + IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); + IB.CreateCondBr(CondFr, TT, FT, SI); + + SmallPtrSet<const Instruction *, 2> INS; + INS.insert(ASI.begin(), ASI.end()); + // Use reverse iterator because later select may use the value of the + // earlier select, and we need to propagate value through earlier select + // to get the PHI operand. + for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { + SelectInst *SI = *It; + // The select itself is replaced with a PHI Node. 
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); + PN->takeName(SI); + PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock); + PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); + PN->setDebugLoc(SI->getDebugLoc()); + + SI->replaceAllUsesWith(PN); + SI->eraseFromParent(); + INS.erase(SI); + ++NumSelectsConverted; + } + } +} + +void SelectOptimize::collectSelectGroups(BasicBlock &BB, + SelectGroups &SIGroups) { + BasicBlock::iterator BBIt = BB.begin(); + while (BBIt != BB.end()) { + Instruction *I = &*BBIt++; + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + SelectGroup SIGroup; + SIGroup.push_back(SI); + while (BBIt != BB.end()) { + Instruction *NI = &*BBIt; + SelectInst *NSI = dyn_cast<SelectInst>(NI); + if (NSI && SI->getCondition() == NSI->getCondition()) { + SIGroup.push_back(NSI); + } else if (!NI->isDebugOrPseudoInst()) { + // Debug/pseudo instructions should be skipped and not prevent the + // formation of a select group. + break; + } + ++BBIt; + } + + // If the select type is not supported, no point optimizing it. + // Instruction selection will take care of it. + if (!isSelectKindSupported(SI)) + continue; + + SIGroups.push_back(SIGroup); + } + } +} + +void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups, + SelectGroups &ProfSIGroups) { + for (SelectGroup &ASI : SIGroups) { + ++NumSelectOptAnalyzed; + if (isConvertToBranchProfitableBase(ASI)) + ProfSIGroups.push_back(ASI); + } +} + +void SelectOptimize::findProfitableSIGroupsInnerLoops( + const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { + NumSelectOptAnalyzed += SIGroups.size(); + // For each select group in an inner-most loop, + // a branch is more preferable than a select/conditional-move if: + // i) conversion to branches for all the select groups of the loop satisfies + // loop-level heuristics including reducing the loop's critical path by + // some threshold (see SelectOptimize::checkLoopHeuristics); and + // ii) the total cost of the select group is cheaper with a branch compared + // to its predicated version. The cost is in terms of latency and the cost + // of a select group is the cost of its most expensive select instruction + // (assuming infinite resources and thus fully leveraging available ILP). + + DenseMap<const Instruction *, CostInfo> InstCostMap; + CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()}, + {Scaled64::getZero(), Scaled64::getZero()}}; + if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || + !checkLoopHeuristics(L, LoopCost)) { + return; + } + + for (SelectGroup &ASI : SIGroups) { + // Assuming infinite resources, the cost of a group of instructions is the + // cost of the most expensive instruction of the group. + Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); + for (SelectInst *SI : ASI) { + SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost); + BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost); + } + if (BranchCost < SelectCost) { + OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front()); + OR << "Profitable to convert to branch (loop analysis). BranchCost=" + << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() + << ". "; + ORE->emit(OR); + ++NumSelectConvertedLoop; + ProfSIGroups.push_back(ASI); + } else { + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + ORmiss << "Select is more profitable (loop analysis). 
BranchCost=" + << BranchCost.toString() + << ", SelectCost=" << SelectCost.toString() << ". "; + ORE->emit(ORmiss); + } + } +} + +bool SelectOptimize::isConvertToBranchProfitableBase( + const SmallVector<SelectInst *, 2> &ASI) { + SelectInst *SI = ASI.front(); + OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI); + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI); + + // Skip cold basic blocks. Better to optimize for size for cold blocks. + if (PSI->isColdBlock(SI->getParent(), BFI.get())) { + ++NumSelectColdBB; + ORmiss << "Not converted to branch because of cold basic block. "; + ORE->emit(ORmiss); + return false; + } + + // If unpredictable, branch form is less profitable. + if (SI->getMetadata(LLVMContext::MD_unpredictable)) { + ++NumSelectUnPred; + ORmiss << "Not converted to branch because of unpredictable branch. "; + ORE->emit(ORmiss); + return false; + } + + // If highly predictable, branch form is more profitable, unless a + // predictable select is inexpensive in the target architecture. + if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { + ++NumSelectConvertedHighPred; + OR << "Converted to branch because of highly predictable branch. "; + ORE->emit(OR); + return true; + } + + // Look for expensive instructions in the cold operand's (if any) dependence + // slice of any of the selects in the group. + if (hasExpensiveColdOperand(ASI)) { + ++NumSelectConvertedExpColdOperand; + OR << "Converted to branch because of expensive cold operand."; + ORE->emit(OR); + return true; + } + + ORmiss << "Not profitable to convert to branch (base heuristic)."; + ORE->emit(ORmiss); + return false; +} + +static InstructionCost divideNearest(InstructionCost Numerator, + uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + +bool SelectOptimize::hasExpensiveColdOperand( + const SmallVector<SelectInst *, 2> &ASI) { + bool ColdOperand = false; + uint64_t TrueWeight, FalseWeight, TotalWeight; + if (ASI.front()->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t MinWeight = std::min(TrueWeight, FalseWeight); + TotalWeight = TrueWeight + FalseWeight; + // Is there a path with frequency <ColdOperandThreshold% (default:20%) ? + ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight; + } else if (PSI->hasProfileSummary()) { + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + ORmiss << "Profile data available but missing branch-weights metadata for " + "select instruction. "; + ORE->emit(ORmiss); + } + if (!ColdOperand) + return false; + // Check if the cold path's dependence slice is expensive for any of the + // selects of the group. + for (SelectInst *SI : ASI) { + Instruction *ColdI = nullptr; + uint64_t HotWeight; + if (TrueWeight < FalseWeight) { + ColdI = dyn_cast<Instruction>(SI->getTrueValue()); + HotWeight = FalseWeight; + } else { + ColdI = dyn_cast<Instruction>(SI->getFalseValue()); + HotWeight = TrueWeight; + } + if (ColdI) { + std::stack<Instruction *> ColdSlice; + getExclBackwardsSlice(ColdI, ColdSlice); + InstructionCost SliceCost = 0; + while (!ColdSlice.empty()) { + SliceCost += TTI->getInstructionCost(ColdSlice.top(), + TargetTransformInfo::TCK_Latency); + ColdSlice.pop(); + } + // The colder the cold value operand of the select is the more expensive + // the cmov becomes for computing the cold value operand every time. Thus, + // the colder the cold operand is the more its cost counts. + // Get nearest integer cost adjusted for coldness. 
+ InstructionCost AdjSliceCost = + divideNearest(SliceCost * HotWeight, TotalWeight); + if (AdjSliceCost >= + ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) + return true; + } + } + return false; +} + +// For a given source instruction, collect its backwards dependence slice +// consisting of instructions exclusively computed for the purpose of producing +// the operands of the source instruction. As an approximation +// (sufficiently-accurate in practice), we populate this set with the +// instructions of the backwards dependence slice that only have one-use and +// form an one-use chain that leads to the source instruction. +void SelectOptimize::getExclBackwardsSlice(Instruction *I, + std::stack<Instruction *> &Slice, + bool ForSinking) { + SmallPtrSet<Instruction *, 2> Visited; + std::queue<Instruction *> Worklist; + Worklist.push(I); + while (!Worklist.empty()) { + Instruction *II = Worklist.front(); + Worklist.pop(); + + // Avoid cycles. + if (!Visited.insert(II).second) + continue; + + if (!II->hasOneUse()) + continue; + + // Cannot soundly sink instructions with side-effects. + // Terminator or phi instructions cannot be sunk. + // Avoid sinking other select instructions (should be handled separetely). + if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || + isa<SelectInst>(II) || isa<PHINode>(II))) + continue; + + // Avoid considering instructions with less frequency than the source + // instruction (i.e., avoid colder code regions of the dependence slice). + if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) + continue; + + // Eligible one-use instruction added to the dependence slice. + Slice.push(II); + + // Explore all the operands of the current instruction to expand the slice. + for (unsigned k = 0; k < II->getNumOperands(); ++k) + if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k))) + Worklist.push(OpI); + } +} + +bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) { + uint64_t TrueWeight, FalseWeight; + if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t Max = std::max(TrueWeight, FalseWeight); + uint64_t Sum = TrueWeight + FalseWeight; + if (Sum != 0) { + auto Probability = BranchProbability::getBranchProbability(Max, Sum); + if (Probability > TTI->getPredictableBranchThreshold()) + return true; + } + } + return false; +} + +bool SelectOptimize::checkLoopHeuristics(const Loop *L, + const CostInfo LoopCost[2]) { + // Loop-level checks to determine if a non-predicated version (with branches) + // of the loop is more profitable than its predicated version. + + if (DisableLoopLevelHeuristics) + return true; + + OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", + L->getHeader()->getFirstNonPHI()); + + if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || + LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { + ORmissL << "No select conversion in the loop due to no reduction of loop's " + "critical path. "; + ORE->emit(ORmissL); + return false; + } + + Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, + LoopCost[1].PredCost - LoopCost[1].NonPredCost}; + + // Profitably converting to branches need to reduce the loop's critical path + // by at least some threshold (absolute gain of GainCycleThreshold cycles and + // relative gain of 12.5%). 
+ if (Gain[1] < Scaled64::get(GainCycleThreshold) || + Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) { + Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost; + ORmissL << "No select conversion in the loop due to small reduction of " + "loop's critical path. Gain=" + << Gain[1].toString() + << ", RelativeGain=" << RelativeGain.toString() << "%. "; + ORE->emit(ORmissL); + return false; + } + + // If the loop's critical path involves loop-carried dependences, the gradient + // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). + // This check ensures that the latency reduction for the loop's critical path + // keeps decreasing with sufficient rate beyond the two analyzed loop + // iterations. + if (Gain[1] > Gain[0]) { + Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) / + (LoopCost[1].PredCost - LoopCost[0].PredCost); + if (GradientGain < Scaled64::get(GainGradientThreshold)) { + ORmissL << "No select conversion in the loop due to small gradient gain. " + "GradientGain=" + << GradientGain.toString() << "%. "; + ORE->emit(ORmissL); + return false; + } + } + // If the gain decreases it is not profitable to convert. + else if (Gain[1] < Gain[0]) { + ORmissL + << "No select conversion in the loop due to negative gradient gain. "; + ORE->emit(ORmissL); + return false; + } + + // Non-predicated version of the loop is more profitable than its + // predicated version. + return true; +} + +// Computes instruction and loop-critical-path costs for both the predicated +// and non-predicated version of the given loop. +// Returns false if unable to compute these costs due to invalid cost of loop +// instruction(s). +bool SelectOptimize::computeLoopCosts( + const Loop *L, const SelectGroups &SIGroups, + DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { + const auto &SIset = getSIset(SIGroups); + // Compute instruction and loop-critical-path costs across two iterations for + // both predicated and non-predicated version. + const unsigned Iterations = 2; + for (unsigned Iter = 0; Iter < Iterations; ++Iter) { + // Cost of the loop's critical path. + CostInfo &MaxCost = LoopCost[Iter]; + for (BasicBlock *BB : L->getBlocks()) { + for (const Instruction &I : *BB) { + if (I.isDebugOrPseudoInst()) + continue; + // Compute the predicated and non-predicated cost of the instruction. + Scaled64 IPredCost = Scaled64::getZero(), + INonPredCost = Scaled64::getZero(); + + // Assume infinite resources that allow to fully exploit the available + // instruction-level parallelism. + // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) + for (const Use &U : I.operands()) { + auto UI = dyn_cast<Instruction>(U.get()); + if (!UI) + continue; + if (InstCostMap.count(UI)) { + IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost); + INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost); + } + } + auto ILatency = computeInstCost(&I); + if (!ILatency) { + OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I); + ORmissL << "Invalid instruction cost preventing analysis and " + "optimization of the inner-most loop containing this " + "instruction. "; + ORE->emit(ORmissL); + return false; + } + IPredCost += Scaled64::get(ILatency.getValue()); + INonPredCost += Scaled64::get(ILatency.getValue()); + + // For a select that can be converted to branch, + // compute its cost as a branch (non-predicated cost). 
+ // + // BranchCost = PredictedPathCost + MispredictCost + // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb + // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate + if (SIset.contains(&I)) { + auto SI = dyn_cast<SelectInst>(&I); + + Scaled64 TrueOpCost = Scaled64::getZero(), + FalseOpCost = Scaled64::getZero(); + if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) + if (InstCostMap.count(TI)) + TrueOpCost = InstCostMap[TI].NonPredCost; + if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) + if (InstCostMap.count(FI)) + FalseOpCost = InstCostMap[FI].NonPredCost; + Scaled64 PredictedPathCost = + getPredictedPathCost(TrueOpCost, FalseOpCost, SI); + + Scaled64 CondCost = Scaled64::getZero(); + if (auto *CI = dyn_cast<Instruction>(SI->getCondition())) + if (InstCostMap.count(CI)) + CondCost = InstCostMap[CI].NonPredCost; + Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); + + INonPredCost = PredictedPathCost + MispredictCost; + } + + InstCostMap[&I] = {IPredCost, INonPredCost}; + MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost); + MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost); + } + } + } + return true; +} + +SmallPtrSet<const Instruction *, 2> +SelectOptimize::getSIset(const SelectGroups &SIGroups) { + SmallPtrSet<const Instruction *, 2> SIset; + for (const SelectGroup &ASI : SIGroups) + for (const SelectInst *SI : ASI) + SIset.insert(SI); + return SIset; +} + +Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) { + InstructionCost ICost = + TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency); + if (auto OC = ICost.getValue()) + return Optional<uint64_t>(*OC); + return Optional<uint64_t>(None); +} + +ScaledNumber<uint64_t> +SelectOptimize::getMispredictionCost(const SelectInst *SI, + const Scaled64 CondCost) { + uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; + + // Account for the default misprediction rate when using a branch + // (conservatively set to 25% by default). + uint64_t MispredictRate = MispredictDefaultRate; + // If the select condition is obviously predictable, then the misprediction + // rate is zero. + if (isSelectHighlyPredictable(SI)) + MispredictRate = 0; + + // CondCost is included to account for cases where the computation of the + // condition is part of a long dependence chain (potentially loop-carried) + // that would delay detection of a misprediction and increase its cost. + Scaled64 MispredictCost = + std::max(Scaled64::get(MispredictPenalty), CondCost) * + Scaled64::get(MispredictRate); + MispredictCost /= Scaled64::get(100); + + return MispredictCost; +} + +// Returns the cost of a branch when the prediction is correct. +// TrueCost * TrueProbability + FalseCost * FalseProbability. +ScaledNumber<uint64_t> +SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, + const SelectInst *SI) { + Scaled64 PredPathCost; + uint64_t TrueWeight, FalseWeight; + if (SI->extractProfMetadata(TrueWeight, FalseWeight)) { + uint64_t SumWeight = TrueWeight + FalseWeight; + if (SumWeight != 0) { + PredPathCost = TrueCost * Scaled64::get(TrueWeight) + + FalseCost * Scaled64::get(FalseWeight); + PredPathCost /= Scaled64::get(SumWeight); + return PredPathCost; + } + } + // Without branch weight metadata, we assume 75% for the one path and 25% for + // the other, and pick the result with the biggest cost. 
+ PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost, + FalseCost * Scaled64::get(3) + TrueCost); + PredPathCost /= Scaled64::get(4); + return PredPathCost; +} + +bool SelectOptimize::isSelectKindSupported(SelectInst *SI) { + bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); + if (VectorCond) + return false; + TargetLowering::SelectSupportKind SelectKind; + if (SI->getType()->isVectorTy()) + SelectKind = TargetLowering::ScalarCondVectorVal; + else + SelectKind = TargetLowering::ScalarValSelect; + return TLI->isSelectSupported(SelectKind); +} |
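
The profitability machinery in this file reduces to a handful of arithmetic rules. The sketches below restate them as self-contained C++, with plain ints and doubles standing in for `ScaledNumber<uint64_t>` and `InstructionCost`; all sample weights and latencies are illustrative, not taken from the commit. First, the sinking order: `convertProfitableSIGroups` interleaves the true/false dependence slices of a select group rather than emitting one whole chain after another, to expose more ILP. A minimal sketch of that interleaving, with ints standing in for `Instruction *`:

```cpp
#include <iostream>
#include <stack>
#include <vector>

// Emit one element from the top of every slice per round, as in
// convertProfitableSIGroups(); ints stand in for Instruction*.
std::vector<int> interleaveSlices(std::vector<std::stack<int>> Slices) {
  std::vector<int> Order;
  bool Emitted = true;
  while (Emitted) {
    Emitted = false;
    for (std::stack<int> &S : Slices) {
      if (!S.empty()) {
        Order.push_back(S.top());
        S.pop();
        Emitted = true;
      }
    }
  }
  return Order;
}

int main() {
  std::stack<int> A, B;
  for (int V : {3, 2, 1}) A.push(V); // slice A: 1, 2, 3 (top first)
  for (int V : {5, 4}) B.push(V);    // slice B: 4, 5 (top first)
  for (int V : interleaveSlices({A, B}))
    std::cout << V << ' '; // prints 1 4 2 5 3, not 1 2 3 4 5
}
```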
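`hasExpensiveColdOperand` first asks whether either value operand is cold, i.e. whether the less-taken path falls below `ColdOperandThreshold` percent (default 20). The comparison stays in integer arithmetic, exactly as in the pass; a sketch with made-up branch weights:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of the cold-operand test in hasExpensiveColdOperand():
//   TotalWeight * ColdOperandThreshold > 100 * MinWeight
bool isColdOperand(uint64_t TrueWeight, uint64_t FalseWeight,
                   uint64_t ColdOperandThreshold = 20) {
  uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
  uint64_t TotalWeight = TrueWeight + FalseWeight;
  return TotalWeight * ColdOperandThreshold > 100 * MinWeight;
}

int main() {
  // 90/10 split: the false path runs 10% of the time, below 20% -> cold.
  std::cout << isColdOperand(90, 10) << '\n'; // 1
  // 60/40 split: neither path is below 20% -> not cold.
  std::cout << isColdOperand(60, 40) << '\n'; // 0
}
```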
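If an operand is cold, the latency of its exclusive backwards slice is scaled by how hot the other path is, using round-to-nearest integer division, and compared against `ColdOperandMaxCostMultiplier * TCC_Expensive`. A sketch; the value `TCC_Expensive == 4` is an assumption about `TargetTransformInfo` defaults, not something shown in the diff:

```cpp
#include <cstdint>
#include <iostream>

// divideNearest() is round-to-nearest integer division, as in the pass.
uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

int main() {
  // A dependence-slice latency of 6 with a 95/5 hot/cold split: the colder
  // the operand, the closer the adjusted cost stays to the full slice cost.
  uint64_t SliceCost = 6, HotWeight = 95, TotalWeight = 100;
  uint64_t AdjSliceCost = divideNearest(SliceCost * HotWeight, TotalWeight);
  std::cout << AdjSliceCost << '\n'; // 6 (570/100 rounded to nearest)
  // With the default multiplier of 1 and an assumed TCC_Expensive of 4,
  // 6 >= 1 * 4, so this select group would be converted to a branch.
}
```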
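`isSelectHighlyPredictable` compares the skew of the branch weights against `TTI->getPredictableBranchThreshold()`. A sketch, assuming a 99% threshold (the diff only shows the TTI query, not the target's value):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of isSelectHighlyPredictable(): the dominant branch weight must
// exceed the target's predictable-branch threshold. 0.99 is an assumed
// default; real targets answer via TTI->getPredictableBranchThreshold().
bool isHighlyPredictable(uint64_t TrueWeight, uint64_t FalseWeight,
                         double Threshold = 0.99) {
  uint64_t Max = std::max(TrueWeight, FalseWeight);
  uint64_t Sum = TrueWeight + FalseWeight;
  return Sum != 0 && static_cast<double>(Max) / Sum > Threshold;
}

int main() {
  std::cout << isHighlyPredictable(1000, 1) << '\n'; // 1 (~99.9% one way)
  std::cout << isHighlyPredictable(3, 1) << '\n';    // 0 (75% one way)
}
```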
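For inner-most loops, `computeLoopCosts` walks two virtual iterations and, assuming infinite resources, costs each instruction as its own latency plus the most expensive of its operands; the loop's critical path is the maximum over all instruction costs. The same recurrence over a toy three-instruction block:

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  double Latency;
  std::vector<std::string> Ops; // operands defined earlier in the block
};

int main() {
  // A three-instruction chain: mul feeds add, add feeds the select.
  std::vector<Inst> Block = {
      {"mul", 3, {}}, {"add", 1, {"mul"}}, {"sel", 1, {"add"}}};
  std::map<std::string, double> Cost;
  double CriticalPath = 0;
  for (const Inst &I : Block) {
    double OpMax = 0;
    for (const std::string &Op : I.Ops)
      OpMax = std::max(OpMax, Cost[Op]);
    Cost[I.Name] = I.Latency + OpMax; // InstCost = Latency + max(OpCosts)
    CriticalPath = std::max(CriticalPath, Cost[I.Name]);
  }
  std::cout << CriticalPath << '\n'; // 5: mul(3) -> add(4) -> sel(5)
}
```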
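The non-predicated cost of a select adds a misprediction term, `getMispredictionCost`: the larger of the machine's mispredict penalty and the condition's cost, scaled by the mispredict rate (`MispredictDefaultRate`, 25% by default; 0% for highly predictable selects). A plain-arithmetic sketch, with doubles standing in for `ScaledNumber`:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate / 100
double mispredictionCost(double MispredictPenalty, double CondCost,
                         bool HighlyPredictable,
                         uint64_t MispredictDefaultRate = 25) {
  // Obviously-predictable selects are modeled with a 0% mispredict rate.
  uint64_t MispredictRate = HighlyPredictable ? 0 : MispredictDefaultRate;
  return std::max(MispredictPenalty, CondCost) * MispredictRate / 100.0;
}

int main() {
  // A hypothetical 14-cycle penalty and a cheap condition: 14 * 25% = 3.5.
  std::cout << mispredictionCost(14.0, 2.0, false) << '\n'; // 3.5
  // A long condition chain (cost 20) dominates: 20 * 25% = 5.
  std::cout << mispredictionCost(14.0, 20.0, false) << '\n'; // 5
}
```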
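The other half of the branch cost is `getPredictedPathCost`: a weighted average of the two operand costs when branch weights exist, otherwise an assumed 75%/25% split that keeps the more expensive of the two assignments. The same arithmetic in doubles:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of getPredictedPathCost() with doubles in place of
// ScaledNumber<uint64_t>.
double predictedPathCost(double TrueCost, double FalseCost,
                         uint64_t TrueWeight = 0, uint64_t FalseWeight = 0) {
  uint64_t Sum = TrueWeight + FalseWeight;
  if (Sum != 0) // weighted average over the profiled split
    return (TrueCost * TrueWeight + FalseCost * FalseWeight) / Sum;
  // No profile: assume 75%/25% and take the worse of the two assignments.
  return std::max(TrueCost * 3 + FalseCost, FalseCost * 3 + TrueCost) / 4;
}

int main() {
  // Profiled 90/10 split: 5 * 0.9 + 1 * 0.1 = 4.6 cycles.
  std::cout << predictedPathCost(5, 1, 90, 10) << '\n'; // 4.6
  // No profile: max(5*3+1, 1*3+5) / 4 = 16 / 4 = 4 cycles.
  std::cout << predictedPathCost(5, 1) << '\n'; // 4
}
```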
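Finally, `checkLoopHeuristics` gates the whole loop-level conversion on the computed critical paths of the two analyzed iterations. A sketch of the decision with the diff's default thresholds (`GainCycleThreshold`=4, `GainRelativeThreshold`=8, i.e. 12.5%, `GainGradientThreshold`=25); the sample costs are invented:

```cpp
#include <iostream>

// Pred/NonPred are the loop's critical-path costs with selects kept as
// conditional moves vs. converted to branches; index 0/1 are the first
// and second analyzed iterations.
bool convertLoopSelects(const double Pred[2], const double NonPred[2]) {
  if (NonPred[0] > Pred[0] || NonPred[1] >= Pred[1])
    return false; // branches do not shorten the critical path
  double Gain[2] = {Pred[0] - NonPred[0], Pred[1] - NonPred[1]};
  // Absolute gain of at least 4 cycles and relative gain of at least
  // 1/8 = 12.5% of the predicated critical path.
  if (Gain[1] < 4 || Gain[1] * 8 < Pred[1])
    return false;
  if (Gain[1] > Gain[0]) {
    // Loop-carried critical path: the gain must keep growing at a
    // sufficient rate (gradient of at least 25%).
    double GradientGain = 100 * (Gain[1] - Gain[0]) / (Pred[1] - Pred[0]);
    if (GradientGain < 25)
      return false;
  } else if (Gain[1] < Gain[0]) {
    return false; // shrinking gain: not profitable
  }
  return true;
}

int main() {
  double Pred[2] = {20, 40}, NonPred[2] = {14, 28};
  // Gain = {6, 12}; relative gain 12/40 = 30%; gradient 100*6/20 = 30%.
  std::cout << convertLoopSelects(Pred, NonPred) << '\n'; // 1
}
```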