diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/IPO')
3 files changed, 99 insertions, 39 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp index 49babc24cb82..10abea7ebd32 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -92,6 +93,18 @@ static cl::opt<bool> DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +static cl::opt<int> IntraSCCCostMultiplier( + "intra-scc-cost-multiplier", cl::init(2), cl::Hidden, + cl::desc( + "Cost multiplier to multiply onto inlined call sites where the " + "new call was previously an intra-SCC call (not relevant when the " + "original call was already intra-SCC). This can accumulate over " + "multiple inlinings (e.g. if a call site already had a cost " + "multiplier and one of its inlined calls was also subject to " + "this, the inlined call would have the original multiplier " + "multiplied by intra-scc-cost-multiplier). This is to prevent tons of " + "inlining through a child SCC which can cause terrible compile times")); + /// A flag for test, so we can print the content of the advisor when running it /// as part of the default (e.g. -O3) pipeline. static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing", @@ -876,8 +889,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // trigger infinite inlining, much like is prevented within the inliner // itself by the InlineHistory above, but spread across CGSCC iterations // and thus hidden from the full inline history. - if (CG.lookupSCC(*CG.lookup(Callee)) == C && - UR.InlinedInternalEdges.count({&N, C})) { + LazyCallGraph::SCC *CalleeSCC = CG.lookupSCC(*CG.lookup(Callee)); + if (CalleeSCC == C && UR.InlinedInternalEdges.count({&N, C})) { LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node " "previously split out of this SCC by inlining: " << F.getName() << " -> " << Callee.getName() << "\n"); @@ -897,6 +910,11 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, continue; } + int CBCostMult = + getStringFnAttrAsInt( + *CB, InlineConstants::FunctionInlineCostMultiplierAttributeName) + .getValueOr(1); + // Setup the data structure used to plumb customization into the // `InlineFunction` routine. InlineFunctionInfo IFI( @@ -935,9 +953,28 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (tryPromoteCall(*ICB)) NewCallee = ICB->getCalledFunction(); } - if (NewCallee) - if (!NewCallee->isDeclaration()) + if (NewCallee) { + if (!NewCallee->isDeclaration()) { Calls->push({ICB, NewHistoryID}); + // Continually inlining through an SCC can result in huge compile + // times and bloated code since we arbitrarily stop at some point + // when the inliner decides it's not profitable to inline anymore. + // We attempt to mitigate this by making these calls exponentially + // more expensive. + // This doesn't apply to calls in the same SCC since if we do + // inline through the SCC the function will end up being + // self-recursive which the inliner bails out on, and inlining + // within an SCC is necessary for performance. + if (CalleeSCC != C && + CalleeSCC == CG.lookupSCC(CG.get(*NewCallee))) { + Attribute NewCBCostMult = Attribute::get( + M.getContext(), + InlineConstants::FunctionInlineCostMultiplierAttributeName, + itostr(CBCostMult * IntraSCCCostMultiplier)); + ICB->addFnAttr(NewCBCostMult); + } + } + } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 5113c0c67acc..7205ae178d21 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3712,9 +3712,9 @@ struct AAKernelInfoFunction : AAKernelInfo { // __kmpc_get_hardware_num_threads_in_block(); // WarpSize = __kmpc_get_warp_size(); // BlockSize = BlockHwSize - WarpSize; - // if (InitCB >= BlockSize) return; - // IsWorkerCheckBB: bool IsWorker = InitCB >= 0; + // IsWorkerCheckBB: bool IsWorker = InitCB != -1; // if (IsWorker) { + // if (InitCB >= BlockSize) return; // SMBeginBB: __kmpc_barrier_simple_generic(...); // void *WorkFn; // bool Active = __kmpc_kernel_parallel(&WorkFn); @@ -3771,6 +3771,13 @@ struct AAKernelInfoFunction : AAKernelInfo { ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); InitBB->getTerminator()->eraseFromParent(); + Instruction *IsWorker = + ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, + ConstantInt::get(KernelInitCB->getType(), -1), + "thread.is_worker", InitBB); + IsWorker->setDebugLoc(DLoc); + BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB); + Module &M = *Kernel->getParent(); auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); FunctionCallee BlockHwSizeFn = @@ -3780,29 +3787,22 @@ struct AAKernelInfoFunction : AAKernelInfo { OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( M, OMPRTL___kmpc_get_warp_size); CallInst *BlockHwSize = - CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB); + CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize); BlockHwSize->setDebugLoc(DLoc); - CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB); + CallInst *WarpSize = + CallInst::Create(WarpSizeFn, "warp.size", IsWorkerCheckBB); OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize); WarpSize->setDebugLoc(DLoc); - Instruction *BlockSize = - BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB); + Instruction *BlockSize = BinaryOperator::CreateSub( + BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB); BlockSize->setDebugLoc(DLoc); - Instruction *IsMainOrWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, - BlockSize, "thread.is_main_or_worker", InitBB); + Instruction *IsMainOrWorker = ICmpInst::Create( + ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize, + "thread.is_main_or_worker", IsWorkerCheckBB); IsMainOrWorker->setDebugLoc(DLoc); - BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker, - InitBB); - - Instruction *IsWorker = - ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, - ConstantInt::get(KernelInitCB->getType(), -1), - "thread.is_worker", IsWorkerCheckBB); - IsWorker->setDebugLoc(DLoc); - BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, - IsWorkerCheckBB); + BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB, + IsMainOrWorker, IsWorkerCheckBB); // Create local storage for the work function pointer. const DataLayout &DL = M.getDataLayout(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 74f68531b89a..6e5aeb9c41f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -365,7 +365,9 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM, MPM.add(createFunctionInliningPass(IP)); MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's addExtensionsToPM(EP_Peephole, MPM); } @@ -404,7 +406,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createGVNHoistPass()); if (EnableGVNSink) { MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); } } @@ -418,7 +421,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createJumpThreadingPass()); // Thread jumps. MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals } - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) MPM.add(createAggressiveInstCombinerPass()); @@ -434,7 +439,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // TODO: Investigate the cost/benefit of tail call elimination on debugging. if (OptLevel > 1) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // The matrix extension can introduce large vector operations early, which can @@ -451,13 +458,18 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopSimplifyCFGPass()); } // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO)); // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); else @@ -465,7 +477,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // FIXME: We break the loop pass pipeline here in order to do full // simplifycfg. Eventually loop-simplifycfg should be enhanced to replace the // need for this. - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); MPM.add(createInstructionCombiningPass()); // We resume loop passes creating a second loop pipeline here. if (EnableLoopFlatten) { @@ -521,7 +534,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // TODO: Investigate if this is too expensive at O1. if (OptLevel > 1) { MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -580,9 +594,11 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, PM.add(createEarlyCSEPass()); PM.add(createCorrelatedValuePropagationPass()); PM.add(createInstructionCombiningPass()); - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - PM.add(createCFGSimplificationPass()); + PM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); PM.add(createInstructionCombiningPass()); } @@ -597,6 +613,7 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // before SLP vectorization. PM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -641,7 +658,8 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the // checked value is loop invariant. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } PM.add(createWarnMissedTransformationsPass()); @@ -772,7 +790,9 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Clean up after IPCP & DAE // For SamplePGO in ThinLTO compile phase, we do not want to do indirect // call promotion as it will change the CFG too much to make the 2nd @@ -886,7 +906,8 @@ void PassManagerBuilder::populateModulePassManager( // later might get benefit of no-alias assumption in clone loop. if (UseLoopVersioningLICM) { MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } // We add a fresh GlobalsModRef run at this point. This is particularly @@ -972,7 +993,8 @@ void PassManagerBuilder::populateModulePassManager( // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); addExtensionsToPM(EP_OptimizerLast, MPM); @@ -1120,7 +1142,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. |
