Diffstat (limited to 'lib/Analysis/InlineCost.cpp')
-rw-r--r-- | lib/Analysis/InlineCost.cpp | 424
1 file changed, 229 insertions, 195 deletions
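This commit migrates the inline cost analysis from the CallSite wrapper to CallBase, the common base class of CallInst, InvokeInst, and the then-new CallBrInst. To illustrate only the shape of that migration (the helper below is invented for this note and is not part of the commit): where code once wrapped an Instruction in a CallSite, it can now dyn_cast to CallBase and use the same accessors directly.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h" // llvm::CallBase
    using namespace llvm;

    // Hypothetical helper, for illustration only.
    static Function *getCalledFunctionIfAny(Instruction &I) {
      if (auto *Call = dyn_cast<CallBase>(&I))
        return Call->getCalledFunction(); // still null for indirect calls
      return nullptr;                     // not a call, invoke, or callbr
    }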
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 6ddb3cbc01a3..0dec146e0465 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -1,9 +1,8 @@
 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -28,7 +27,6 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/llvm-config.h"
-#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -37,6 +35,7 @@
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -51,19 +50,19 @@ static cl::opt<int> InlineThreshold(
     cl::desc("Control the amount of inlining to perform (default = 225)"));
 
 static cl::opt<int> HintThreshold(
-    "inlinehint-threshold", cl::Hidden, cl::init(325),
+    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
     cl::desc("Threshold for inlining functions with inline hint"));
 
 static cl::opt<int>
     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
-                          cl::init(45),
+                          cl::init(45), cl::ZeroOrMore,
                           cl::desc("Threshold for inlining cold callsites"));
 
 // We introduce this threshold to help performance of instrumentation based
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
 static cl::opt<int> ColdThreshold(
-    "inlinecold-threshold", cl::Hidden, cl::init(45),
+    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
     cl::desc("Threshold for inlining functions with cold attribute"));
 
 static cl::opt<int>
@@ -77,7 +76,7 @@ static cl::opt<int> LocallyHotCallSiteThreshold(
 
 static cl::opt<int> ColdCallSiteRelFreq(
     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
-    cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
+    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));
 
@@ -88,7 +87,7 @@ static cl::opt<int> HotCallSiteRelFreq(
             "profile information."));
 
 static cl::opt<bool> OptComputeFullInlineCost(
-    "inline-cost-full", cl::Hidden, cl::init(false),
+    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
     cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));
 
@@ -122,31 +121,43 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// The candidate callsite being analyzed. Please do not use this to do
   /// analysis in the caller function; we want the inline cost query to be
   /// easily cacheable. Instead, use the cover function paramHasAttr.
-  CallSite CandidateCS;
+  CallBase &CandidateCall;
 
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
 
+  /// Upper bound for the inlining cost. Bonuses are being applied to account
+  /// for speculative "expected profit" of the inlining decision.
   int Threshold;
-  int Cost;
+
+  /// Inlining cost measured in abstract units, accounts for all the
+  /// instructions expected to be executed for a given function invocation.
+  /// Instructions that are statically proven to be dead based on call-site
+  /// arguments are not counted here.
+  int Cost = 0;
+
   bool ComputeFullInlineCost;
-  bool IsCallerRecursive;
-  bool IsRecursiveCall;
-  bool ExposesReturnsTwice;
-  bool HasDynamicAlloca;
-  bool ContainsNoDuplicateCall;
-  bool HasReturn;
-  bool HasIndirectBr;
-  bool HasUninlineableIntrinsic;
-  bool InitsVargArgs;
+  bool IsCallerRecursive = false;
+  bool IsRecursiveCall = false;
+  bool ExposesReturnsTwice = false;
+  bool HasDynamicAlloca = false;
+  bool ContainsNoDuplicateCall = false;
+  bool HasReturn = false;
+  bool HasIndirectBr = false;
+  bool HasUninlineableIntrinsic = false;
+  bool InitsVargArgs = false;
 
   /// Number of bytes allocated statically by the callee.
-  uint64_t AllocatedSize;
-  unsigned NumInstructions, NumVectorInstructions;
-  int VectorBonus, TenPercentVectorBonus;
-  // Bonus to be applied when the callee has only one reachable basic block.
-  int SingleBBBonus;
+  uint64_t AllocatedSize = 0;
+  unsigned NumInstructions = 0;
+  unsigned NumVectorInstructions = 0;
+
+  /// Bonus to be applied when percentage of vector instructions in callee is
+  /// high (see more details in updateThreshold).
+  int VectorBonus = 0;
+  /// Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus = 0;
 
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
@@ -181,7 +192,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// loads.
   bool EnableLoadElimination;
   SmallPtrSet<Value *, 16> LoadAddrSet;
-  int LoadEliminationCost;
+  int LoadEliminationCost = 0;
 
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
@@ -196,7 +207,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool isGEPFree(GetElementPtrInst &GEP);
   bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
-  bool simplifyCallSite(Function *F, CallSite CS);
+  bool simplifyCallSite(Function *F, CallBase &Call);
   template <typename Callable>
   bool simplifyInstruction(Instruction &I, Callable Evaluate);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
@@ -216,22 +227,28 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// attributes and callee hotness for PGO builds. The Callee is explicitly
   /// passed to support analyzing indirect calls whose target is inferred by
   /// analysis.
-  void updateThreshold(CallSite CS, Function &Callee);
+  void updateThreshold(CallBase &Call, Function &Callee);
 
-  /// Return true if size growth is allowed when inlining the callee at CS.
-  bool allowSizeGrowth(CallSite CS);
+  /// Return true if size growth is allowed when inlining the callee at \p Call.
+  bool allowSizeGrowth(CallBase &Call);
 
-  /// Return true if \p CS is a cold callsite.
-  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
+  /// Return true if \p Call is a cold callsite.
+  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
 
-  /// Return a higher threshold if \p CS is a hot callsite.
-  Optional<int> getHotCallSiteThreshold(CallSite CS,
+  /// Return a higher threshold if \p Call is a hot callsite.
+  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                         BlockFrequencyInfo *CallerBFI);
 
   // Custom analysis routines.
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
+  /// Handle a capped 'int' increment for Cost.
+  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
+    Cost = (int)std::min(UpperBound, Cost + Inc);
+  }
+
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -256,11 +273,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitCmpInst(CmpInst &I);
   bool visitSub(BinaryOperator &I);
   bool visitBinaryOperator(BinaryOperator &I);
+  bool visitFNeg(UnaryOperator &I);
   bool visitLoad(LoadInst &I);
   bool visitStore(StoreInst &I);
   bool visitExtractValue(ExtractValueInst &I);
   bool visitInsertValue(InsertValueInst &I);
-  bool visitCallSite(CallSite CS);
+  bool visitCallBase(CallBase &Call);
   bool visitReturnInst(ReturnInst &RI);
   bool visitBranchInst(BranchInst &BI);
   bool visitSelectInst(SelectInst &SI);
@@ -276,38 +294,29 @@ public:
       std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
       Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
-      Function &Callee, CallSite CSArg, const InlineParams &Params)
+      Function &Callee, CallBase &Call, const InlineParams &Params)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
-        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
-        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
-                                       Params.ComputeFullInlineCost || ORE),
-        IsCallerRecursive(false), IsRecursiveCall(false),
-        ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
-        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
-        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
-        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
-
-  InlineResult analyzeCall(CallSite CS);
+        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
+        ComputeFullInlineCost(OptComputeFullInlineCost ||
+                              Params.ComputeFullInlineCost || ORE),
+        EnableLoadElimination(true) {}
+
+  InlineResult analyzeCall(CallBase &Call);
 
   int getThreshold() { return Threshold; }
   int getCost() { return Cost; }
 
   // Keep a bunch of stats about the cost savings found so we can print them
   // out when debugging.
-  unsigned NumConstantArgs;
-  unsigned NumConstantOffsetPtrArgs;
-  unsigned NumAllocaArgs;
-  unsigned NumConstantPtrCmps;
-  unsigned NumConstantPtrDiffs;
-  unsigned NumInstructionsSimplified;
-  unsigned SROACostSavings;
-  unsigned SROACostSavingsLost;
+  unsigned NumConstantArgs = 0;
+  unsigned NumConstantOffsetPtrArgs = 0;
+  unsigned NumAllocaArgs = 0;
+  unsigned NumConstantPtrCmps = 0;
+  unsigned NumConstantPtrDiffs = 0;
+  unsigned NumInstructionsSimplified = 0;
+  unsigned SROACostSavings = 0;
+  unsigned SROACostSavingsLost = 0;
 
   void dump();
 };
@@ -342,7 +351,7 @@ bool CallAnalyzer::lookupSROAArgAndCost(
 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
   // If we're no longer able to perform SROA we need to undo its cost savings
   // and prevent subsequent analysis.
-  Cost += CostIt->second;
+  addCost(CostIt->second);
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
@@ -366,7 +375,7 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
 
 void CallAnalyzer::disableLoadElimination() {
   if (EnableLoadElimination) {
-    Cost += LoadEliminationCost;
+    addCost(LoadEliminationCost);
     LoadEliminationCost = 0;
     EnableLoadElimination = false;
   }
@@ -701,7 +710,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
 }
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
-  // Propagate constants through ptrtoint.
+  // Propagate constants through casts.
   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
       }))
@@ -721,7 +730,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
   case Instruction::FPToUI:
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-      Cost += InlineConstants::CallPenalty;
+      addCost(InlineConstants::CallPenalty);
     break;
   default:
     break;
@@ -737,14 +746,14 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
       }))
     return true;
 
-  // Disable any SROA on the argument to arbitrary unary operators.
+  // Disable any SROA on the argument to arbitrary unary instructions.
   disableSROA(Operand);
 
   return false;
 }
 
 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
-  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
+  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
 }
 
 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -769,7 +778,7 @@ bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
   return false;
 }
 
-bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
+bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
   // If the normal destination of the invoke or the parent block of the call
   // site is unreachable-terminated, there is little point in inlining this
   // unless there is literally zero cost.
@@ -785,21 +794,21 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
   // For now, we are not handling this corner case here as it is rare in real
   // code. In future, we should elaborate this based on BPI and BFI in more
   // general threshold adjusting heuristics in updateThreshold().
-  Instruction *Instr = CS.getInstruction();
-  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
+  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
       return false;
-  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
+  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
     return false;
 
   return true;
 }
 
-bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
+bool CallAnalyzer::isColdCallSite(CallBase &Call,
+                                  BlockFrequencyInfo *CallerBFI) {
   // If global profile summary is available, then callsite's coldness is
   // determined based on that.
   if (PSI && PSI->hasProfileSummary())
-    return PSI->isColdCallSite(CS, CallerBFI);
+    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);
 
   // Otherwise we need BFI to be available.
   if (!CallerBFI)
@@ -810,20 +819,21 @@ bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
   // complexity is not worth it unless this scaling shows up high in the
   // profiles.
   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
-  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteBB = Call.getParent();
   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
   auto CallerEntryFreq =
-      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
   return CallSiteFreq < CallerEntryFreq * ColdProb;
 }
 
 Optional<int>
-CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
+CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                       BlockFrequencyInfo *CallerBFI) {
   // If global profile summary is available, then callsite's hotness is
   // determined based on that.
-  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
+  if (PSI && PSI->hasProfileSummary() &&
+      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
     return Params.HotCallSiteThreshold;
 
   // Otherwise we need BFI to be available and to have a locally hot callsite
@@ -835,7 +845,7 @@ CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
   // potentially cache the computation of scaled entry frequency, but the added
   // complexity is not worth it unless this scaling shows up high in the
   // profiles.
-  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteBB = Call.getParent();
   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
   auto CallerEntryFreq = CallerBFI->getEntryFreq();
   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
@@ -845,14 +855,14 @@ CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
   return None;
 }
 
-void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
+void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
-  if (!allowSizeGrowth(CS)) {
+  if (!allowSizeGrowth(Call)) {
     Threshold = 0;
     return;
   }
 
-  Function *Caller = CS.getCaller();
+  Function *Caller = Call.getCaller();
 
   // return min(A, B) if B is valid.
   auto MinIfValid = [](int A, Optional<int> B) {
@@ -870,15 +880,6 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // basic block at the given callsite context. This is speculatively applied
   // and withdrawn if more than one basic block is seen.
   //
-  // Vector bonuses: We want to more aggressively inline vector-dense kernels
-  // and apply this bonus based on the percentage of vector instructions. A
-  // bonus is applied if the vector instructions exceed 50% and half that amount
-  // is applied if it exceeds 10%. Note that these bonuses are some what
-  // arbitrary and evolved over time by accident as much as because they are
-  // principled bonuses.
-  // FIXME: It would be nice to base the bonus values on something more
-  // scientific.
-  //
   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
   // of the last call to a static function as inlining such functions is
   // guaranteed to reduce code size.
@@ -886,7 +887,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // These bonus percentages may be set to 0 based on properties of the caller
   // and the callsite.
   int SingleBBBonusPercent = 50;
-  int VectorBonusPercent = 150;
+  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
 
   // Lambda to set all the above bonus and bonus percentages to 0.
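The hunk above replaces the hard-coded 150% vector bonus with a TargetTransformInfo hook, so a target can tune the bonus or zero it out entirely. The bonus arithmetic that follows in updateThreshold is a straight percentage of the current threshold; a worked sketch of that arithmetic, assuming the default -inline-threshold of 225 and a target that keeps the generic 150 default (both values are assumptions, not guaranteed by this patch):

    // Illustrative arithmetic only; concrete numbers depend on the target.
    int computeSpeculativeBonuses() {
      int Threshold = 225;          // assumed -inline-threshold default
      int SingleBBBonusPercent = 50;
      int VectorBonusPercent = 150; // TTI.getInlinerVectorBonusPercent()
      int SingleBBBonus = Threshold * SingleBBBonusPercent / 100; // 112
      int VectorBonus = Threshold * VectorBonusPercent / 100;     // 337
      return SingleBBBonus + VectorBonus; // granted up front, withdrawn later
    }

Both bonuses are applied speculatively and taken back if the callee turns out to have more than one reachable block or too few vector instructions.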
@@ -898,7 +899,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
 
   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
   // and reduce the threshold if the caller has the necessary attribute.
-  if (Caller->optForMinSize()) {
+  if (Caller->hasMinSize()) {
     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
     // For minsize, we want to disable the single BB bonus and the vector
     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
@@ -906,12 +907,12 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
     // call/return instructions.
     SingleBBBonusPercent = 0;
     VectorBonusPercent = 0;
-  } else if (Caller->optForSize())
+  } else if (Caller->hasOptSize())
     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
 
   // Adjust the threshold based on inlinehint attribute and profile based
   // hotness information if the caller does not have MinSize attribute.
-  if (!Caller->optForMinSize()) {
+  if (!Caller->hasMinSize()) {
     if (Callee.hasFnAttribute(Attribute::InlineHint))
       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
 
@@ -923,15 +924,15 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
     // used (which adds hotness metadata to calls) or if caller's
     // BlockFrequencyInfo is available.
     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
-    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
-    if (!Caller->optForSize() && HotCallSiteThreshold) {
+    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
+    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
-    } else if (isColdCallSite(CS, CallerBFI)) {
+    } else if (isColdCallSite(Call, CallerBFI)) {
       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
       // Do not apply bonuses for a cold callsite including the
       // LastCallToStatic bonus. While this bonus might result in code size
@@ -968,7 +969,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   VectorBonus = Threshold * VectorBonusPercent / 100;
 
   bool OnlyOneCallAndLocalLinkage =
-      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
   // If there is only one call of the function, and it has internal linkage,
   // the cost of inlining it drops dramatically. It may seem odd to update
   // Cost in updateThreshold, but the bonus depends on the logic in this method.
@@ -1087,10 +1088,34 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
 
   // If the instruction is floating point, and the target says this operation
   // is expensive, this may eventually become a library call. Treat the cost
-  // as such.
+  // as such. Unless it's fneg which can be implemented with an xor.
+  using namespace llvm::PatternMatch;
   if (I.getType()->isFloatingPointTy() &&
-      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-    Cost += InlineConstants::CallPenalty;
+      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
+      !match(&I, m_FNeg(m_Value())))
+    addCost(InlineConstants::CallPenalty);
+
+  return false;
+}
+
+bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
+  Value *Op = I.getOperand(0);
+  Constant *COp = dyn_cast<Constant>(Op);
+  if (!COp)
+    COp = SimplifiedValues.lookup(Op);
+
+  Value *SimpleV = SimplifyFNegInst(COp ? COp : Op,
+                                    cast<FPMathOperator>(I).getFastMathFlags(),
+                                    DL);
+
+  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+    SimplifiedValues[&I] = C;
+
+  if (SimpleV)
+    return true;
+
+  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
+  disableSROA(Op);
 
   return false;
 }
@@ -1173,62 +1198,61 @@ bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
 /// analyzing the arguments and call itself with instsimplify. Returns true if
 /// it has simplified the callsite to some other entity (a constant), making it
 /// free.
-bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
+bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
   // FIXME: Using the instsimplify logic directly for this is inefficient
   // because we have to continually rebuild the argument list even when no
   // simplifications can be performed. Until that is fixed with remapping
   // inside of instsimplify, directly constant fold calls here.
-  if (!canConstantFoldCallTo(CS, F))
+  if (!canConstantFoldCallTo(&Call, F))
     return false;
 
   // Try to re-map the arguments to constants.
   SmallVector<Constant *, 4> ConstantArgs;
-  ConstantArgs.reserve(CS.arg_size());
-  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
-       ++I) {
-    Constant *C = dyn_cast<Constant>(*I);
+  ConstantArgs.reserve(Call.arg_size());
+  for (Value *I : Call.args()) {
+    Constant *C = dyn_cast<Constant>(I);
     if (!C)
-      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
     if (!C)
      return false; // This argument doesn't map to a constant.
 
    ConstantArgs.push_back(C);
  }
-  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
-    SimplifiedValues[CS.getInstruction()] = C;
+  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
+    SimplifiedValues[&Call] = C;
     return true;
   }
 
   return false;
 }
 
-bool CallAnalyzer::visitCallSite(CallSite CS) {
-  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
+bool CallAnalyzer::visitCallBase(CallBase &Call) {
+  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
     // This aborts the entire analysis.
     ExposesReturnsTwice = true;
     return false;
   }
-  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
+  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
     ContainsNoDuplicateCall = true;
 
-  if (Function *F = CS.getCalledFunction()) {
+  if (Function *F = Call.getCalledFunction()) {
     // When we have a concrete function, first try to simplify it directly.
-    if (simplifyCallSite(F, CS))
+    if (simplifyCallSite(F, Call))
       return true;
 
     // Next check if it is an intrinsic we know about.
     // FIXME: Lift this into part of the InstVisitor.
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
       switch (II->getIntrinsicID()) {
       default:
-        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
+        if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
           disableLoadElimination();
-        return Base::visitCallSite(CS);
+        return Base::visitCallBase(Call);
 
       case Intrinsic::load_relative:
         // This is normally lowered to 4 LLVM instructions.
-        Cost += 3 * InlineConstants::InstrCost;
+        addCost(3 * InlineConstants::InstrCost);
         return false;
 
       case Intrinsic::memset:
@@ -1247,7 +1271,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
       }
     }
 
-    if (F == CS.getInstruction()->getFunction()) {
+    if (F == Call.getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
@@ -1257,34 +1281,34 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
     if (TTI.isLoweredToCall(F)) {
       // We account for the average 1 instruction per call argument setup
       // here.
-      Cost += CS.arg_size() * InlineConstants::InstrCost;
+      addCost(Call.arg_size() * InlineConstants::InstrCost);
 
       // Everything other than inline ASM will also have a significant cost
       // merely from making the call.
-      if (!isa<InlineAsm>(CS.getCalledValue()))
-        Cost += InlineConstants::CallPenalty;
+      if (!isa<InlineAsm>(Call.getCalledValue()))
+        addCost(InlineConstants::CallPenalty);
     }
 
-    if (!CS.onlyReadsMemory())
+    if (!Call.onlyReadsMemory())
       disableLoadElimination();
-    return Base::visitCallSite(CS);
+    return Base::visitCallBase(Call);
   }
 
   // Otherwise we're in a very special case -- an indirect function call. See
   // if we can be particularly clever about this.
-  Value *Callee = CS.getCalledValue();
+  Value *Callee = Call.getCalledValue();
 
   // First, pay the price of the argument setup. We account for the average
   // 1 instruction per call argument setup here.
-  Cost += CS.arg_size() * InlineConstants::InstrCost;
+  addCost(Call.arg_size() * InlineConstants::InstrCost);
 
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
   if (!F) {
-    if (!CS.onlyReadsMemory())
+    if (!Call.onlyReadsMemory())
       disableLoadElimination();
-    return Base::visitCallSite(CS);
+    return Base::visitCallBase(Call);
   }
 
   // If we have a constant that we are calling as a function, we can peer
@@ -1294,9 +1318,9 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
                   IndirectCallParams);
-  if (CA.analyzeCall(CS)) {
+  if (CA.analyzeCall(Call)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
   }
 
   if (!F->onlyReadsMemory())
     disableLoadElimination();
-  return Base::visitCallSite(CS);
+  return Base::visitCallBase(Call);
 }
 
 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
@@ -1438,7 +1462,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
       (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
 
   if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
-    Cost = CostLowerBound;
+    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
     return false;
   }
 
@@ -1452,7 +1476,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
     int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                      4 * InlineConstants::InstrCost;
 
-    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
+    addCost(JTCost, (int64_t)CostUpperBound);
     return false;
   }
 
@@ -1473,7 +1497,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   // n + n / 2 - 1 = n * 3 / 2 - 1
   if (NumCaseCluster <= 3) {
     // Suppose a comparison includes one compare and one conditional branch.
-    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
     return false;
   }
 
@@ -1481,7 +1505,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   int64_t SwitchCost =
       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
 
-  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
+  addCost(SwitchCost, (int64_t)CostUpperBound);
   return false;
 }
 
@@ -1574,7 +1598,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
     if (Base::visit(&*I))
       ++NumInstructionsSimplified;
     else
-      Cost += InlineConstants::InstrCost;
+      addCost(InlineConstants::InstrCost);
 
     using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
@@ -1595,7 +1619,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
       if (ORE)
         ORE->emit([&]() {
           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
-                                          CandidateCS.getInstruction())
+                                          &CandidateCall)
                  << NV("Callee", &F) << " has uninlinable pattern ("
                  << NV("InlineResult", IR.message)
                  << ") and cost is not fully computed";
@@ -1612,14 +1636,14 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
       if (ORE)
         ORE->emit([&]() {
           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
-                                          CandidateCS.getInstruction())
+                                          &CandidateCall)
                  << NV("Callee", &F) << " is "
                  << NV("InlineResult", IR.message)
                  << ". Cost is not fully computed";
         });
       return IR;
     }
 
-    // Check if we've past the maximum possible threshold so we don't spin in
+    // Check if we've passed the maximum possible threshold so we don't spin in
     // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
@@ -1676,7 +1700,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
 /// blocks to see if all their incoming edges are dead or not.
 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
-    // A CFG edge is dead if the predecessor is dead or the predessor has a
+    // A CFG edge is dead if the predecessor is dead or the predecessor has a
     // known successor which is not the one under exam.
     return (DeadBlocks.count(Pred) ||
             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
@@ -1712,7 +1736,7 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
 /// factors and heuristics. If this method returns false but the computed cost
 /// is below the computed threshold, then inlining was forcibly disabled by
 /// some artifact of the routine.
-InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
+InlineResult CallAnalyzer::analyzeCall(CallBase &Call) {
   ++NumCallsAnalyzed;
 
   // Perform some tweaks to the cost and threshold based on the direct
@@ -1729,7 +1753,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   assert(NumVectorInstructions == 0);
 
   // Update the threshold based on callsite properties
-  updateThreshold(CS, F);
+  updateThreshold(Call, F);
 
   // While Threshold depends on commandline options that can take negative
   // values, we want to enforce the invariant that the computed threshold and
@@ -1745,7 +1769,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
 
   // Give out bonuses for the callsite, as the instructions setting them up
   // will be gone after inlining.
-  Cost -= getCallsiteCost(CS, DL);
+  addCost(-getCallsiteCost(Call, DL));
 
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
@@ -1759,14 +1783,11 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   if (F.empty())
     return true;
 
-  Function *Caller = CS.getInstruction()->getFunction();
+  Function *Caller = Call.getFunction();
   // Check if the caller function is recursive itself.
   for (User *U : Caller->users()) {
-    CallSite Site(U);
-    if (!Site)
-      continue;
-    Instruction *I = Site.getInstruction();
-    if (I->getFunction() == Caller) {
+    CallBase *Call = dyn_cast<CallBase>(U);
+    if (Call && Call->getFunction() == Caller) {
       IsCallerRecursive = true;
       break;
     }
@@ -1774,10 +1795,10 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
 
   // Populate our simplified values by mapping from function arguments to call
   // arguments with known important simplifications.
-  CallSite::arg_iterator CAI = CS.arg_begin();
+  auto CAI = Call.arg_begin();
   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
        FAI != FAE; ++FAI, ++CAI) {
-    assert(CAI != CS.arg_end());
+    assert(CAI != Call.arg_end());
     if (Constant *C = dyn_cast<Constant>(CAI))
       SimplifiedValues[&*FAI] = C;
 
@@ -1826,14 +1847,18 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
     if (BB->empty())
      continue;
 
-    // Disallow inlining a blockaddress. A blockaddress only has defined
-    // behavior for an indirect branch in the same function, and we do not
-    // currently support inlining indirect branches. But, the inliner may not
-    // see an indirect branch that ends up being dead code at a particular call
-    // site. If the blockaddress escapes the function, e.g., via a global
-    // variable, inlining may lead to an invalid cross-function reference.
+    // Disallow inlining a blockaddress with uses other than strictly callbr.
+    // A blockaddress only has defined behavior for an indirect branch in the
+    // same function, and we do not currently support inlining indirect
+    // branches. But, the inliner may not see an indirect branch that ends up
+    // being dead code at a particular call site. If the blockaddress escapes
+    // the function, e.g., via a global variable, inlining may lead to an
+    // invalid cross-function reference.
+    // FIXME: pr/39560: continue relaxing this overt restriction.
     if (BB->hasAddressTaken())
-      return "blockaddress";
+      for (User *U : BlockAddress::get(&*BB)->users())
+        if (!isa<CallBrInst>(*U))
+          return "blockaddress used outside of callbr";
 
     // Analyze the cost of this block. If we blow through the threshold, this
     // returns false, and we can bail on out.
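The blockaddress hunk above relaxes an outright "never inline" rule: a block whose address is taken no longer blocks inlining as long as every user of its BlockAddress is a callbr instruction. Restated as a standalone predicate (a sketch for illustration, not code from the patch):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Constants.h"    // llvm::BlockAddress
    #include "llvm/IR/Instructions.h" // llvm::CallBrInst
    using namespace llvm;

    // True if taking BB's address does not block inlining, i.e. the address
    // is either not taken or consumed only by callbr instructions.
    static bool blockAddressUsesAreInlinable(BasicBlock &BB) {
      if (!BB.hasAddressTaken())
        return true;
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(U))
          return false;
      return true;
    }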
@@ -1887,7 +1912,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   }
 
   bool OnlyOneCallAndLocalLinkage =
-      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1899,7 +1924,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   // size, we penalise any call sites that perform loops. We do this after all
   // other costs here, so will likely only be dealing with relatively small
   // functions (and hence DT and LI will hopefully be cheap).
-  if (Caller->optForMinSize()) {
+  if (Caller->hasMinSize()) {
     DominatorTree DT(F);
     LoopInfo LI(DT);
     int NumLoops = 0;
@@ -1909,7 +1934,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
         continue;
       NumLoops++;
     }
-    Cost += NumLoops * InlineConstants::CallPenalty;
+    addCost(NumLoops * InlineConstants::CallPenalty);
   }
 
   // We applied the maximum possible vector bonus at the beginning. Now,
@@ -1953,13 +1978,13 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
 }
 
-int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
+int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
   int Cost = 0;
-  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-    if (CS.isByValArgument(I)) {
+  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
+    if (Call.isByValArgument(I)) {
       // We approximate the number of loads and stores needed by dividing the
       // size of the byval type by the target's pointer size.
-      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
       unsigned AS = PTy->getAddressSpace();
       unsigned PointerSize = DL.getPointerSizeInBits(AS);
@@ -1987,16 +2012,16 @@ int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
 }
 
 InlineCost llvm::getInlineCost(
-    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
+    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
-  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
+  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                        GetAssumptionCache, GetBFI, PSI, ORE);
 }
 
 InlineCost llvm::getInlineCost(
-    CallSite CS, Function *Callee, const InlineParams &Params,
+    CallBase &Call, Function *Callee, const InlineParams &Params,
     TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
@@ -2012,9 +2037,9 @@ InlineCost llvm::getInlineCost(
   // argument is in the alloca address space (so it is a little bit complicated
   // to solve).
   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
-  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
-    if (CS.isByValArgument(I)) {
-      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
+    if (Call.isByValArgument(I)) {
+      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
       if (PTy->getAddressSpace() != AllocaAS)
         return llvm::InlineCost::getNever("byval arguments without alloca"
                                           " address space");
     }
 
   // Calls to functions with always-inline attributes should be inlined
   // whenever possible.
-  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
-    if (isInlineViable(*Callee))
+  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
+    auto IsViable = isInlineViable(*Callee);
+    if (IsViable)
       return llvm::InlineCost::getAlways("always inline attribute");
-    return llvm::InlineCost::getNever("inapplicable always inline attribute");
+    return llvm::InlineCost::getNever(IsViable.message);
   }
 
   // Never inline functions with conflicting attributes (unless callee has
   // always-inline attribute).
-  Function *Caller = CS.getCaller();
+  Function *Caller = Call.getCaller();
   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
     return llvm::InlineCost::getNever("conflicting attributes");
 
   // Don't inline this call if the caller has the optnone attribute.
-  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
+  if (Caller->hasOptNone())
     return llvm::InlineCost::getNever("optnone attribute");
 
   // Don't inline a function that treats null pointer as valid into a caller
@@ -2052,15 +2078,15 @@ InlineCost llvm::getInlineCost(
     return llvm::InlineCost::getNever("noinline function attribute");
 
   // Don't inline call sites marked noinline.
-  if (CS.isNoInline())
+  if (Call.isNoInline())
     return llvm::InlineCost::getNever("noinline call site attribute");
 
   LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                           << "... (caller:" << Caller->getName() << ")\n");
 
-  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
-                  Params);
-  InlineResult ShouldInline = CA.analyzeCall(CS);
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
+                  Call, Params);
+  InlineResult ShouldInline = CA.analyzeCall(Call);
 
   LLVM_DEBUG(CA.dump());
 
@@ -2073,42 +2099,50 @@ InlineCost llvm::getInlineCost(
   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
 }
 
-bool llvm::isInlineViable(Function &F) {
+InlineResult llvm::isInlineViable(Function &F) {
   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
-    // Disallow inlining of functions which contain indirect branches or
-    // blockaddresses.
-    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
-      return false;
+    // Disallow inlining of functions which contain indirect branches.
+    if (isa<IndirectBrInst>(BI->getTerminator()))
+      return "contains indirect branches";
+
+    // Disallow inlining of blockaddresses which are used by non-callbr
+    // instructions.
+    if (BI->hasAddressTaken())
+      for (User *U : BlockAddress::get(&*BI)->users())
+        if (!isa<CallBrInst>(*U))
+          return "blockaddress used outside of callbr";
 
     for (auto &II : *BI) {
-      CallSite CS(&II);
-      if (!CS)
+      CallBase *Call = dyn_cast<CallBase>(&II);
+      if (!Call)
        continue;
 
      // Disallow recursive calls.
-      if (&F == CS.getCalledFunction())
-        return false;
+      if (&F == Call->getCalledFunction())
+        return "recursive call";
 
       // Disallow calls which expose returns-twice to a function not previously
       // attributed as such.
-      if (!ReturnsTwice && CS.isCall() &&
-          cast<CallInst>(CS.getInstruction())->canReturnTwice())
-        return false;
+      if (!ReturnsTwice && isa<CallInst>(Call) &&
+          cast<CallInst>(Call)->canReturnTwice())
+        return "exposes returns-twice attribute";
 
-      if (CS.getCalledFunction())
-        switch (CS.getCalledFunction()->getIntrinsicID()) {
+      if (Call->getCalledFunction())
+        switch (Call->getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        // Disallow inlining of @llvm.icall.branch.funnel because current
        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
+          return "disallowed inlining of @llvm.icall.branch.funnel";
        // Disallow inlining functions that call @llvm.localescape. Doing this
        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
+          return "disallowed inlining of @llvm.localescape";
        // Disallow inlining of functions that initialize VarArgs with va_start.
        case llvm::Intrinsic::vastart:
-          return false;
+          return "contains VarArgs initialized with va_start";
        }
    }
  }
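Since isInlineViable now returns InlineResult rather than bool, callers can surface the reason a function was rejected instead of a bare failure; getInlineCost above already forwards IsViable.message into the "never inline" cost. A hypothetical caller, with the surrounding function and output invented for illustration:

    #include "llvm/Analysis/InlineCost.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // InlineResult converts to bool; on failure its message field carries the
    // reason, e.g. "recursive call" or "contains indirect branches".
    static void explainViability(Function &Callee) {
      InlineResult Viable = isInlineViable(Callee);
      if (!Viable)
        errs() << Callee.getName() << " not inlinable: " << Viable.message
               << "\n";
    }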