| author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
| commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e (patch) | |
| tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Target/AMDGPU/AMDGPUInline.cpp | |
| parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff) | |
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUInline.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUInline.cpp | 208 |
1 file changed, 208 insertions(+), 0 deletions(-)
```diff
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
new file mode 100644
index 000000000000..ff9e7b50ed5c
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -0,0 +1,208 @@
+//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is an AMDGPU-specific replacement of the standard inliner.
+/// The main purpose is to account for the fact that calls are not only
+/// expensive on the AMDGPU, but much more expensive if a private memory
+/// pointer is passed to a function as an argument. In this situation, we are
+/// unable to eliminate private memory in the caller unless the call is
+/// inlined, and we end up with slow and expensive scratch accesses. Thus, we
+/// boost the inline threshold for such functions here.
+///
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+static cl::opt<int>
+ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
+              cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers, we gain nothing by aggressively inlining functions for
+// that heuristic.
```
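For context, here is a minimal, hypothetical source-level example of the pattern the file comment above describes (the names are illustrative and not taken from the commit): a caller builds a small private array and passes a pointer to it into a callee. Unless the call is inlined, the array stays in scratch memory and every access pays the scratch cost; after inlining, SROA can typically promote the array to registers.

```cpp
// Hypothetical device-style C++ illustrating the motivation (not from the diff).
static float blur3(const float *Row) {           // callee takes a private pointer
  return (Row[0] + Row[1] + Row[2]) / 3.0f;
}

float filterPixel(const float *Image, int Idx) {
  float Tmp[3];                                  // private (scratch) allocation
  for (int I = 0; I < 3; ++I)
    Tmp[I] = Image[Idx + I];
  return blur3(Tmp);                             // Tmp escapes unless blur3 is inlined
}
```

The diff continues with the matching cutoff option and the pass boilerplate: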
```diff
+static cl::opt<unsigned>
+ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
+                cl::desc("Maximum alloca size to use for inline cost"));
+
+namespace {
+
+class AMDGPUInliner : public LegacyInlinerBase {
+
+public:
+  AMDGPUInliner() : LegacyInlinerBase(ID) {
+    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
+    Params = getInlineParams();
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+
+  unsigned getInlineThreshold(CallSite CS) const;
+
+  InlineCost getInlineCost(CallSite CS) override;
+
+  bool runOnSCC(CallGraphSCC &SCC) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  TargetTransformInfoWrapperPass *TTIWP;
+
+  InlineParams Params;
+};
+
+} // end anonymous namespace
+
+char AMDGPUInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
+                      "AMDGPU Function Integration/Inlining", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
+                    "AMDGPU Function Integration/Inlining", false, false)
+
+Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
+
+bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  return LegacyInlinerBase::runOnSCC(SCC);
+}
+
+void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+  LegacyInlinerBase::getAnalysisUsage(AU);
+}
+
+unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
+  int Thres = Params.DefaultThreshold;
+
+  Function *Caller = CS.getCaller();
+  // Listen to the inlinehint attribute when it would increase the threshold
+  // and the caller does not need to minimize its size.
+  Function *Callee = CS.getCalledFunction();
+  bool InlineHint = Callee && !Callee->isDeclaration() &&
+                    Callee->hasFnAttribute(Attribute::InlineHint);
+  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
+      && !Caller->hasFnAttribute(Attribute::MinSize))
+    Thres = Params.HintThreshold.getValue();
+
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  if (!Callee)
+    return (unsigned)Thres;
+
+  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());
+
+  // If we have a pointer to a private array passed into a function
+  // it will not be optimized out, leaving scratch usage.
+  // Increase the inline threshold to allow inlining in this case.
+  uint64_t AllocaSize = 0;
+  SmallPtrSet<const AllocaInst *, 8> AIVisited;
+  for (Value *PtrArg : CS.args()) {
+    Type *Ty = PtrArg->getType();
+    if (!Ty->isPointerTy() ||
+        Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+      continue;
+    PtrArg = GetUnderlyingObject(PtrArg, DL);
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+        continue;
+      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+      // If the amount of stack memory is excessive we will not be able
+      // to get rid of the scratch anyway, bail out.
+      if (AllocaSize > ArgAllocaCutoff) {
+        AllocaSize = 0;
+        break;
+      }
+    }
+  }
+  if (AllocaSize)
+    Thres += ArgAllocaCost;
+
+  return (unsigned)Thres;
+}
+
+// Check if a call is just a wrapper around another call.
+// In this case we only have call and ret instructions.
```
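To make the threshold arithmetic concrete, here is a small standalone mimic (a sketch, not part of the commit). It assumes the stock LLVM defaults of the time, DefaultThreshold = 225 and HintThreshold = 325, together with the two flags defined above; a hinted callee that receives a small private alloca is then evaluated against 325 + 2200 = 2525 instead of 225.

```cpp
#include <cstdint>
#include <iostream>

// Standalone mimic of AMDGPUInliner::getInlineThreshold, for illustration only.
// The real pass reads these values from InlineParams and the cl::opt flags.
struct Params {
  int DefaultThreshold = 225;     // assumed stock LLVM default
  int HintThreshold = 325;        // assumed default, used for inlinehint callees
  int ArgAllocaCost = 2200;       // -amdgpu-inline-arg-alloca-cost
  uint64_t ArgAllocaCutoff = 256; // -amdgpu-inline-arg-alloca-cutoff
};

unsigned inlineThreshold(const Params &P, bool CalleeHasHint,
                         bool CallerMinSize, uint64_t PrivateAllocaBytes) {
  int Thres = P.DefaultThreshold;
  if (CalleeHasHint && P.HintThreshold > Thres && !CallerMinSize)
    Thres = P.HintThreshold;
  // Boost only when the private memory is small enough to plausibly be
  // promoted to registers after inlining; above the cutoff, no boost.
  if (PrivateAllocaBytes && PrivateAllocaBytes <= P.ArgAllocaCutoff)
    Thres += P.ArgAllocaCost;
  return (unsigned)Thres;
}

int main() {
  Params P;
  std::cout << inlineThreshold(P, true, false, 12) << '\n'; // 325 + 2200 = 2525
  std::cout << inlineThreshold(P, false, false, 0) << '\n'; // 225
}
```

The remainder of the new file implements the wrapper-call shortcut and the inline cost callback itself: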
```diff
+static bool isWrapperOnlyCall(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+  if (!Callee || Callee->size() != 1)
+    return false;
+  const BasicBlock &BB = Callee->getEntryBlock();
+  if (const Instruction *I = BB.getFirstNonPHI()) {
+    if (!isa<CallInst>(I)) {
+      return false;
+    }
+    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
+      DEBUG(dbgs() << "    Wrapper only call detected: "
+                   << Callee->getName() << '\n');
+      return true;
+    }
+  }
+  return false;
+}
+
+InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = CS.getCaller();
+
+  if (!Callee || Callee->isDeclaration() || CS.isNoInline())
+    return llvm::InlineCost::getNever();
+
+  // Query TTI only after Callee is known to be non-null.
+  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+  if (!TTI.areInlineCompatible(Caller, Callee))
+    return llvm::InlineCost::getNever();
+
+  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
+    if (isInlineViable(*Callee))
+      return llvm::InlineCost::getAlways();
+    return llvm::InlineCost::getNever();
+  }
+
+  if (isWrapperOnlyCall(CS))
+    return llvm::InlineCost::getAlways();
+
+  InlineParams LocalParams = Params;
+  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
+  bool RemarksEnabled = false;
+  const auto &BBs = Caller->getBasicBlockList();
+  if (!BBs.empty()) {
+    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
+    if (DI.isEnabled())
+      RemarksEnabled = true;
+  }
+
+  OptimizationRemarkEmitter ORE(Caller);
+  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+      [this](Function &F) -> AssumptionCache & {
+    return ACT->getAssumptionCache(F);
+  };
+
+  return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
+                             None, PSI, RemarksEnabled ? &ORE : nullptr);
+}
```
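Finally, a sketch of how the pass is meant to be consumed. createAMDGPUFunctionInliningPass is the entry point this diff defines; the PassManagerBuilder hook shown here is an assumption about the surrounding target code, modeled on how a target would substitute its own inliner for the generic one under the legacy pass manager.

```cpp
// Sketch (legacy pass manager): swap the generic inliner for the AMDGPU one.
// The builder hook is assumed context, not part of this commit.
#include "AMDGPU.h"                                  // createAMDGPUFunctionInliningPass
#include "llvm/Transforms/IPO.h"                     // createFunctionInliningPass
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

void selectInliner(llvm::PassManagerBuilder &Builder, bool UseAMDGPUInliner) {
  Builder.Inliner = UseAMDGPUInliner
                        ? llvm::createAMDGPUFunctionInliningPass() // from this diff
                        : llvm::createFunctionInliningPass();      // generic default
}
```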
