diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
| commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e (patch) | |
| tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/IPO | |
| parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff) | |
Notes
Diffstat (limited to 'lib/Transforms/IPO')
28 files changed, 2669 insertions, 882 deletions
diff --git a/lib/Transforms/IPO/AlwaysInliner.cpp b/lib/Transforms/IPO/AlwaysInliner.cpp index b7d96007c24a..a4bbc99b1f90 100644 --- a/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/lib/Transforms/IPO/AlwaysInliner.cpp @@ -15,15 +15,12 @@ #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Transforms/IPO.h" diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 72bae203ee94..b25cbcad3b9d 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -1,4 +1,4 @@ -//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===// +//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===// // // The LLVM Compiler Infrastructure // @@ -31,30 +31,59 @@ #include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <functional> +#include <iterator> +#include <map> #include <set> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "argpromotion" @@ -65,7 +94,7 @@ STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted"); STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated"); /// A vector used to hold the indices of a single GEP instruction -typedef std::vector<uint64_t> IndicesVector; +using IndicesVector = std::vector<uint64_t>; /// DoPromotion - This method actually performs the promotion of the specified /// arguments, and returns the new function. At this point, we know that it's @@ -75,13 +104,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, SmallPtrSetImpl<Argument *> &ByValArgsToTransform, Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>> ReplaceCallSite) { - // Start by computing a new prototype for the function, which is the same as // the old function, but has modified arguments. FunctionType *FTy = F->getFunctionType(); std::vector<Type *> Params; - typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; + using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>; // ScalarizedElements - If we are promoting a pointer that has elements // accessed out of it, keep track of which elements are accessed so that we @@ -89,7 +117,6 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // // Arguments that are directly loaded will have a zero element value here, to // handle cases where there are both a direct load and GEP accesses. - // std::map<Argument *, ScalarizeTable> ScalarizedElements; // OriginalLoads - Keep track of a representative load instruction from the @@ -335,7 +362,6 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Loop over the argument list, transferring uses of the old arguments over to // the new arguments, also transferring over the names as well. - // for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), I2 = NF->arg_begin(); I != E; ++I) { @@ -537,7 +563,7 @@ static void markIndicesSafe(const IndicesVector &ToMark, /// arguments passed in. static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, AAResults &AAR, unsigned MaxElements) { - typedef std::set<IndicesVector> GEPIndicesSet; + using GEPIndicesSet = std::set<IndicesVector>; // Quick exit for unused arguments if (Arg->use_empty()) @@ -693,7 +719,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, BasicBlock *BB = Load->getParent(); MemoryLocation Loc = MemoryLocation::get(Load); - if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod)) + if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod)) return false; // Pointer is invalidated! // Now check every path from the entry block to the load for transparency. @@ -714,7 +740,6 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, /// \brief Checks if a type could have padding bytes. static bool isDenselyPacked(Type *type, const DataLayout &DL) { - // There is no size information, so be conservative. if (!type->isSized()) return false; @@ -749,7 +774,6 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) { /// \brief Checks if the padding bytes of an argument could be accessed. static bool canPaddingBeAccessed(Argument *arg) { - assert(arg->hasByValAttr()); // Track all the pointers to the argument to make sure they are not captured. @@ -788,7 +812,6 @@ static bool canPaddingBeAccessed(Argument *arg) { /// are any promotable arguments and if it is safe to promote the function (for /// example, all callers are direct). If safe to promote some arguments, it /// calls the DoPromotion method. -/// static Function * promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, unsigned MaxElements, @@ -964,9 +987,17 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, } namespace { + /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. -/// struct ArgPromotion : public CallGraphSCCPass { + // Pass identification, replacement for typeid + static char ID; + + explicit ArgPromotion(unsigned MaxElements = 3) + : CallGraphSCCPass(ID), MaxElements(MaxElements) { + initializeArgPromotionPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); @@ -975,21 +1006,20 @@ struct ArgPromotion : public CallGraphSCCPass { } bool runOnSCC(CallGraphSCC &SCC) override; - static char ID; // Pass identification, replacement for typeid - explicit ArgPromotion(unsigned MaxElements = 3) - : CallGraphSCCPass(ID), MaxElements(MaxElements) { - initializeArgPromotionPass(*PassRegistry::getPassRegistry()); - } private: using llvm::Pass::doInitialization; + bool doInitialization(CallGraph &CG) override; + /// The maximum number of elements to expand, or 0 for unlimited. unsigned MaxElements; }; -} + +} // end anonymous namespace char ArgPromotion::ID = 0; + INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 67f18a307b9b..397561746f86 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMipo AlwaysInliner.cpp ArgumentPromotion.cpp BarrierNoopPass.cpp + CalledValuePropagation.cpp ConstantMerge.cpp CrossDSOCFI.cpp DeadArgumentElimination.cpp diff --git a/lib/Transforms/IPO/CalledValuePropagation.cpp b/lib/Transforms/IPO/CalledValuePropagation.cpp new file mode 100644 index 000000000000..c5f6336aa2be --- /dev/null +++ b/lib/Transforms/IPO/CalledValuePropagation.cpp @@ -0,0 +1,423 @@ +//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that attaches !callees metadata to +// indirect call sites. For a given call site, the metadata, if present, +// indicates the set of functions the call site could possibly target at +// run-time. This metadata is added to indirect call sites when the set of +// possible targets can be determined by analysis and is known to be small. The +// analysis driving the transformation is similar to constant propagation and +// makes uses of the generic sparse propagation solver. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/CalledValuePropagation.h" +#include "llvm/Analysis/SparsePropagation.h" +#include "llvm/Analysis/ValueLatticeUtils.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/Transforms/IPO.h" +using namespace llvm; + +#define DEBUG_TYPE "called-value-propagation" + +/// The maximum number of functions to track per lattice value. Once the number +/// of functions a call site can possibly target exceeds this threshold, it's +/// lattice value becomes overdefined. The number of possible lattice values is +/// bounded by Ch(F, M), where F is the number of functions in the module and M +/// is MaxFunctionsPerValue. As such, this value should be kept very small. We +/// likely can't do anything useful for call sites with a large number of +/// possible targets, anyway. +static cl::opt<unsigned> MaxFunctionsPerValue( + "cvp-max-functions-per-value", cl::Hidden, cl::init(4), + cl::desc("The maximum number of functions to track per lattice value")); + +namespace { +/// To enable interprocedural analysis, we assign LLVM values to the following +/// groups. The register group represents SSA registers, the return group +/// represents the return values of functions, and the memory group represents +/// in-memory values. An LLVM Value can technically be in more than one group. +/// It's necessary to distinguish these groups so we can, for example, track a +/// global variable separately from the value stored at its location. +enum class IPOGrouping { Register, Return, Memory }; + +/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings. +using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>; + +/// The lattice value type used by our custom lattice function. It holds the +/// lattice state, and a set of functions. +class CVPLatticeVal { +public: + /// The states of the lattice values. Only the FunctionSet state is + /// interesting. It indicates the set of functions to which an LLVM value may + /// refer. + enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked }; + + /// Comparator for sorting the functions set. We want to keep the order + /// deterministic for testing, etc. + struct Compare { + bool operator()(const Function *LHS, const Function *RHS) const { + return LHS->getName() < RHS->getName(); + } + }; + + CVPLatticeVal() : LatticeState(Undefined) {} + CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {} + CVPLatticeVal(std::set<Function *, Compare> &&Functions) + : LatticeState(FunctionSet), Functions(Functions) {} + + /// Get a reference to the functions held by this lattice value. The number + /// of functions will be zero for states other than FunctionSet. + const std::set<Function *, Compare> &getFunctions() const { + return Functions; + } + + /// Returns true if the lattice value is in the FunctionSet state. + bool isFunctionSet() const { return LatticeState == FunctionSet; } + + bool operator==(const CVPLatticeVal &RHS) const { + return LatticeState == RHS.LatticeState && Functions == RHS.Functions; + } + + bool operator!=(const CVPLatticeVal &RHS) const { + return LatticeState != RHS.LatticeState || Functions != RHS.Functions; + } + +private: + /// Holds the state this lattice value is in. + CVPLatticeStateTy LatticeState; + + /// Holds functions indicating the possible targets of call sites. This set + /// is empty for lattice values in the undefined, overdefined, and untracked + /// states. The maximum size of the set is controlled by + /// MaxFunctionsPerValue. Since most LLVM values are expected to be in + /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be + /// small and efficiently copyable. + std::set<Function *, Compare> Functions; +}; + +/// The custom lattice function used by the generic sparse propagation solver. +/// It handles merging lattice values and computing new lattice values for +/// constants, arguments, values returned from trackable functions, and values +/// located in trackable global variables. It also computes the lattice values +/// that change as a result of executing instructions. +class CVPLatticeFunc + : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> { +public: + CVPLatticeFunc() + : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined), + CVPLatticeVal(CVPLatticeVal::Overdefined), + CVPLatticeVal(CVPLatticeVal::Untracked)) {} + + /// Compute and return a CVPLatticeVal for the given CVPLatticeKey. + CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override { + switch (Key.getInt()) { + case IPOGrouping::Register: + if (isa<Instruction>(Key.getPointer())) { + return getUndefVal(); + } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) { + if (canTrackArgumentsInterprocedurally(A->getParent())) + return getUndefVal(); + } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) { + return computeConstant(C); + } + return getOverdefinedVal(); + case IPOGrouping::Memory: + case IPOGrouping::Return: + if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) { + if (canTrackGlobalVariableInterprocedurally(GV)) + return computeConstant(GV->getInitializer()); + } else if (auto *F = cast<Function>(Key.getPointer())) + if (canTrackReturnsInterprocedurally(F)) + return getUndefVal(); + } + return getOverdefinedVal(); + } + + /// Merge the two given lattice values. The interesting cases are merging two + /// FunctionSet values and a FunctionSet value with an Undefined value. For + /// these cases, we simply union the function sets. If the size of the union + /// is greater than the maximum functions we track, the merged value is + /// overdefined. + CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override { + if (X == getOverdefinedVal() || Y == getOverdefinedVal()) + return getOverdefinedVal(); + if (X == getUndefVal() && Y == getUndefVal()) + return getUndefVal(); + std::set<Function *, CVPLatticeVal::Compare> Union; + std::set_union(X.getFunctions().begin(), X.getFunctions().end(), + Y.getFunctions().begin(), Y.getFunctions().end(), + std::inserter(Union, Union.begin()), + CVPLatticeVal::Compare{}); + if (Union.size() > MaxFunctionsPerValue) + return getOverdefinedVal(); + return CVPLatticeVal(std::move(Union)); + } + + /// Compute the lattice values that change as a result of executing the given + /// instruction. The changed values are stored in \p ChangedValues. We handle + /// just a few kinds of instructions since we're only propagating values that + /// can be called. + void ComputeInstructionState( + Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override { + switch (I.getOpcode()) { + case Instruction::Call: + return visitCallSite(cast<CallInst>(&I), ChangedValues, SS); + case Instruction::Invoke: + return visitCallSite(cast<InvokeInst>(&I), ChangedValues, SS); + case Instruction::Load: + return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS); + case Instruction::Ret: + return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS); + case Instruction::Select: + return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS); + case Instruction::Store: + return visitStore(*cast<StoreInst>(&I), ChangedValues, SS); + default: + return visitInst(I, ChangedValues, SS); + } + } + + /// Print the given CVPLatticeVal to the specified stream. + void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override { + if (LV == getUndefVal()) + OS << "Undefined "; + else if (LV == getOverdefinedVal()) + OS << "Overdefined"; + else if (LV == getUntrackedVal()) + OS << "Untracked "; + else + OS << "FunctionSet"; + } + + /// Print the given CVPLatticeKey to the specified stream. + void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override { + if (Key.getInt() == IPOGrouping::Register) + OS << "<reg> "; + else if (Key.getInt() == IPOGrouping::Memory) + OS << "<mem> "; + else if (Key.getInt() == IPOGrouping::Return) + OS << "<ret> "; + if (isa<Function>(Key.getPointer())) + OS << Key.getPointer()->getName(); + else + OS << *Key.getPointer(); + } + + /// We collect a set of indirect calls when visiting call sites. This method + /// returns a reference to that set. + SmallPtrSetImpl<Instruction *> &getIndirectCalls() { return IndirectCalls; } + +private: + /// Holds the indirect calls we encounter during the analysis. We will attach + /// metadata to these calls after the analysis indicating the functions the + /// calls can possibly target. + SmallPtrSet<Instruction *, 32> IndirectCalls; + + /// Compute a new lattice value for the given constant. The constant, after + /// stripping any pointer casts, should be a Function. We ignore null + /// pointers as an optimization, since calling these values is undefined + /// behavior. + CVPLatticeVal computeConstant(Constant *C) { + if (isa<ConstantPointerNull>(C)) + return CVPLatticeVal(CVPLatticeVal::FunctionSet); + if (auto *F = dyn_cast<Function>(C->stripPointerCasts())) + return CVPLatticeVal({F}); + return getOverdefinedVal(); + } + + /// Handle return instructions. The function's return state is the merge of + /// the returned value state and the function's return state. + void visitReturn(ReturnInst &I, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + Function *F = I.getParent()->getParent(); + if (F->getReturnType()->isVoidTy()) + return; + auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register); + auto RetF = CVPLatticeKey(F, IPOGrouping::Return); + ChangedValues[RetF] = + MergeValues(SS.getValueState(RegI), SS.getValueState(RetF)); + } + + /// Handle call sites. The state of a called function's formal arguments is + /// the merge of the argument state with the call sites corresponding actual + /// argument state. The call site state is the merge of the call site state + /// with the returned value state of the called function. + void visitCallSite(CallSite CS, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + Function *F = CS.getCalledFunction(); + Instruction *I = CS.getInstruction(); + auto RegI = CVPLatticeKey(I, IPOGrouping::Register); + + // If this is an indirect call, save it so we can quickly revisit it when + // attaching metadata. + if (!F) + IndirectCalls.insert(I); + + // If we can't track the function's return values, there's nothing to do. + if (!F || !canTrackReturnsInterprocedurally(F)) { + ChangedValues[RegI] = getOverdefinedVal(); + return; + } + + // Inform the solver that the called function is executable, and perform + // the merges for the arguments and return value. + SS.MarkBlockExecutable(&F->front()); + auto RetF = CVPLatticeKey(F, IPOGrouping::Return); + for (Argument &A : F->args()) { + auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register); + auto RegActual = + CVPLatticeKey(CS.getArgument(A.getArgNo()), IPOGrouping::Register); + ChangedValues[RegFormal] = + MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual)); + } + ChangedValues[RegI] = + MergeValues(SS.getValueState(RegI), SS.getValueState(RetF)); + } + + /// Handle select instructions. The select instruction state is the merge the + /// true and false value states. + void visitSelect(SelectInst &I, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + auto RegI = CVPLatticeKey(&I, IPOGrouping::Register); + auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register); + auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register); + ChangedValues[RegI] = + MergeValues(SS.getValueState(RegT), SS.getValueState(RegF)); + } + + /// Handle load instructions. If the pointer operand of the load is a global + /// variable, we attempt to track the value. The loaded value state is the + /// merge of the loaded value state with the global variable state. + void visitLoad(LoadInst &I, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + auto RegI = CVPLatticeKey(&I, IPOGrouping::Register); + if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) { + auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory); + ChangedValues[RegI] = + MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV)); + } else { + ChangedValues[RegI] = getOverdefinedVal(); + } + } + + /// Handle store instructions. If the pointer operand of the store is a + /// global variable, we attempt to track the value. The global variable state + /// is the merge of the stored value state with the global variable state. + void visitStore(StoreInst &I, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand()); + if (!GV) + return; + auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register); + auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory); + ChangedValues[MemGV] = + MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV)); + } + + /// Handle all other instructions. All other instructions are marked + /// overdefined. + void visitInst(Instruction &I, + DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues, + SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) { + auto RegI = CVPLatticeKey(&I, IPOGrouping::Register); + ChangedValues[RegI] = getOverdefinedVal(); + } +}; +} // namespace + +namespace llvm { +/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver +/// must translate between LatticeKeys and LLVM Values when adding Values to +/// its work list and inspecting the state of control-flow related values. +template <> struct LatticeKeyInfo<CVPLatticeKey> { + static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) { + return Key.getPointer(); + } + static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) { + return CVPLatticeKey(V, IPOGrouping::Register); + } +}; +} // namespace llvm + +static bool runCVP(Module &M) { + // Our custom lattice function and generic sparse propagation solver. + CVPLatticeFunc Lattice; + SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice); + + // For each function in the module, if we can't track its arguments, let the + // generic solver assume it is executable. + for (Function &F : M) + if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F)) + Solver.MarkBlockExecutable(&F.front()); + + // Solver our custom lattice. In doing so, we will also build a set of + // indirect call sites. + Solver.Solve(); + + // Attach metadata to the indirect call sites that were collected indicating + // the set of functions they can possibly target. + bool Changed = false; + MDBuilder MDB(M.getContext()); + for (Instruction *C : Lattice.getIndirectCalls()) { + CallSite CS(C); + auto RegI = CVPLatticeKey(CS.getCalledValue(), IPOGrouping::Register); + CVPLatticeVal LV = Solver.getExistingValueState(RegI); + if (!LV.isFunctionSet() || LV.getFunctions().empty()) + continue; + MDNode *Callees = MDB.createCallees(SmallVector<Function *, 4>( + LV.getFunctions().begin(), LV.getFunctions().end())); + C->setMetadata(LLVMContext::MD_callees, Callees); + Changed = true; + } + + return Changed; +} + +PreservedAnalyses CalledValuePropagationPass::run(Module &M, + ModuleAnalysisManager &) { + runCVP(M); + return PreservedAnalyses::all(); +} + +namespace { +class CalledValuePropagationLegacyPass : public ModulePass { +public: + static char ID; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + CalledValuePropagationLegacyPass() : ModulePass(ID) { + initializeCalledValuePropagationLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + return runCVP(M); + } +}; +} // namespace + +char CalledValuePropagationLegacyPass::ID = 0; +INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation", + "Called Value Propagation", false, false) + +ModulePass *llvm::createCalledValuePropagationPass() { + return new CalledValuePropagationLegacyPass(); +} diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index 62b5a9c9ba26..e0b1037053f0 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -19,16 +19,23 @@ #include "llvm/Transforms/IPO/ConstantMerge.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" +#include <algorithm> +#include <cassert> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "constmerge" @@ -102,8 +109,7 @@ static bool mergeConstants(Module &M) { // constants together may allow us to merge other constants together if the // second level constants have initializers which point to the globals that // were just merged. - while (1) { - + while (true) { // First: Find the canonical constants others will be merged with. for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { @@ -225,23 +231,27 @@ PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) { } namespace { + struct ConstantMergeLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid + ConstantMergeLegacyPass() : ModulePass(ID) { initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry()); } // For this pass, process all of the globals in the module, eliminating // duplicate constants. - bool runOnModule(Module &M) { + bool runOnModule(Module &M) override { if (skipModule(M)) return false; return mergeConstants(M); } }; -} + +} // end anonymous namespace char ConstantMergeLegacyPass::ID = 0; + INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge", "Merge Duplicate Global Constants", false, false) diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp index d94aa5da8560..886029ea58d5 100644 --- a/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/CrossDSOCFI.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -31,7 +31,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -81,7 +80,7 @@ ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) { void CrossDSOCFI::buildCFICheck(Module &M) { // FIXME: verify that __cfi_check ends up near the end of the code section, // but before the jump slots created in LowerTypeTests. - llvm::DenseSet<uint64_t> TypeIds; + SetVector<uint64_t> TypeIds; SmallVector<MDNode *, 2> Types; for (GlobalObject &GO : M.global_objects()) { Types.clear(); @@ -115,6 +114,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { // linker knows about the symbol; this pass replaces the function body. F->deleteBody(); F->setAlignment(4096); + + Triple T(M.getTargetTriple()); + if (T.isARM() || T.isThumb()) + F->addFnAttr("target-features", "+thumb-mode"); + auto args = F->arg_begin(); Value &CallSiteTypeId = *(args++); CallSiteTypeId.setName("CallSiteTypeId"); diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 8e26849ea9e3..5446541550e5 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -1,4 +1,4 @@ -//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===// +//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===// // // The LLVM Compiler Infrastructure // @@ -20,24 +20,36 @@ #include "llvm/Transforms/IPO/DeadArgumentElimination.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include <set> -#include <tuple> +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "deadargelim" @@ -46,9 +58,10 @@ STATISTIC(NumArgumentsEliminated, "Number of unread args removed"); STATISTIC(NumRetValsEliminated , "Number of unused return values removed"); STATISTIC(NumArgumentsReplacedWithUndef, "Number of unread args replaced with undef"); + namespace { + /// DAE - The dead argument elimination pass. - /// class DAE : public ModulePass { protected: // DAH uses this to specify a different ID. @@ -56,6 +69,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid + DAE() : ModulePass(ID) { initializeDAEPass(*PassRegistry::getPassRegistry()); } @@ -71,33 +85,38 @@ namespace { virtual bool ShouldHackArguments() const { return false; } }; -} +} // end anonymous namespace char DAE::ID = 0; + INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false) namespace { + /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but /// deletes arguments to functions which are external. This is only for use /// by bugpoint. struct DAH : public DAE { static char ID; + DAH() : DAE(ID) {} bool ShouldHackArguments() const override { return true; } }; -} + +} // end anonymous namespace char DAH::ID = 0; + INITIALIZE_PASS(DAH, "deadarghaX0r", "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)", false, false) /// createDeadArgEliminationPass - This pass removes arguments from functions /// which are not used by the body of the function. -/// ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); } + ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); } /// DeleteDeadVarargs - If this is an function that takes a ... list, and if @@ -140,7 +159,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // the old function, but doesn't have isVarArg set. FunctionType *FTy = Fn.getFunctionType(); - std::vector<Type*> Params(FTy->param_begin(), FTy->param_end()); + std::vector<Type *> Params(FTy->param_begin(), FTy->param_end()); FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), Params, false); unsigned NumArgs = Params.size(); @@ -155,7 +174,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // Loop over all of the callers of the function, transforming the call sites // to pass in a smaller number of arguments into the new function. // - std::vector<Value*> Args; + std::vector<Value *> Args; for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) { CallSite CS(*I++); if (!CS) @@ -214,7 +233,6 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { // Loop over the argument list, transferring uses of the old arguments over to // the new arguments, also transferring over the names as well. While we're at // it, remove the dead arguments from the DeadArguments list. - // for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), I2 = NF->arg_begin(); I != E; ++I, ++I2) { // Move the name and users over to the new version. @@ -343,7 +361,6 @@ DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use, return MaybeLive; } - /// SurveyUse - This looks at a single use of an argument or return value /// and determines if it should be alive or not. Adds this use to MaybeLiveUses /// if it causes the used value to become MaybeLive. @@ -460,7 +477,6 @@ DeadArgumentEliminationPass::SurveyUses(const Value *V, // // We consider arguments of non-internal functions to be intrinsically alive as // well as arguments to functions which have their "address taken". -// void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { // Functions with inalloca parameters are expecting args in a particular // register and memory layout. @@ -478,11 +494,14 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { } unsigned RetCount = NumRetVals(&F); + // Assume all return values are dead - typedef SmallVector<Liveness, 5> RetVals; + using RetVals = SmallVector<Liveness, 5>; + RetVals RetValLiveness(RetCount, MaybeLive); - typedef SmallVector<UseVector, 5> RetUses; + using RetUses = SmallVector<UseVector, 5>; + // These vectors map each return value to the uses that make it MaybeLive, so // we can add those to the Uses map if the return value really turns out to be // MaybeLive. Initialized to a list of RetCount empty lists. @@ -601,15 +620,15 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses) { switch (L) { - case Live: MarkLive(RA); break; + case Live: + MarkLive(RA); + break; case MaybeLive: - { // Note any uses of this value, so this return value can be // marked live whenever one of the uses becomes live. for (const auto &MaybeLiveUse : MaybeLiveUses) Uses.insert(std::make_pair(MaybeLiveUse, RA)); break; - } } } @@ -762,7 +781,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // One return type? Just a simple value then, but only if we didn't use to // return a struct with that simple value before. NRetTy = RetTypes.front(); - else if (RetTypes.size() == 0) + else if (RetTypes.empty()) // No return types? Make it void, but only if we didn't use to return {}. NRetTy = Type::getVoidTy(F->getContext()); } @@ -808,7 +827,6 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Loop over all of the callers of the function, transforming the call sites // to pass in a smaller number of arguments into the new function. - // std::vector<Value*> Args; while (!F->use_empty()) { CallSite CS(F->user_back()); diff --git a/lib/Transforms/IPO/ElimAvailExtern.cpp b/lib/Transforms/IPO/ElimAvailExtern.cpp index ecff88c88dcb..d5fef59286dd 100644 --- a/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -1,5 +1,4 @@ -//===-- ElimAvailExtern.cpp - DCE unreachable internal functions -//----------------===// +//===- ElimAvailExtern.cpp - DCE unreachable internal functions -----------===// // // The LLVM Compiler Infrastructure // @@ -15,11 +14,15 @@ #include "llvm/Transforms/IPO/ElimAvailExtern.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/GlobalStatus.h" + using namespace llvm; #define DEBUG_TYPE "elim-avail-extern" @@ -69,8 +72,10 @@ EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) { } namespace { + struct EliminateAvailableExternallyLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid + EliminateAvailableExternallyLegacyPass() : ModulePass(ID) { initializeEliminateAvailableExternallyLegacyPassPass( *PassRegistry::getPassRegistry()); @@ -78,16 +83,17 @@ struct EliminateAvailableExternallyLegacyPass : public ModulePass { // run - Do the EliminateAvailableExternally pass on the specified module, // optionally updating the specified callgraph to reflect the changes. - // - bool runOnModule(Module &M) { + bool runOnModule(Module &M) override { if (skipModule(M)) return false; return eliminateAvailableExternally(M); } }; -} + +} // end anonymous namespace char EliminateAvailableExternallyLegacyPass::ID = 0; + INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern", "Eliminate Available Externally Globals", false, false) diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index d1147f7d844b..042cacb70ad0 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -12,8 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SetVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" diff --git a/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/lib/Transforms/IPO/ForceFunctionAttrs.cpp index 968712138208..325a5d77aadb 100644 --- a/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -52,11 +52,13 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) { .Case("returns_twice", Attribute::ReturnsTwice) .Case("safestack", Attribute::SafeStack) .Case("sanitize_address", Attribute::SanitizeAddress) + .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress) .Case("sanitize_memory", Attribute::SanitizeMemory) .Case("sanitize_thread", Attribute::SanitizeThread) .Case("ssp", Attribute::StackProtect) .Case("sspreq", Attribute::StackProtectReq) .Case("sspstrong", Attribute::StackProtectStrong) + .Case("strictfp", Attribute::StrictFP) .Case("uwtable", Attribute::UWTable) .Default(Attribute::None); } diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 813a4b6e2831..5352e32479bb 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -6,34 +6,61 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -/// +// /// \file /// This file implements interprocedural passes which walk the /// call-graph deducing and/or propagating function attributes. -/// +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/CaptureTracking.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include <cassert> +#include <iterator> +#include <map> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "functionattrs" @@ -57,8 +84,10 @@ static cl::opt<bool> EnableNonnullArgPropagation( "caller functions.")); namespace { -typedef SmallSetVector<Function *, 8> SCCNodeSet; -} + +using SCCNodeSet = SmallSetVector<Function *, 8>; + +} // end anonymous namespace /// Returns the memory access attribute for function F using AAR for AA results, /// where SCCNodes is the current SCC. @@ -101,17 +130,18 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, SCCNodes.count(CS.getCalledFunction())) continue; FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS); + ModRefInfo MRI = createModRefInfo(MRB); // If the call doesn't access memory, we're done. - if (!(MRB & MRI_ModRef)) + if (isNoModRef(MRI)) continue; if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) { // The call could access any memory. If that includes writes, give up. - if (MRB & MRI_Mod) + if (isModSet(MRI)) return MAK_MayWrite; // If it reads, note it. - if (MRB & MRI_Ref) + if (isRefSet(MRI)) ReadsMemory = true; continue; } @@ -133,10 +163,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) continue; - if (MRB & MRI_Mod) + if (isModSet(MRI)) // Writes non-local memory. Give up. return MAK_MayWrite; - if (MRB & MRI_Ref) + if (isRefSet(MRI)) // Ok, it reads non-local memory. ReadsMemory = true; } @@ -237,6 +267,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { } namespace { + /// For a given pointer Argument, this retains a list of Arguments of functions /// in the same SCC that the pointer data flows into. We use this to build an /// SCC of the arguments. @@ -248,7 +279,7 @@ struct ArgumentGraphNode { class ArgumentGraph { // We store pointers to ArgumentGraphNode objects, so it's important that // that they not move around upon insert. - typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy; + using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>; ArgumentMapTy ArgumentMap; @@ -263,7 +294,7 @@ class ArgumentGraph { public: ArgumentGraph() { SyntheticRoot.Definition = nullptr; } - typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator; + using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator; iterator begin() { return SyntheticRoot.Uses.begin(); } iterator end() { return SyntheticRoot.Uses.end(); } @@ -281,8 +312,7 @@ public: /// consider that a capture, instead adding it to the "Uses" list and /// continuing with the analysis. struct ArgumentUsesTracker : public CaptureTracker { - ArgumentUsesTracker(const SCCNodeSet &SCCNodes) - : Captured(false), SCCNodes(SCCNodes) {} + ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {} void tooManyUses() override { Captured = true; } @@ -331,37 +361,45 @@ struct ArgumentUsesTracker : public CaptureTracker { return false; } - bool Captured; // True only if certainly captured (used outside our SCC). - SmallVector<Argument *, 4> Uses; // Uses within our SCC. + // True only if certainly captured (used outside our SCC). + bool Captured = false; + + // Uses within our SCC. + SmallVector<Argument *, 4> Uses; const SCCNodeSet &SCCNodes; }; -} + +} // end anonymous namespace namespace llvm { + template <> struct GraphTraits<ArgumentGraphNode *> { - typedef ArgumentGraphNode *NodeRef; - typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; + using NodeRef = ArgumentGraphNode *; + using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator; static NodeRef getEntryNode(NodeRef A) { return A; } static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); } static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); } }; + template <> struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); } + static ChildIteratorType nodes_begin(ArgumentGraph *AG) { return AG->begin(); } + static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); } }; -} + +} // end namespace llvm /// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind determinePointerReadAttrs(Argument *A, const SmallPtrSet<Argument *, 8> &SCCNodes) { - SmallVector<Use *, 32> Worklist; SmallSet<Use *, 32> Visited; @@ -502,8 +540,8 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { continue; // There is nothing to do if an argument is already marked as 'returned'. - if (any_of(F->args(), - [](const Argument &Arg) { return Arg.hasReturnedAttr(); })) + if (llvm::any_of(F->args(), + [](const Argument &Arg) { return Arg.hasReturnedAttr(); })) continue; auto FindRetArg = [&]() -> Value * { @@ -884,11 +922,13 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) FlowsToReturn.insert(Ret->getReturnValue()); + auto &DL = F->getParent()->getDataLayout(); + for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { Value *RetVal = FlowsToReturn[i]; // If this value is locally known to be non-null, we're good - if (isKnownNonNull(RetVal)) + if (isKnownNonZero(RetVal, DL)) continue; // Otherwise, we need to look upwards since we can't make any local @@ -1135,8 +1175,11 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C, } namespace { + struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass { - static char ID; // Pass identification, replacement for typeid + // Pass identification, replacement for typeid + static char ID; + PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) { initializePostOrderFunctionAttrsLegacyPassPass( *PassRegistry::getPassRegistry()); @@ -1151,7 +1194,8 @@ struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass { CallGraphSCCPass::getAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char PostOrderFunctionAttrsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs", @@ -1214,8 +1258,11 @@ bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) { } namespace { + struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass { - static char ID; // Pass identification, replacement for typeid + // Pass identification, replacement for typeid + static char ID; + ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) { initializeReversePostOrderFunctionAttrsLegacyPassPass( *PassRegistry::getPassRegistry()); @@ -1229,9 +1276,11 @@ struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass { AU.addPreserved<CallGraphWrapperPass>(); } }; -} + +} // end anonymous namespace char ReversePostOrderFunctionAttrsLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs", "Deduce function attributes in RPO", false, false) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) @@ -1291,7 +1340,7 @@ static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) { } bool Changed = false; - for (auto *F : reverse(Worklist)) + for (auto *F : llvm::reverse(Worklist)) Changed |= addNoRecurseAttrsTopDown(*F); return Changed; diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 233a36d2bc54..b6d6201cd23b 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -12,30 +12,54 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/FunctionImport.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSet.h" -#include "llvm/ADT/Triple.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/AutoUpgrade.h" -#include "llvm/IR/DiagnosticPrinter.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" -#include "llvm/Object/IRObjectFile.h" +#include "llvm/Linker/IRMover.h" +#include "llvm/Object/ModuleSymbolTable.h" +#include "llvm/Object/SymbolicFile.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Internalize.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" - -#define DEBUG_TYPE "function-import" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <memory> +#include <set> +#include <string> +#include <system_error> +#include <tuple> +#include <utility> using namespace llvm; +#define DEBUG_TYPE "function-import" + STATISTIC(NumImportedFunctions, "Number of functions imported"); STATISTIC(NumImportedModules, "Number of modules imported from"); STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); @@ -61,7 +85,7 @@ static cl::opt<float> ImportHotInstrFactor( "before processing newly imported functions")); static cl::opt<float> ImportHotMultiplier( - "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"), + "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"), cl::desc("Multiply the `import-instr-limit` threshold for hot callsites")); static cl::opt<float> ImportCriticalMultiplier( @@ -91,6 +115,18 @@ static cl::opt<bool> EnableImportMetadata( ), cl::Hidden, cl::desc("Enable import metadata like 'thinlto_src_module'")); +/// Summary file to use for function importing when using -function-import from +/// the command line. +static cl::opt<std::string> + SummaryFile("summary-file", + cl::desc("The summary file to use for function importing.")); + +/// Used when testing importing from distributed indexes via opt +// -function-import. +static cl::opt<bool> + ImportAllIndex("import-all-index", + cl::desc("Import all external functions in index.")); + // Load lazily a module from \p FileName in \p Context. static std::unique_ptr<Module> loadFile(const std::string &FileName, LLVMContext &Context) { @@ -109,8 +145,6 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName, return Result; } -namespace { - /// Given a list of possible callee implementation for a call site, select one /// that fits the \p Threshold. /// @@ -129,21 +163,26 @@ selectCallee(const ModuleSummaryIndex &Index, CalleeSummaryList, [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) { auto *GVSummary = SummaryPtr.get(); + // For SamplePGO, in computeImportForFunction the OriginalId + // may have been used to locate the callee summary list (See + // comment there). + // The mapping from OriginalId to GUID may return a GUID + // that corresponds to a static variable. Filter it out here. + // This can happen when + // 1) There is a call to a library function which is not defined + // in the index. + // 2) There is a static variable with the OriginalGUID identical + // to the GUID of the library function in 1); + // When this happens, the logic for SamplePGO kicks in and + // the static variable in 2) will be found, which needs to be + // filtered out. + if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) + return false; if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) // There is no point in importing these, we can't inline them return false; - if (auto *AS = dyn_cast<AliasSummary>(GVSummary)) { - GVSummary = &AS->getAliasee(); - // Alias can't point to "available_externally". However when we import - // linkOnceODR the linkage does not change. So we import the alias - // and aliasee only in this case. - // FIXME: we should import alias as available_externally *function*, - // the destination module does need to know it is an alias. - if (!GlobalValue::isLinkOnceODRLinkage(GVSummary->linkage())) - return false; - } - auto *Summary = cast<FunctionSummary>(GVSummary); + auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject()); // If this is a local function, make sure we import the copy // in the caller's module. The only time a local function can @@ -174,9 +213,28 @@ selectCallee(const ModuleSummaryIndex &Index, return cast<GlobalValueSummary>(It->get()); } +namespace { + using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */, GlobalValue::GUID>; +} // anonymous namespace + +static ValueInfo +updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { + if (!VI.getSummaryList().empty()) + return VI; + // For SamplePGO, the indirect call targets for local functions will + // have its original name annotated in profile. We try to find the + // corresponding PGOFuncName as the GUID. + // FIXME: Consider updating the edges in the graph after building + // it, rather than needing to perform this mapping on each walk. + auto GUID = Index.getGUIDFromOriginalID(VI.getGUID()); + if (GUID == 0) + return nullptr; + return Index.getValueInfo(GUID); +} + /// Compute the list of functions to import for a given caller. Mark these /// imported functions and the symbols they reference in their source module as /// exported from their source module. @@ -191,17 +249,9 @@ static void computeImportForFunction( DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold << "\n"); - if (VI.getSummaryList().empty()) { - // For SamplePGO, the indirect call targets for local functions will - // have its original name annotated in profile. We try to find the - // corresponding PGOFuncName as the GUID. - auto GUID = Index.getGUIDFromOriginalID(VI.getGUID()); - if (GUID == 0) - continue; - VI = Index.getValueInfo(GUID); - if (!VI) - continue; - } + VI = updateValueInfoForIndirectCalls(Index, VI); + if (!VI) + continue; if (DefinedGVSummaries.count(VI.getGUID())) { DEBUG(dbgs() << "ignored! Target already in destination module.\n"); @@ -227,16 +277,9 @@ static void computeImportForFunction( DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n"); continue; } - // "Resolve" the summary, traversing alias, - const FunctionSummary *ResolvedCalleeSummary; - if (isa<AliasSummary>(CalleeSummary)) { - ResolvedCalleeSummary = cast<FunctionSummary>( - &cast<AliasSummary>(CalleeSummary)->getAliasee()); - assert( - GlobalValue::isLinkOnceODRLinkage(ResolvedCalleeSummary->linkage()) && - "Unexpected alias to a non-linkonceODR in import list"); - } else - ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary); + + // "Resolve" the summary + const auto *ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary->getBaseObject()); assert(ResolvedCalleeSummary->instCount() <= NewThreshold && "selectCallee() didn't honor the threshold"); @@ -312,14 +355,12 @@ static void ComputeImportForModule( DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); continue; } - auto *Summary = GVSummary.second; - if (auto *AS = dyn_cast<AliasSummary>(Summary)) - Summary = &AS->getAliasee(); - auto *FuncSummary = dyn_cast<FunctionSummary>(Summary); + auto *FuncSummary = + dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject()); if (!FuncSummary) // Skip import for global variables continue; - DEBUG(dbgs() << "Initalize import for " << GVSummary.first << "\n"); + DEBUG(dbgs() << "Initialize import for " << GVSummary.first << "\n"); computeImportForFunction(*FuncSummary, Index, ImportInstrLimit, DefinedGVSummaries, Worklist, ImportList, ExportLists); @@ -344,8 +385,6 @@ static void ComputeImportForModule( } } -} // anonymous namespace - /// Compute all the import and export for every module using the Index. void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, @@ -395,11 +434,23 @@ void llvm::ComputeCrossModuleImport( #endif } +#ifndef NDEBUG +static void dumpImportListForModule(StringRef ModulePath, + FunctionImporter::ImportMapTy &ImportList) { + DEBUG(dbgs() << "* Module " << ModulePath << " imports from " + << ImportList.size() << " modules.\n"); + for (auto &Src : ImportList) { + auto SrcModName = Src.first(); + DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from " + << SrcModName << "\n"); + } +} +#endif + /// Compute all the imports for the given module in the Index. void llvm::ComputeCrossModuleImportForModule( StringRef ModulePath, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList) { - // Collect the list of functions this module defines. // GUID -> Summary GVSummaryMapTy FunctionSummaryMap; @@ -410,13 +461,34 @@ void llvm::ComputeCrossModuleImportForModule( ComputeImportForModule(FunctionSummaryMap, Index, ImportList); #ifndef NDEBUG - DEBUG(dbgs() << "* Module " << ModulePath << " imports from " - << ImportList.size() << " modules.\n"); - for (auto &Src : ImportList) { - auto SrcModName = Src.first(); - DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from " - << SrcModName << "\n"); + dumpImportListForModule(ModulePath, ImportList); +#endif +} + +// Mark all external summaries in Index for import into the given module. +// Used for distributed builds using a distributed index. +void llvm::ComputeCrossModuleImportForModuleFromIndex( + StringRef ModulePath, const ModuleSummaryIndex &Index, + FunctionImporter::ImportMapTy &ImportList) { + for (auto &GlobalList : Index) { + // Ignore entries for undefined references. + if (GlobalList.second.SummaryList.empty()) + continue; + + auto GUID = GlobalList.first; + assert(GlobalList.second.SummaryList.size() == 1 && + "Expected individual combined index to have one summary per GUID"); + auto &Summary = GlobalList.second.SummaryList[0]; + // Skip the summaries for the importing module. These are included to + // e.g. record required linkage changes. + if (Summary->modulePath() == ModulePath) + continue; + // Doesn't matter what value we plug in to the map, just needs an entry + // to provoke importing by thinBackend. + ImportList[Summary->modulePath()][GUID] = 1; } +#ifndef NDEBUG + dumpImportListForModule(ModulePath, ImportList); #endif } @@ -453,6 +525,17 @@ void llvm::computeDeadSymbols( // Make value live and add it to the worklist if it was not live before. // FIXME: we should only make the prevailing copy live here auto visit = [&](ValueInfo VI) { + // FIXME: If we knew which edges were created for indirect call profiles, + // we could skip them here. Any that are live should be reached via + // other edges, e.g. reference edges. Otherwise, using a profile collected + // on a slightly different binary might provoke preserving, importing + // and ultimately promoting calls to functions not linked into this + // binary, which increases the binary size unnecessarily. Note that + // if this code changes, the importer needs to change so that edges + // to functions marked dead are skipped. + VI = updateValueInfoForIndirectCalls(Index, VI); + if (!VI) + return; for (auto &S : VI.getSummaryList()) if (S->isLive()) return; @@ -465,17 +548,12 @@ void llvm::computeDeadSymbols( while (!Worklist.empty()) { auto VI = Worklist.pop_back_val(); for (auto &Summary : VI.getSummaryList()) { - for (auto Ref : Summary->refs()) + GlobalValueSummary *Base = Summary->getBaseObject(); + for (auto Ref : Base->refs()) visit(Ref); - if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) + if (auto *FS = dyn_cast<FunctionSummary>(Base)) for (auto Call : FS->calls()) visit(Call.first); - if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) { - auto AliaseeGUID = AS->getAliasee().getOriginalName(); - ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID); - if (AliaseeVI) - visit(AliaseeVI); - } } } Index.setWithGlobalValueDeadStripping(); @@ -600,23 +678,9 @@ void llvm::thinLTOResolveWeakForLinkerModule( /// Run internalization on \p TheModule based on symmary analysis. void llvm::thinLTOInternalizeModule(Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { - // Parse inline ASM and collect the list of symbols that are not defined in - // the current module. - StringSet<> AsmUndefinedRefs; - ModuleSymbolTable::CollectAsmSymbols( - TheModule, - [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) { - if (Flags & object::BasicSymbolRef::SF_Undefined) - AsmUndefinedRefs.insert(Name); - }); - // Declare a callback for the internalize pass that will ask for every // candidate GlobalValue if it can be internalized or not. auto MustPreserveGV = [&](const GlobalValue &GV) -> bool { - // Can't be internalized if referenced in inline asm. - if (AsmUndefinedRefs.count(GV.getName())) - return true; - // Lookup the linkage recorded in the summaries during global analysis. auto GS = DefinedGlobals.find(GV.getGUID()); if (GS == DefinedGlobals.end()) { @@ -647,12 +711,25 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // FIXME: See if we can just internalize directly here via linkage changes // based on the index, rather than invoking internalizeModule. - llvm::internalizeModule(TheModule, MustPreserveGV); + internalizeModule(TheModule, MustPreserveGV); +} + +/// Make alias a clone of its aliasee. +static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) { + Function *Fn = cast<Function>(GA->getBaseObject()); + + ValueToValueMapTy VMap; + Function *NewFn = CloneFunction(Fn, VMap); + // Clone should use the original alias's linkage and name, and we ensure + // all uses of alias instead use the new clone (casted if necessary). + NewFn->setLinkage(GA->getLinkage()); + GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType())); + NewFn->takeName(GA); + return NewFn; } // Automatically import functions in Module \p DestModule based on the summaries // index. -// Expected<bool> FunctionImporter::importFunctions( Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) { DEBUG(dbgs() << "Starting import for Module " @@ -699,10 +776,9 @@ Expected<bool> FunctionImporter::importFunctions( // Add 'thinlto_src_module' metadata for statistics and debugging. F.setMetadata( "thinlto_src_module", - llvm::MDNode::get( - DestModule.getContext(), - {llvm::MDString::get(DestModule.getContext(), - SrcModule->getSourceFileName())})); + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); } GlobalsToImport.insert(&F); } @@ -722,13 +798,6 @@ Expected<bool> FunctionImporter::importFunctions( } } for (GlobalAlias &GA : SrcModule->aliases()) { - // FIXME: This should eventually be controlled entirely by the summary. - if (FunctionImportGlobalProcessing::doImportAsDefinition( - &GA, &GlobalsToImport)) { - GlobalsToImport.insert(&GA); - continue; - } - if (!GA.hasName()) continue; auto GUID = GA.getGUID(); @@ -737,25 +806,25 @@ Expected<bool> FunctionImporter::importFunctions( << " " << GA.getName() << " from " << SrcModule->getSourceFileName() << "\n"); if (Import) { - // Alias can't point to "available_externally". However when we import - // linkOnceODR the linkage does not change. So we import the alias - // and aliasee only in this case. This has been handled by - // computeImportForFunction() - GlobalObject *GO = GA.getBaseObject(); - assert(GO->hasLinkOnceODRLinkage() && - "Unexpected alias to a non-linkonceODR in import list"); -#ifndef NDEBUG - if (!GlobalsToImport.count(GO)) - DEBUG(dbgs() << " alias triggers importing aliasee " << GO->getGUID() - << " " << GO->getName() << " from " - << SrcModule->getSourceFileName() << "\n"); -#endif - if (Error Err = GO->materialize()) - return std::move(Err); - GlobalsToImport.insert(GO); if (Error Err = GA.materialize()) return std::move(Err); - GlobalsToImport.insert(&GA); + // Import alias as a copy of its aliasee. + GlobalObject *Base = GA.getBaseObject(); + if (Error Err = Base->materialize()) + return std::move(Err); + auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA); + DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID() + << " " << Base->getName() << " from " + << SrcModule->getSourceFileName() << "\n"); + if (EnableImportMetadata) { + // Add 'thinlto_src_module' metadata for statistics and debugging. + Fn->setMetadata( + "thinlto_src_module", + MDNode::get(DestModule.getContext(), + {MDString::get(DestModule.getContext(), + SrcModule->getSourceFileName())})); + } + GlobalsToImport.insert(Fn); } } @@ -789,12 +858,6 @@ Expected<bool> FunctionImporter::importFunctions( return ImportedCount; } -/// Summary file to use for function importing when using -function-import from -/// the command line. -static cl::opt<std::string> - SummaryFile("summary-file", - cl::desc("The summary file to use for function importing.")); - static bool doImportingForModule(Module &M) { if (SummaryFile.empty()) report_fatal_error("error: -function-import requires -summary-file\n"); @@ -809,8 +872,15 @@ static bool doImportingForModule(Module &M) { // First step is collecting the import list. FunctionImporter::ImportMapTy ImportList; - ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index, - ImportList); + // If requested, simply import all functions in the index. This is used + // when testing distributed backend handling via the opt tool, when + // we have distributed indexes containing exactly the summaries to import. + if (ImportAllIndex) + ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index, + ImportList); + else + ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index, + ImportList); // Conservatively mark all internal values as promoted. This interface is // only used when doing importing via the function importing pass. The pass @@ -848,17 +918,18 @@ static bool doImportingForModule(Module &M) { } namespace { + /// Pass that performs cross-module function import provided a summary file. class FunctionImportLegacyPass : public ModulePass { public: /// Pass identification, replacement for typeid static char ID; + explicit FunctionImportLegacyPass() : ModulePass(ID) {} + /// Specify pass name for debug output StringRef getPassName() const override { return "Function Importing"; } - explicit FunctionImportLegacyPass() : ModulePass(ID) {} - bool runOnModule(Module &M) override { if (skipModule(M)) return false; @@ -866,7 +937,8 @@ public: return doImportingForModule(M); } }; -} // anonymous namespace + +} // end anonymous namespace PreservedAnalyses FunctionImportPass::run(Module &M, ModuleAnalysisManager &AM) { @@ -881,7 +953,9 @@ INITIALIZE_PASS(FunctionImportLegacyPass, "function-import", "Summary Based Function Import", false, false) namespace llvm { + Pass *createFunctionImportPass() { return new FunctionImportLegacyPass(); } -} + +} // end namespace llvm diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index c91e8b454927..ada9eb80e680 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -18,7 +18,6 @@ #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" @@ -115,7 +114,7 @@ void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { ComputeDependencies(User, Deps); Deps.erase(&GV); // Remove self-reference. for (GlobalValue *GVU : Deps) { - GVDependencies.insert(std::make_pair(GVU, &GV)); + GVDependencies[GVU].insert(&GV); } } @@ -199,8 +198,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { AliveGlobals.end()}; while (!NewLiveGVs.empty()) { GlobalValue *LGV = NewLiveGVs.pop_back_val(); - for (auto &&GVD : make_range(GVDependencies.equal_range(LGV))) - MarkLive(*GVD.second, &NewLiveGVs); + for (auto *GVD : GVDependencies[LGV]) + MarkLive(*GVD, &NewLiveGVs); } // Now that all globals which are needed are in the AliveGlobals set, we loop diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 93eab680ca6b..4bb2984e3b47 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -20,22 +20,41 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -45,7 +64,11 @@ #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/Local.h" -#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "globalopt" @@ -139,7 +162,7 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { } V = I->getOperand(0); - } while (1); + } while (true); } /// This GV is a pointer root. Loop over all users of the global and clean up @@ -220,7 +243,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, break; I->eraseFromParent(); I = J; - } while (1); + } while (true); I->eraseFromParent(); } } @@ -348,7 +371,6 @@ static bool isSafeSROAElementUse(Value *V) { return true; } - /// U is a direct user of the specified global value. Look at it and its uses /// and decide whether it is safe to SROA this global. static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { @@ -402,11 +424,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { } } - for (User *UU : U->users()) - if (!isSafeSROAElementUse(UU)) - return false; - - return true; + return llvm::all_of(U->users(), + [](User *UU) { return isSafeSROAElementUse(UU); }); } /// Look at all uses of the global and decide whether it is safe for us to @@ -419,6 +438,27 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { return true; } +/// Copy over the debug info for a variable to its SRA replacements. +static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, + uint64_t FragmentOffsetInBits, + uint64_t FragmentSizeInBits, + unsigned NumElements) { + SmallVector<DIGlobalVariableExpression *, 1> GVs; + GV->getDebugInfo(GVs); + for (auto *GVE : GVs) { + DIVariable *Var = GVE->getVariable(); + DIExpression *Expr = GVE->getExpression(); + if (NumElements > 1) { + if (auto E = DIExpression::createFragmentExpression( + Expr, FragmentOffsetInBits, FragmentSizeInBits)) + Expr = *E; + else + return; + } + auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr); + NGV->addDebugInfo(NGVE); + } +} /// Perform scalar replacement of aggregates on the specified global variable. /// This opens the door for other optimizations by exposing the behavior of the @@ -434,7 +474,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); - std::vector<GlobalVariable*> NewGlobals; + std::vector<GlobalVariable *> NewGlobals; Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); // Get the alignment of the global, either explicit or target-specific. @@ -443,9 +483,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { StartAlignment = DL.getABITypeAlignment(GV->getType()); if (StructType *STy = dyn_cast<StructType>(Ty)) { - NewGlobals.reserve(STy->getNumElements()); + uint64_t FragmentOffset = 0; + unsigned NumElements = STy->getNumElements(); + NewGlobals.reserve(NumElements); const StructLayout &Layout = *DL.getStructLayout(STy); - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (unsigned i = 0, e = NumElements; i != e; ++i) { Constant *In = Init->getAggregateElement(i); assert(In && "Couldn't get element of initializer?"); GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, @@ -465,15 +507,22 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset); if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i))) NGV->setAlignment(NewAlign); + + // Copy over the debug info for the variable. + FragmentOffset = alignTo(FragmentOffset, NewAlign); + uint64_t Size = DL.getTypeSizeInBits(NGV->getValueType()); + transferSRADebugInfo(GV, NGV, FragmentOffset, Size, NumElements); + FragmentOffset += Size; } } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { unsigned NumElements = STy->getNumElements(); if (NumElements > 16 && GV->hasNUsesOrMore(16)) return nullptr; // It's not worth it. NewGlobals.reserve(NumElements); - - uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType()); - unsigned EltAlign = DL.getABITypeAlignment(STy->getElementType()); + auto ElTy = STy->getElementType(); + uint64_t EltSize = DL.getTypeAllocSize(ElTy); + unsigned EltAlign = DL.getABITypeAlignment(ElTy); + uint64_t FragmentSizeInBits = DL.getTypeSizeInBits(ElTy); for (unsigned i = 0, e = NumElements; i != e; ++i) { Constant *In = Init->getAggregateElement(i); assert(In && "Couldn't get element of initializer?"); @@ -494,6 +543,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); + transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits, + NumElements); } } @@ -689,7 +740,6 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { return Changed; } - /// The specified global has only one non-null value stored into it. If there /// are uses of the loaded value that would trap if the loaded value is /// dynamically null, then we know that they cannot be reachable with a null @@ -1045,7 +1095,6 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, return true; } - /// If all users of values loaded from GV are simple enough to perform HeapSRA, /// return true. static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, @@ -1095,9 +1144,9 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, } static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, - DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, - std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { - std::vector<Value*> &FieldVals = InsertedScalarizedValues[V]; + DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues, + std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) { + std::vector<Value *> &FieldVals = InsertedScalarizedValues[V]; if (FieldNo >= FieldVals.size()) FieldVals.resize(FieldNo+1); @@ -1139,8 +1188,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, /// Given a load instruction and a value derived from the load, rewrite the /// derived value to use the HeapSRoA'd load. static void RewriteHeapSROALoadUser(Instruction *LoadUser, - DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, - std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { + DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues, + std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) { // If this is a comparison against null, handle it. if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) { assert(isa<ConstantPointerNull>(SCI->getOperand(1))); @@ -1187,7 +1236,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, // processed. PHINode *PN = cast<PHINode>(LoadUser); if (!InsertedScalarizedValues.insert(std::make_pair(PN, - std::vector<Value*>())).second) + std::vector<Value *>())).second) return; // If this is the first time we've seen this PHI, recursively process all @@ -1202,8 +1251,8 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, /// global. Eliminate all uses of Ptr, making them use FieldGlobals instead. /// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA. static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, - DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, - std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { + DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues, + std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) { for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) { Instruction *User = cast<Instruction>(*UI++); RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite); @@ -1232,8 +1281,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // Okay, at this point, there are no users of the malloc. Insert N // new mallocs at the same place as CI, and N globals. - std::vector<Value*> FieldGlobals; - std::vector<Value*> FieldMallocs; + std::vector<Value *> FieldGlobals; + std::vector<Value *> FieldMallocs; SmallVector<OperandBundleDef, 1> OpBundles; CI->getOperandBundlesAsDefs(OpBundles); @@ -1330,10 +1379,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, /// As we process loads, if we can't immediately update all uses of the load, /// keep track of what scalarized loads are inserted for a given load. - DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues; + DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues; InsertedScalarizedValues[GV] = FieldGlobals; - std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite; + std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite; // Okay, the malloc site is completely handled. All of the uses of GV are now // loads, and all uses of those loads are simple. Rewrite them to use loads @@ -1379,7 +1428,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, } // Drop all inter-phi links and any loads that made it this far. - for (DenseMap<Value*, std::vector<Value*> >::iterator + for (DenseMap<Value *, std::vector<Value *>>::iterator I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end(); I != E; ++I) { if (PHINode *PN = dyn_cast<PHINode>(I->first)) @@ -1389,7 +1438,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, } // Delete all the phis and loads now that inter-references are dead. - for (DenseMap<Value*, std::vector<Value*> >::iterator + for (DenseMap<Value *, std::vector<Value *>>::iterator I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end(); I != E; ++I) { if (PHINode *PN = dyn_cast<PHINode>(I->first)) @@ -1576,12 +1625,57 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && "No reason to shrink to bool!"); + SmallVector<DIGlobalVariableExpression *, 1> GVs; + GV->getDebugInfo(GVs); + // If initialized to zero and storing one into the global, we can use a cast // instead of a select to synthesize the desired value. bool IsOneZero = false; - if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) + bool EmitOneOrZero = true; + if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)){ IsOneZero = InitVal->isNullValue() && CI->isOne(); + if (ConstantInt *CIInit = dyn_cast<ConstantInt>(GV->getInitializer())){ + uint64_t ValInit = CIInit->getZExtValue(); + uint64_t ValOther = CI->getZExtValue(); + uint64_t ValMinus = ValOther - ValInit; + + for(auto *GVe : GVs){ + DIGlobalVariable *DGV = GVe->getVariable(); + DIExpression *E = GVe->getExpression(); + + // It is expected that the address of global optimized variable is on + // top of the stack. After optimization, value of that variable will + // be ether 0 for initial value or 1 for other value. The following + // expression should return constant integer value depending on the + // value at global object address: + // val * (ValOther - ValInit) + ValInit: + // DW_OP_deref DW_OP_constu <ValMinus> + // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value + E = DIExpression::get(NewGV->getContext(), + {dwarf::DW_OP_deref, + dwarf::DW_OP_constu, + ValMinus, + dwarf::DW_OP_mul, + dwarf::DW_OP_constu, + ValInit, + dwarf::DW_OP_plus, + dwarf::DW_OP_stack_value}); + DIGlobalVariableExpression *DGVE = + DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E); + NewGV->addDebugInfo(DGVE); + } + EmitOneOrZero = false; + } + } + + if (EmitOneOrZero) { + // FIXME: This will only emit address for debugger on which will + // be written only 0 or 1. + for(auto *GV : GVs) + NewGV->addDebugInfo(GV); + } + while (!GV->use_empty()) { Instruction *UI = cast<Instruction>(GV->user_back()); if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { @@ -2202,7 +2296,7 @@ static void setUsedInitializer(GlobalVariable &V, // Type of pointer to the array of pointers. PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0); - SmallVector<llvm::Constant *, 8> UsedArray; + SmallVector<Constant *, 8> UsedArray; for (GlobalValue *GV : Init) { Constant *Cast = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy); @@ -2215,14 +2309,15 @@ static void setUsedInitializer(GlobalVariable &V, Module *M = V.getParent(); V.removeFromParent(); GlobalVariable *NV = - new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage, - llvm::ConstantArray::get(ATy, UsedArray), ""); + new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, UsedArray), ""); NV->takeName(&V); NV->setSection("llvm.metadata"); delete &V; } namespace { + /// An easy to access representation of llvm.used and llvm.compiler.used. class LLVMUsed { SmallPtrSet<GlobalValue *, 8> Used; @@ -2235,25 +2330,34 @@ public: UsedV = collectUsedGlobalVariables(M, Used, false); CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true); } - typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator; - typedef iterator_range<iterator> used_iterator_range; + + using iterator = SmallPtrSet<GlobalValue *, 8>::iterator; + using used_iterator_range = iterator_range<iterator>; + iterator usedBegin() { return Used.begin(); } iterator usedEnd() { return Used.end(); } + used_iterator_range used() { return used_iterator_range(usedBegin(), usedEnd()); } + iterator compilerUsedBegin() { return CompilerUsed.begin(); } iterator compilerUsedEnd() { return CompilerUsed.end(); } + used_iterator_range compilerUsed() { return used_iterator_range(compilerUsedBegin(), compilerUsedEnd()); } + bool usedCount(GlobalValue *GV) const { return Used.count(GV); } + bool compilerUsedCount(GlobalValue *GV) const { return CompilerUsed.count(GV); } + bool usedErase(GlobalValue *GV) { return Used.erase(GV); } bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); } bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; } + bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV).second; } @@ -2265,7 +2369,8 @@ public: setUsedInitializer(*CompilerUsedV, CompilerUsed); } }; -} + +} // end anonymous namespace static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) { if (GA.use_empty()) // No use at all. @@ -2580,8 +2685,10 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { } namespace { + struct GlobalOptLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid + GlobalOptLegacyPass() : ModulePass(ID) { initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2603,9 +2710,11 @@ struct GlobalOptLegacyPass : public ModulePass { AU.addRequired<DominatorTreeWrapperPass>(); } }; -} + +} // end anonymous namespace char GlobalOptLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) diff --git a/lib/Transforms/IPO/GlobalSplit.cpp b/lib/Transforms/IPO/GlobalSplit.cpp index e47d881d1127..792f4b3052a3 100644 --- a/lib/Transforms/IPO/GlobalSplit.cpp +++ b/lib/Transforms/IPO/GlobalSplit.cpp @@ -15,22 +15,30 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/GlobalSplit.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" - -#include <set> +#include <cstdint> +#include <vector> using namespace llvm; -namespace { - -bool splitGlobal(GlobalVariable &GV) { +static bool splitGlobal(GlobalVariable &GV) { // If the address of the global is taken outside of the module, we cannot // apply this transformation. if (!GV.hasLocalLinkage()) @@ -130,7 +138,7 @@ bool splitGlobal(GlobalVariable &GV) { return true; } -bool splitGlobals(Module &M) { +static bool splitGlobals(Module &M) { // First, see if the module uses either of the llvm.type.test or // llvm.type.checked.load intrinsics, which indicates that splitting globals // may be beneficial. @@ -151,12 +159,16 @@ bool splitGlobals(Module &M) { return Changed; } +namespace { + struct GlobalSplit : public ModulePass { static char ID; + GlobalSplit() : ModulePass(ID) { initializeGlobalSplitPass(*PassRegistry::getPassRegistry()); } - bool runOnModule(Module &M) { + + bool runOnModule(Module &M) override { if (skipModule(M)) return false; @@ -164,11 +176,12 @@ struct GlobalSplit : public ModulePass { } }; -} +} // end anonymous namespace -INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false) char GlobalSplit::ID = 0; +INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false) + ModulePass *llvm::createGlobalSplitPass() { return new GlobalSplit; } diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp index 5bb305ca84d0..d5d35ee89e0e 100644 --- a/lib/Transforms/IPO/IPO.cpp +++ b/lib/Transforms/IPO/IPO.cpp @@ -25,6 +25,7 @@ using namespace llvm; void llvm::initializeIPO(PassRegistry &Registry) { initializeArgPromotionPass(Registry); + initializeCalledValuePropagationLegacyPassPass(Registry); initializeConstantMergeLegacyPassPass(Registry); initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); @@ -67,6 +68,10 @@ void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createArgumentPromotionPass()); } +void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createCalledValuePropagationPass()); +} + void LLVMAddConstantMergePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createConstantMergePass()); } diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp index 15d7515cc842..470f97b8ba61 100644 --- a/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -8,7 +8,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/InferFunctionAttrs.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 50e7cc89a3b3..b259a0abd63c 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -21,7 +20,6 @@ #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Transforms/IPO.h" @@ -57,12 +55,23 @@ public: InlineCost getInlineCost(CallSite CS) override { Function *Callee = CS.getCalledFunction(); TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + + bool RemarksEnabled = false; + const auto &BBs = CS.getCaller()->getBasicBlockList(); + if (!BBs.empty()) { + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); + if (DI.isEnabled()) + RemarksEnabled = true; + } + OptimizationRemarkEmitter ORE(CS.getCaller()); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, - /*GetBFI=*/None, PSI); + /*GetBFI=*/None, PSI, + RemarksEnabled ? &ORE : nullptr); } bool runOnSCC(CallGraphSCC &SCC) override; diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 317770d133b3..4449c87ddefa 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -14,29 +14,60 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/Inliner.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> +#include <cassert> +#include <functional> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "inline" @@ -63,13 +94,16 @@ static cl::opt<bool> cl::init(false), cl::Hidden); namespace { + enum class InlinerFunctionImportStatsOpts { No = 0, Basic = 1, Verbose = 2, }; -cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats( +} // end anonymous namespace + +static cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats( "inliner-function-import-stats", cl::init(InlinerFunctionImportStatsOpts::No), cl::values(clEnumValN(InlinerFunctionImportStatsOpts::Basic, "basic", @@ -77,10 +111,8 @@ cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats( clEnumValN(InlinerFunctionImportStatsOpts::Verbose, "verbose", "printing of statistics for each inlined function")), cl::Hidden, cl::desc("Enable inliner stats for imported functions")); -} // namespace -LegacyInlinerBase::LegacyInlinerBase(char &ID) - : CallGraphSCCPass(ID), InsertLifetime(true) {} +LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {} LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime) : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {} @@ -96,7 +128,7 @@ void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const { CallGraphSCCPass::getAnalysisUsage(AU); } -typedef DenseMap<ArrayType *, std::vector<AllocaInst *>> InlinedArrayAllocasTy; +using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>; /// Look at all of the allocas that we inlined through this call site. If we /// have already inlined other allocas through other calls into this function, @@ -161,7 +193,6 @@ static void mergeInlinedArrayAllocas( // function. Also, AllocasForType can be empty of course! bool MergedAwayAlloca = false; for (AllocaInst *AvailableAlloca : AllocasForType) { - unsigned Align1 = AI->getAlignment(), Align2 = AvailableAlloca->getAlignment(); @@ -267,7 +298,6 @@ static bool shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, int &TotalSecondaryCost, function_ref<InlineCost(CallSite CS)> GetInlineCost) { - // For now we only handle local or inline functions. if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage()) return false; @@ -335,11 +365,14 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, return false; } -/// Return true if the inliner should attempt to inline at the given CallSite. -static bool shouldInline(CallSite CS, - function_ref<InlineCost(CallSite CS)> GetInlineCost, - OptimizationRemarkEmitter &ORE) { +/// Return the cost only if the inliner should attempt to inline at the given +/// CallSite. If we return the cost, we will emit an optimisation remark later +/// using that cost, so we won't do so from this function. +static Optional<InlineCost> +shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost, + OptimizationRemarkEmitter &ORE) { using namespace ore; + InlineCost IC = GetInlineCost(CS); Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); @@ -348,32 +381,33 @@ static bool shouldInline(CallSite CS, if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) - << NV("Callee", Callee) - << " should always be inlined (cost=always)"); - return true; + return IC; } if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) << NV("Callee", Callee) << " not inlined into " << NV("Caller", Caller) - << " because it should never be inlined (cost=never)"); - return false; + << " because it should never be inlined (cost=never)"; + }); + return None; } if (!IC) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() - << ", thres=" << (IC.getCostDelta() + IC.getCost()) + << ", thres=" << IC.getThreshold() << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) << NV("Callee", Callee) << " not inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" - << NV("Cost", IC.getCost()) << ", threshold=" - << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); - return false; + << NV("Cost", IC.getCost()) + << ", threshold=" << NV("Threshold", IC.getThreshold()) << ")"; + }); + return None; } int TotalSecondaryCost = 0; @@ -381,23 +415,23 @@ static bool shouldInline(CallSite CS, DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", Call) << "Not inlining. Cost of inlining " << NV("Callee", Callee) << " increases the cost of inlining " << NV("Caller", Caller) - << " in other contexts"); - return false; + << " in other contexts"; + }); + + // IC does not bool() to false, so get an InlineCost that will. + // This will not be inspected to make an error message. + return None; } DEBUG(dbgs() << " Inlining: cost=" << IC.getCost() - << ", thres=" << (IC.getCostDelta() + IC.getCost()) + << ", thres=" << IC.getThreshold() << ", Call: " << *CS.getInstruction() << '\n'); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBeInlined", Call) - << NV("Callee", Callee) << " can be inlined into " - << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) - << " (threshold=" - << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); - return true; + return IC; } /// Return true if the specified inline history ID @@ -475,11 +509,14 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, if (Function *Callee = CS.getCalledFunction()) if (Callee->isDeclaration()) { using namespace ore; - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I) + + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I) << NV("Callee", Callee) << " will not be inlined into " << NV("Caller", CS.getCaller()) << " because its definition is unavailable" - << setIsVerbose()); + << setIsVerbose(); + }); continue; } @@ -545,9 +582,10 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, // just become a regular analysis dependency. OptimizationRemarkEmitter ORE(Caller); + Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE); // If the policy determines that we should inline this function, // delete the call instead. - if (!shouldInline(CS, GetInlineCost, ORE)) + if (!OIC) continue; // If this call site is dead and it is to a readonly function, we should @@ -562,26 +600,40 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, ++NumCallsDeleted; } else { // Get DebugLoc to report. CS will be invalid after Inliner. - DebugLoc DLoc = Instr->getDebugLoc(); + DebugLoc DLoc = CS->getDebugLoc(); BasicBlock *Block = CS.getParent(); // Attempt to inline the function. using namespace ore; + if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime, AARGetter, ImportedFunctionsStats)) { - ORE.emit( - OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) - << NV("Callee", Callee) << " will not be inlined into " - << NV("Caller", Caller)); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, + Block) + << NV("Callee", Callee) << " will not be inlined into " + << NV("Caller", Caller); + }); continue; } ++NumInlined; - // Report the inline decision. - ORE.emit(OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block) - << NV("Callee", Callee) << " inlined into " - << NV("Caller", Caller)); + ORE.emit([&]() { + bool AlwaysInline = OIC->isAlways(); + StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined"; + OptimizationRemark R(DEBUG_TYPE, RemarkName, DLoc, Block); + R << NV("Callee", Callee) << " inlined into "; + R << NV("Caller", Caller); + if (AlwaysInline) + R << " with cost=always"; + else { + R << " with cost=" << NV("Cost", OIC->getCost()); + R << " (threshold=" << NV("Threshold", OIC->getThreshold()); + R << ")"; + } + return R; + }); // If inlining this function gave us any new call sites, throw them // onto our worklist to process. They are useful inline candidates. @@ -601,7 +653,6 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() && // TODO: Can remove if in SCC now. !SCCFunctions.count(Callee) && - // The function may be apparently dead, but if there are indirect // callgraph references to the node, we cannot delete it yet, this // could invalidate the CGSCC iterator. @@ -840,6 +891,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, FunctionAnalysisManager &FAM = AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG) .getManager(); + + // Get the remarks emission analysis for the caller. + auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return FAM.getResult<AssumptionAnalysis>(F); @@ -852,12 +907,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, Function &Callee = *CS.getCalledFunction(); auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI}, - PSI); + PSI, &ORE); }; - // Get the remarks emission analysis for the caller. - auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - // Now process as many calls as we have within this caller in the sequnece. // We bail out as soon as the caller has to change so we can update the // call graph and prepare the context of that new caller. @@ -872,8 +924,22 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) continue; + // Check if this inlining may repeat breaking an SCC apart that has + // already been split once before. In that case, inlining here may + // trigger infinite inlining, much like is prevented within the inliner + // itself by the InlineHistory above, but spread across CGSCC iterations + // and thus hidden from the full inline history. + if (CG.lookupSCC(*CG.lookup(Callee)) == C && + UR.InlinedInternalEdges.count({&N, C})) { + DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node " + "previously split out of this SCC by inlining: " + << F.getName() << " -> " << Callee.getName() << "\n"); + continue; + } + + Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE); // Check whether we want to inline this callsite. - if (!shouldInline(CS, GetInlineCost, ORE)) + if (!OIC) continue; // Setup the data structure used to plumb customization into the @@ -883,11 +949,39 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())), &FAM.getResult<BlockFrequencyAnalysis>(Callee)); - if (!InlineFunction(CS, IFI)) + // Get DebugLoc to report. CS will be invalid after Inliner. + DebugLoc DLoc = CS->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + + using namespace ore; + + if (!InlineFunction(CS, IFI)) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) + << NV("Callee", &Callee) << " will not be inlined into " + << NV("Caller", &F); + }); continue; + } DidInline = true; InlinedCallees.insert(&Callee); + ORE.emit([&]() { + bool AlwaysInline = OIC->isAlways(); + StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined"; + OptimizationRemark R(DEBUG_TYPE, RemarkName, DLoc, Block); + R << NV("Callee", &Callee) << " inlined into "; + R << NV("Caller", &F); + if (AlwaysInline) + R << " with cost=always"; + else { + R << " with cost=" << NV("Cost", OIC->getCost()); + R << " (threshold=" << NV("Threshold", OIC->getThreshold()); + R << ")"; + } + return R; + }); + // Add any new callsites to defined functions to the worklist. if (!IFI.InlinedCallSites.empty()) { int NewHistoryID = InlineHistory.size(); @@ -949,17 +1043,38 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, for (LazyCallGraph::Edge &E : *CalleeN) RC->insertTrivialRefEdge(N, E.getNode()); } - InlinedCallees.clear(); // At this point, since we have made changes we have at least removed // a call instruction. However, in the process we do some incremental // simplification of the surrounding code. This simplification can // essentially do all of the same things as a function pass and we can // re-use the exact same logic for updating the call graph to reflect the - // change.. + // change. + LazyCallGraph::SCC *OldC = C; C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR); DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n"); RC = &C->getOuterRefSCC(); + + // If this causes an SCC to split apart into multiple smaller SCCs, there + // is a subtle risk we need to prepare for. Other transformations may + // expose an "infinite inlining" opportunity later, and because of the SCC + // mutation, we will revisit this function and potentially re-inline. If we + // do, and that re-inlining also has the potentially to mutate the SCC + // structure, the infinite inlining problem can manifest through infinite + // SCC splits and merges. To avoid this, we capture the originating caller + // node and the SCC containing the call edge. This is a slight over + // approximation of the possible inlining decisions that must be avoided, + // but is relatively efficient to store. + // FIXME: This seems like a very heavyweight way of retaining the inline + // history, we should look for a more efficient way of tracking it. + if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) { + return CG.lookupSCC(*CG.lookup(*Callee)) == OldC; + })) { + DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, " + "retaining this to avoid infinite inlining.\n"); + UR.InlinedInternalEdges.insert({&N, OldC}); + } + InlinedCallees.clear(); } // Now that we've finished inlining all of the calls across this SCC, delete @@ -976,8 +1091,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, FunctionAnalysisManager &FAM = AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG) .getManager(); - FAM.clear(*DeadF); - AM.clear(DeadC); + FAM.clear(*DeadF, DeadF->getName()); + AM.clear(DeadC, DeadC.getName()); auto &DeadRC = DeadC.getOuterRefSCC(); CG.removeDeadFunction(*DeadF); diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index c74b0a35e296..36b6bdba2cd0 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -80,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", // Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } -bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) { +bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipLoop(L)) return false; @@ -143,7 +143,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. - LI.markAsRemoved(L); + LPM.markLoopAsDeleted(*L); + LI.erase(L); } ++NumExtracted; } diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 693df5e7ba92..8db7e1e142d2 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1,4 +1,4 @@ -//===-- LowerTypeTests.cpp - type metadata lowering pass ------------------===// +//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===// // // The LLVM Compiler Infrastructure // @@ -13,32 +13,70 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/LowerTypeTests.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TypeMetadataUtils.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/TrailingObjects.h" +#include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> +#include <set> +#include <string> +#include <system_error> +#include <utility> +#include <vector> using namespace llvm; using namespace lowertypetests; @@ -197,6 +235,7 @@ struct ByteArrayInfo { uint64_t BitSize; GlobalVariable *ByteArray; GlobalVariable *MaskGlobal; + uint8_t *MaskPtr = nullptr; }; /// A POD-like structure that we use to store a global reference together with @@ -205,16 +244,19 @@ struct ByteArrayInfo { /// operation involving a map lookup; this data structure helps to reduce the /// number of times we need to do this lookup. class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { + friend TrailingObjects; + GlobalObject *GO; size_t NTypes; + // For functions: true if this is a definition (either in the merged module or // in one of the thinlto modules). bool IsDefinition; + // For functions: true if this function is either defined or used in a thinlto // module and its jumptable entry needs to be exported to thinlto backends. bool IsExported; - friend TrailingObjects; size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; } public: @@ -231,15 +273,19 @@ public: GTM->getTrailingObjects<MDNode *>()); return GTM; } + GlobalObject *getGlobal() const { return GO; } + bool isDefinition() const { return IsDefinition; } + bool isExported() const { return IsExported; } + ArrayRef<MDNode *> types() const { return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes); } @@ -258,6 +304,7 @@ class LowerTypeTestsModule { IntegerType *Int1Ty = Type::getInt1Ty(M.getContext()); IntegerType *Int8Ty = Type::getInt8Ty(M.getContext()); PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); + ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0); IntegerType *Int32Ty = Type::getInt32Ty(M.getContext()); PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty); IntegerType *Int64Ty = Type::getInt64Ty(M.getContext()); @@ -307,7 +354,8 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; - void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); + bool shouldExportConstantsAsAbsoluteSymbols(); + uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); void importFunction(Function *F, bool isDefinition); @@ -329,6 +377,7 @@ class LowerTypeTestsModule { unsigned getJumpTableEntrySize(); Type *getJumpTableEntryType(); void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS, + Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs, Function *Dest); void verifyTypeMDNode(GlobalObject *GO, MDNode *Type); void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds, @@ -350,6 +399,7 @@ class LowerTypeTestsModule { public: LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary); + bool lower(); // Lower the module using the action and summary passed as command line @@ -385,11 +435,12 @@ struct LowerTypeTests : public ModulePass { } }; -} // anonymous namespace +} // end anonymous namespace + +char LowerTypeTests::ID = 0; INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false, false) -char LowerTypeTests::ID = 0; ModulePass * llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary, @@ -473,6 +524,8 @@ void LowerTypeTestsModule::allocateByteArrays() { BAI->MaskGlobal->replaceAllUsesWith( ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy)); BAI->MaskGlobal->eraseFromParent(); + if (BAI->MaskPtr) + *BAI->MaskPtr = Mask; } Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes); @@ -634,6 +687,10 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, Br->getMetadata(LLVMContext::MD_prof)); ReplaceInstWithInst(InitialBB->getTerminator(), NewBr); + // Update phis in Else resulting from InitialBB being split + for (auto &Phi : Else->phis()) + Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB); + IRBuilder<> ThenB(CI); return createBitSetTest(ThenB, TIL, BitOffset); } @@ -720,13 +777,21 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( } } +bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() { + return (Arch == Triple::x86 || Arch == Triple::x86_64) && + ObjectFormat == Triple::ELF; +} + /// Export the given type identifier so that ThinLTO backends may import it. /// Type identifiers are exported by adding coarse-grained information about how /// to test the type identifier to the summary, and creating symbols in the /// object file (aliases and absolute symbols) containing fine-grained /// information about the type identifier. -void LowerTypeTestsModule::exportTypeId(StringRef TypeId, - const TypeIdLowering &TIL) { +/// +/// Returns a pointer to the location in which to store the bitmask, if +/// applicable. +uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId, + const TypeIdLowering &TIL) { TypeTestResolution &TTRes = ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes; TTRes.TheKind = TIL.TheKind; @@ -738,14 +803,21 @@ void LowerTypeTestsModule::exportTypeId(StringRef TypeId, GA->setVisibility(GlobalValue::HiddenVisibility); }; + auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) { + if (shouldExportConstantsAsAbsoluteSymbols()) + ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy)); + else + Storage = cast<ConstantInt>(C)->getZExtValue(); + }; + if (TIL.TheKind != TypeTestResolution::Unsat) ExportGlobal("global_addr", TIL.OffsetedGlobal); if (TIL.TheKind == TypeTestResolution::ByteArray || TIL.TheKind == TypeTestResolution::Inline || TIL.TheKind == TypeTestResolution::AllOnes) { - ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy)); - ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy)); + ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2); + ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1); uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1; if (TIL.TheKind == TypeTestResolution::Inline) @@ -756,12 +828,16 @@ void LowerTypeTestsModule::exportTypeId(StringRef TypeId, if (TIL.TheKind == TypeTestResolution::ByteArray) { ExportGlobal("byte_array", TIL.TheByteArray); - ExportGlobal("bit_mask", TIL.BitMask); + if (shouldExportConstantsAsAbsoluteSymbols()) + ExportGlobal("bit_mask", TIL.BitMask); + else + return &TTRes.BitMask; } if (TIL.TheKind == TypeTestResolution::Inline) - ExportGlobal("inline_bits", - ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy)); + ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits); + + return nullptr; } LowerTypeTestsModule::TypeIdLowering @@ -774,16 +850,34 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) { TypeIdLowering TIL; TIL.TheKind = TTRes.TheKind; - auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) { - Constant *C = - M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty); - auto *GV = dyn_cast<GlobalVariable>(C); - // We only need to set metadata if the global is newly created, in which - // case it would not have hidden visibility. - if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + auto ImportGlobal = [&](StringRef Name) { + // Give the global a type of length 0 so that it is not assumed not to alias + // with any other global. + Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), + Int8Arr0Ty); + if (auto *GV = dyn_cast<GlobalVariable>(C)) + GV->setVisibility(GlobalValue::HiddenVisibility); + C = ConstantExpr::getBitCast(C, Int8PtrTy); + return C; + }; + + auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth, + Type *Ty) { + if (!shouldExportConstantsAsAbsoluteSymbols()) { + Constant *C = + ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const); + if (!isa<IntegerType>(Ty)) + C = ConstantExpr::getIntToPtr(C, Ty); + return C; + } + + Constant *C = ImportGlobal(Name); + auto *GV = cast<GlobalVariable>(C->stripPointerCasts()); + if (isa<IntegerType>(Ty)) + C = ConstantExpr::getPtrToInt(C, Ty); + if (GV->getMetadata(LLVMContext::MD_absolute_symbol)) return C; - GV->setVisibility(GlobalValue::HiddenVisibility); auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); @@ -792,30 +886,30 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) { }; if (AbsWidth == IntPtrTy->getBitWidth()) SetAbsRange(~0ull, ~0ull); // Full set. - else if (AbsWidth) + else SetAbsRange(0, 1ull << AbsWidth); return C; }; if (TIL.TheKind != TypeTestResolution::Unsat) - TIL.OffsetedGlobal = ImportGlobal("global_addr", 0); + TIL.OffsetedGlobal = ImportGlobal("global_addr"); if (TIL.TheKind == TypeTestResolution::ByteArray || TIL.TheKind == TypeTestResolution::Inline || TIL.TheKind == TypeTestResolution::AllOnes) { - TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty); - TIL.SizeM1 = ConstantExpr::getPtrToInt( - ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy); + TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty); + TIL.SizeM1 = + ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy); } if (TIL.TheKind == TypeTestResolution::ByteArray) { - TIL.TheByteArray = ImportGlobal("byte_array", 0); - TIL.BitMask = ImportGlobal("bit_mask", 8); + TIL.TheByteArray = ImportGlobal("byte_array"); + TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy); } if (TIL.TheKind == TypeTestResolution::Inline) - TIL.InlineBits = ConstantExpr::getPtrToInt( - ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth), + TIL.InlineBits = ImportConstant( + "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth, TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty); return TIL; @@ -894,6 +988,7 @@ void LowerTypeTestsModule::lowerTypeTestCalls( BSI.print(dbgs()); }); + ByteArrayInfo *BAI = nullptr; TypeIdLowering TIL; TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), @@ -915,15 +1010,18 @@ void LowerTypeTestsModule::lowerTypeTestCalls( } else { TIL.TheKind = TypeTestResolution::ByteArray; ++NumByteArraysCreated; - ByteArrayInfo *BAI = createByteArray(BSI); + BAI = createByteArray(BSI); TIL.TheByteArray = BAI->ByteArray; TIL.BitMask = BAI->MaskGlobal; } TypeIdUserInfo &TIUI = TypeIdUsers[TypeId]; - if (TIUI.IsExported) - exportTypeId(cast<MDString>(TypeId)->getString(), TIL); + if (TIUI.IsExported) { + uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL); + if (BAI) + BAI->MaskPtr = MaskPtr; + } // Lower each call to llvm.type.test for this type identifier. for (CallInst *CI : TIUI.CallSites) { @@ -979,15 +1077,16 @@ unsigned LowerTypeTestsModule::getJumpTableEntrySize() { // constraints and arguments to AsmOS, ConstraintOS and AsmArgs. void LowerTypeTestsModule::createJumpTableEntry( raw_ostream &AsmOS, raw_ostream &ConstraintOS, - SmallVectorImpl<Value *> &AsmArgs, Function *Dest) { + Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs, + Function *Dest) { unsigned ArgIndex = AsmArgs.size(); - if (Arch == Triple::x86 || Arch == Triple::x86_64) { + if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) { AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n"; AsmOS << "int3\nint3\nint3\n"; - } else if (Arch == Triple::arm || Arch == Triple::aarch64) { + } else if (JumpTableArch == Triple::arm || JumpTableArch == Triple::aarch64) { AsmOS << "b $" << ArgIndex << "\n"; - } else if (Arch == Triple::thumb) { + } else if (JumpTableArch == Triple::thumb) { AsmOS << "b.w $" << ArgIndex << "\n"; } else { report_fatal_error("Unsupported architecture for jump tables"); @@ -1074,6 +1173,46 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( PlaceholderFn->eraseFromParent(); } +static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) { + Attribute TFAttr = F->getFnAttribute("target-features"); + if (!TFAttr.hasAttribute(Attribute::None)) { + SmallVector<StringRef, 6> Features; + TFAttr.getValueAsString().split(Features, ','); + for (StringRef Feature : Features) { + if (Feature == "-thumb-mode") + return false; + else if (Feature == "+thumb-mode") + return true; + } + } + + return ModuleArch == Triple::thumb; +} + +// Each jump table must be either ARM or Thumb as a whole for the bit-test math +// to work. Pick one that matches the majority of members to minimize interop +// veneers inserted by the linker. +static Triple::ArchType +selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions, + Triple::ArchType ModuleArch) { + if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb) + return ModuleArch; + + unsigned ArmCount = 0, ThumbCount = 0; + for (const auto GTM : Functions) { + if (!GTM->isDefinition()) { + // PLT stubs are always ARM. + ++ArmCount; + continue; + } + + Function *F = cast<Function>(GTM->getGlobal()); + ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount); + } + + return ArmCount > ThumbCount ? Triple::arm : Triple::thumb; +} + void LowerTypeTestsModule::createJumpTable( Function *F, ArrayRef<GlobalTypeMember *> Functions) { std::string AsmStr, ConstraintStr; @@ -1081,8 +1220,10 @@ void LowerTypeTestsModule::createJumpTable( SmallVector<Value *, 16> AsmArgs; AsmArgs.reserve(Functions.size() * 2); + Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch); + for (unsigned I = 0; I != Functions.size(); ++I) - createJumpTableEntry(AsmOS, ConstraintOS, AsmArgs, + createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs, cast<Function>(Functions[I]->getGlobal())); // Try to emit the jump table at the end of the text segment. @@ -1098,11 +1239,15 @@ void LowerTypeTestsModule::createJumpTable( // Luckily, this function does not get any prologue even without the // attribute. if (OS != Triple::Win32) - F->addFnAttr(llvm::Attribute::Naked); - // Thumb jump table assembly needs Thumb2. The following attribute is added by - // Clang for -march=armv7. - if (Arch == Triple::thumb) + F->addFnAttr(Attribute::Naked); + if (JumpTableArch == Triple::arm) + F->addFnAttr("target-features", "-thumb-mode"); + if (JumpTableArch == Triple::thumb) { + F->addFnAttr("target-features", "+thumb-mode"); + // Thumb jump table assembly needs Thumb2. The following attribute is added + // by Clang for -march=armv7. F->addFnAttr("target-cpu", "cortex-a8"); + } BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F); IRBuilder<> IRB(BB); @@ -1256,7 +1401,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( FAlias->takeName(F); if (FAlias->hasName()) F->setName(FAlias->getName() + ".cfi"); - F->replaceAllUsesWith(FAlias); + F->replaceUsesExceptBlockAddr(FAlias); } if (!F->isDeclarationForLinker()) F->setLinkage(GlobalValue::InternalLinkage); @@ -1303,7 +1448,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM( void LowerTypeTestsModule::buildBitSetsFromDisjointSet( ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) { - llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices; + DenseMap<Metadata *, uint64_t> TypeIdIndices; for (unsigned I = 0; I != TypeIds.size(); ++I) TypeIdIndices[TypeIds[I]] = I; @@ -1335,38 +1480,25 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( for (auto &&MemSet : TypeMembers) GLB.addFragment(MemSet); - // Build the bitsets from this disjoint set. - if (Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal())) { - // Build a vector of global variables with the computed layout. - std::vector<GlobalTypeMember *> OrderedGVs(Globals.size()); - auto OGI = OrderedGVs.begin(); - for (auto &&F : GLB.Fragments) { - for (auto &&Offset : F) { - auto GV = dyn_cast<GlobalVariable>(Globals[Offset]->getGlobal()); - if (!GV) - report_fatal_error("Type identifier may not contain both global " - "variables and functions"); - *OGI++ = Globals[Offset]; - } - } - - buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs); - } else { - // Build a vector of functions with the computed layout. - std::vector<GlobalTypeMember *> OrderedFns(Globals.size()); - auto OFI = OrderedFns.begin(); - for (auto &&F : GLB.Fragments) { - for (auto &&Offset : F) { - auto Fn = dyn_cast<Function>(Globals[Offset]->getGlobal()); - if (!Fn) - report_fatal_error("Type identifier may not contain both global " - "variables and functions"); - *OFI++ = Globals[Offset]; - } + // Build a vector of globals with the computed layout. + bool IsGlobalSet = + Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal()); + std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size()); + auto OGTMI = OrderedGTMs.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal())) + report_fatal_error("Type identifier may not contain both global " + "variables and functions"); + *OGTMI++ = Globals[Offset]; } - - buildBitSetsFromFunctions(TypeIds, OrderedFns); } + + // Build the bitsets from this disjoint set. + if (IsGlobalSet) + buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs); + else + buildBitSetsFromFunctions(TypeIds, OrderedGTMs); } /// Lower all type tests in this module. @@ -1457,8 +1589,8 @@ bool LowerTypeTestsModule::lower() { // Equivalence class set containing type identifiers and the globals that // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. - typedef EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>> - GlobalClassesTy; + using GlobalClassesTy = + EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>; GlobalClassesTy GlobalClasses; // Verify the type metadata and build a few data structures to let us @@ -1473,7 +1605,7 @@ bool LowerTypeTestsModule::lower() { unsigned Index; std::vector<GlobalTypeMember *> RefGlobals; }; - llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo; + DenseMap<Metadata *, TIInfo> TypeIdInfo; unsigned I = 0; SmallVector<MDNode *, 2> Types; @@ -1512,14 +1644,27 @@ bool LowerTypeTestsModule::lower() { FunctionType::get(Type::getVoidTy(M.getContext()), false), GlobalVariable::ExternalLinkage, FunctionName, &M); - if (Linkage == CFL_Definition) - F->eraseMetadata(LLVMContext::MD_type); + // If the function is available_externally, remove its definition so + // that it is handled the same way as a declaration. Later we will try + // to create an alias using this function's linkage, which will fail if + // the linkage is available_externally. This will also result in us + // following the code path below to replace the type metadata. + if (F->hasAvailableExternallyLinkage()) { + F->setLinkage(GlobalValue::ExternalLinkage); + F->deleteBody(); + F->setComdat(nullptr); + F->clearMetadata(); + } + // If the function in the full LTO module is a declaration, replace its + // type metadata with the type metadata we found in cfi.functions. That + // metadata is presumed to be more accurate than the metadata attached + // to the declaration. if (F->isDeclaration()) { if (Linkage == CFL_WeakDeclaration) F->setLinkage(GlobalValue::ExternalWeakLinkage); - SmallVector<MDNode *, 2> Types; + F->eraseMetadata(LLVMContext::MD_type); for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I) F->addMetadata(LLVMContext::MD_type, *cast<MDNode>(FuncMD->getOperand(I).get())); @@ -1548,7 +1693,7 @@ bool LowerTypeTestsModule::lower() { GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types); for (MDNode *Type : Types) { verifyTypeMDNode(&GO, Type); - auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)]; + auto &Info = TypeIdInfo[Type->getOperand(1)]; Info.Index = ++I; Info.RefGlobals.push_back(GTM); } @@ -1596,12 +1741,12 @@ bool LowerTypeTestsModule::lower() { for (auto &P : *ExportSummary) { for (auto &S : P.second.SummaryList) { - auto *FS = dyn_cast<FunctionSummary>(S.get()); - if (!FS || !ExportSummary->isGlobalValueLive(FS)) + if (!ExportSummary->isGlobalValueLive(S.get())) continue; - for (GlobalValue::GUID G : FS->type_tests()) - for (Metadata *MD : MetadataByGUID[G]) - AddTypeIdUse(MD).IsExported = true; + if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject())) + for (GlobalValue::GUID G : FS->type_tests()) + for (Metadata *MD : MetadataByGUID[G]) + AddTypeIdUse(MD).IsExported = true; } } } diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 0e478ba607be..76b90391fbb1 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -89,28 +89,45 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/FunctionComparator.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <set> +#include <utility> #include <vector> using namespace llvm; @@ -119,7 +136,6 @@ using namespace llvm; STATISTIC(NumFunctionsMerged, "Number of functions merged"); STATISTIC(NumThunksWritten, "Number of thunks generated"); -STATISTIC(NumAliasesWritten, "Number of aliases generated"); STATISTIC(NumDoubleWeak, "Number of new functions created"); static cl::opt<unsigned> NumFunctionsForSanityCheck( @@ -154,10 +170,12 @@ namespace { class FunctionNode { mutable AssertingVH<Function> F; FunctionComparator::FunctionHash Hash; + public: // Note the hash is recalculated potentially multiple times, but it is cheap. FunctionNode(Function *F) : F(F), Hash(FunctionComparator::functionHash(*F)) {} + Function *getFunc() const { return F; } FunctionComparator::FunctionHash getHash() const { return Hash; } @@ -174,13 +192,12 @@ public: /// by considering all pointer types to be equivalent. Once identified, /// MergeFunctions will fold them by replacing a call to one to a call to a /// bitcast of the other. -/// class MergeFunctions : public ModulePass { public: static char ID; + MergeFunctions() - : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(), - HasGlobalAliases(false) { + : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)) { initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); } @@ -191,8 +208,10 @@ private: // not need to become larger with another pointer. class FunctionNodeCmp { GlobalNumberState* GlobalNumbers; + public: FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {} + bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const { // Order first by hashes, then full function comparison. if (LHS.getHash() != RHS.getHash()) @@ -201,7 +220,7 @@ private: return FCmp.compare() == -1; } }; - typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType; + using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>; GlobalNumberState GlobalNumbers; @@ -209,9 +228,9 @@ private: /// analyzed again. std::vector<WeakTrackingVH> Deferred; +#ifndef NDEBUG /// Checks the rules of order relation introduced among functions set. /// Returns true, if sanity check has been passed, and false if failed. -#ifndef NDEBUG bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist); #endif @@ -236,9 +255,6 @@ private: /// again. void mergeTwoFunctions(Function *F, Function *G); - /// Replace G with a thunk or an alias to F. Deletes G. - void writeThunkOrAlias(Function *F, Function *G); - /// Fill PDIUnrelatedWL with instructions from the entry block that are /// unrelated to parameter related debug info. void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock, @@ -256,29 +272,25 @@ private: /// delete G. void writeThunk(Function *F, Function *G); - /// Replace G with an alias to F. Deletes G. - void writeAlias(Function *F, Function *G); - /// Replace function F with function G in the function tree. void replaceFunctionInTree(const FunctionNode &FN, Function *G); /// The set of all distinct functions. Use the insert() and remove() methods /// to modify it. The map allows efficient lookup and deferring of Functions. FnTreeType FnTree; + // Map functions to the iterators of the FunctionNode which contains them // in the FnTree. This must be updated carefully whenever the FnTree is // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid // dangling iterators into FnTree. The invariant that preserves this is that // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree. ValueMap<Function*, FnTreeType::iterator> FNodesInTree; - - /// Whether or not the target supports global aliases. - bool HasGlobalAliases; }; } // end anonymous namespace char MergeFunctions::ID = 0; + INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) ModulePass *llvm::createMergeFunctionsPass() { @@ -454,19 +466,6 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { } } -// Replace G with an alias to F if possible, or else a thunk to F. Deletes G. -void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { - if (HasGlobalAliases && G->hasGlobalUnnamedAddr()) { - if (G->hasExternalLinkage() || G->hasLocalLinkage() || - G->hasWeakLinkage()) { - writeAlias(F, G); - return; - } - } - - writeThunk(F, G); -} - // Helper for writeThunk, // Selects proper bitcast operation, // but a bit simpler then CastInst::getCastOpcode. @@ -499,7 +498,6 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { // parameter debug info, from the entry block. void MergeFunctions::eraseInstsUnrelatedToPDI( std::vector<Instruction *> &PDIUnrelatedWL) { - DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in " "entry block) unrelated to parameter debug info from entry " "block: {\n"); @@ -517,7 +515,6 @@ void MergeFunctions::eraseInstsUnrelatedToPDI( // Reduce G to its entry block. void MergeFunctions::eraseTail(Function *G) { - std::vector<BasicBlock *> WorklistBB; for (Function::iterator BBI = std::next(G->begin()), BBE = G->end(); BBI != BBE; ++BBI) { @@ -542,7 +539,6 @@ void MergeFunctions::eraseTail(Function *G) { // PDIUnrelatedWL with such instructions. void MergeFunctions::filterInstsUnrelatedToPDI( BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) { - std::set<Instruction *> PDIRelated; for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end(); BI != BIE; ++BI) { @@ -652,9 +648,18 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // call sites to point to F even when within the same translation unit. void MergeFunctions::writeThunk(Function *F, Function *G) { if (!G->isInterposable() && !MergeFunctionsPDI) { - // Redirect direct callers of G to F. (See note on MergeFunctionsPDI - // above). - replaceDirectCallers(G, F); + if (G->hasGlobalUnnamedAddr()) { + // G might have been a key in our GlobalNumberState, and it's illegal + // to replace a key in ValueMap<GlobalValue *> with a non-global. + GlobalNumbers.erase(G); + // If G's address is not significant, replace it entirely. + Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); + G->replaceAllUsesWith(BitcastF); + } else { + // Redirect direct callers of G to F. (See note on MergeFunctionsPDI + // above). + replaceDirectCallers(G, F); + } } // If G was internal then we may have replaced all uses of G with F. If so, @@ -665,6 +670,16 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { return; } + // Don't merge tiny functions using a thunk, since it can just end up + // making the function larger. + if (F->size() == 1) { + if (F->front().size() <= 2) { + DEBUG(dbgs() << "writeThunk: " << F->getName() + << " is too small to bother creating a thunk for\n"); + return; + } + } + BasicBlock *GEntryBlock = nullptr; std::vector<Instruction *> PDIUnrelatedWL; BasicBlock *BB = nullptr; @@ -691,7 +706,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Argument & AI : H->args()) { + for (Argument &AI : H->args()) { Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } @@ -734,20 +749,6 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { ++NumThunksWritten; } -// Replace G with an alias to F and delete G. -void MergeFunctions::writeAlias(Function *F, Function *G) { - auto *GA = GlobalAlias::create(G->getLinkage(), "", F); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); - GA->takeName(G); - GA->setVisibility(G->getVisibility()); - removeUsers(G); - G->replaceAllUsesWith(GA); - G->eraseFromParent(); - - DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n'); - ++NumAliasesWritten; -} - // Merge two equivalent functions. Upon completion, Function G is deleted. void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { if (F->isInterposable()) { @@ -763,19 +764,14 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment()); - if (HasGlobalAliases) { - writeAlias(F, G); - writeAlias(F, H); - } else { - writeThunk(F, G); - writeThunk(F, H); - } + writeThunk(F, G); + writeThunk(F, H); F->setAlignment(MaxAlignment); F->setLinkage(GlobalValue::PrivateLinkage); ++NumDoubleWeak; } else { - writeThunkOrAlias(F, G); + writeThunk(F, G); } ++NumFunctionsMerged; @@ -816,18 +812,6 @@ bool MergeFunctions::insert(Function *NewFunction) { const FunctionNode &OldF = *Result.first; - // Don't merge tiny functions, since it can just end up making the function - // larger. - // FIXME: Should still merge them if they are unnamed_addr and produce an - // alias. - if (NewFunction->size() == 1) { - if (NewFunction->front().size() <= 2) { - DEBUG(dbgs() << NewFunction->getName() - << " is to small to bother merging\n"); - return false; - } - } - // Impose a total order (by name) on the replacement of functions. This is // important when operating on more than one module independently to prevent // cycles of thunks calling each other when the modules are linked together. diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 8840435af642..683655f1f68b 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -13,45 +13,126 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/PartialInlining.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/User.h" #include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <functional> +#include <iterator> +#include <memory> +#include <tuple> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "partial-inlining" STATISTIC(NumPartialInlined, "Number of callsites functions partially inlined into."); +STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " + "cold outlined regions were partially " + "inlined into its caller(s)."); +STATISTIC(NumColdRegionsFound, + "Number of cold single entry/exit regions found."); +STATISTIC(NumColdRegionsOutlined, + "Number of cold single entry/exit regions outlined."); // Command line option to disable partial-inlining. The default is false: static cl::opt<bool> DisablePartialInlining("disable-partial-inlining", cl::init(false), - cl::Hidden, cl::desc("Disable partial ininling")); + cl::Hidden, cl::desc("Disable partial inlining")); +// Command line option to disable multi-region partial-inlining. The default is +// false: +static cl::opt<bool> DisableMultiRegionPartialInline( + "disable-mr-partial-inlining", cl::init(false), cl::Hidden, + cl::desc("Disable multi-region partial inlining")); + +// Command line option to force outlining in regions with live exit variables. +// The default is false: +static cl::opt<bool> + ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, + cl::desc("Force outline regions with live exits")); + +// Command line option to enable marking outline functions with Cold Calling +// Convention. The default is false: +static cl::opt<bool> + MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, + cl::desc("Mark outline function calls with ColdCC")); + +#ifndef NDEBUG +// Command line option to debug partial-inlining. The default is none: +static cl::opt<bool> TracePartialInlining("trace-partial-inlining", + cl::init(false), cl::Hidden, + cl::desc("Trace partial inlining.")); +#endif + // This is an option used by testing: static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::desc("Skip Cost Analysis")); +// Used to determine if a cold region is worth outlining based on +// its inlining cost compared to the original function. Default is set at 10%. +// ie. if the cold region reduces the inlining cost of the original function by +// at least 10%. +static cl::opt<float> MinRegionSizeRatio( + "min-region-size-ratio", cl::init(0.1), cl::Hidden, + cl::desc("Minimum ratio comparing relative sizes of each " + "outline candidate and original function")); +// Used to tune the minimum number of execution counts needed in the predecessor +// block to the cold edge. ie. confidence interval. +static cl::opt<unsigned> + MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, + cl::desc("Minimum block executions to consider " + "its BranchProbabilityInfo valid")); +// Used to determine when an edge is considered cold. Default is set to 10%. ie. +// if the branch probability is 10% or less, then it is deemed as 'cold'. +static cl::opt<float> ColdBranchRatio( + "cold-branch-ratio", cl::init(0.1), cl::Hidden, + cl::desc("Minimum BranchProbability to consider a region cold.")); static cl::opt<unsigned> MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, - cl::desc("Max Number of Blocks To be Partially Inlined")); + cl::desc("Max number of blocks to be partially inlined")); // Command line option to set the maximum number of partial inlining allowed // for the module. The default value of -1 means no limit. @@ -75,9 +156,8 @@ static cl::opt<unsigned> ExtraOutliningPenalty( namespace { struct FunctionOutliningInfo { - FunctionOutliningInfo() - : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr), - ReturnBlockPreds() {} + FunctionOutliningInfo() = default; + // Returns the number of blocks to be inlined including all blocks // in Entries and one return block. unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } @@ -85,30 +165,69 @@ struct FunctionOutliningInfo { // A set of blocks including the function entry that guard // the region to be outlined. SmallVector<BasicBlock *, 4> Entries; + // The return block that is not included in the outlined region. - BasicBlock *ReturnBlock; + BasicBlock *ReturnBlock = nullptr; + // The dominating block of the region to be outlined. - BasicBlock *NonReturnBlock; + BasicBlock *NonReturnBlock = nullptr; + // The set of blocks in Entries that that are predecessors to ReturnBlock SmallVector<BasicBlock *, 4> ReturnBlockPreds; }; +struct FunctionOutliningMultiRegionInfo { + FunctionOutliningMultiRegionInfo() + : ORI() {} + + // Container for outline regions + struct OutlineRegionInfo { + OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region, + BasicBlock *EntryBlock, BasicBlock *ExitBlock, + BasicBlock *ReturnBlock) + : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock), + ReturnBlock(ReturnBlock) {} + SmallVector<BasicBlock *, 8> Region; + BasicBlock *EntryBlock; + BasicBlock *ExitBlock; + BasicBlock *ReturnBlock; + }; + + SmallVector<OutlineRegionInfo, 4> ORI; +}; + struct PartialInlinerImpl { + PartialInlinerImpl( std::function<AssumptionCache &(Function &)> *GetAC, std::function<TargetTransformInfo &(Function &)> *GTTI, Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, - ProfileSummaryInfo *ProfSI) - : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} + ProfileSummaryInfo *ProfSI, + std::function<OptimizationRemarkEmitter &(Function &)> *GORE) + : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI), + GetORE(GORE) {} + bool run(Module &M); - Function *unswitchFunction(Function *F); + // Main part of the transformation that calls helper functions to find + // outlining candidates, clone & outline the function, and attempt to + // partially inline the resulting function. Returns true if + // inlining was successful, false otherwise. Also returns the outline + // function (only if we partially inlined early returns) as there is a + // possibility to further "peel" early return statements that were left in the + // outline function due to code size. + std::pair<bool, Function *> unswitchFunction(Function *F); // This class speculatively clones the the function to be partial inlined. // At the end of partial inlining, the remaining callsites to the cloned // function that are not partially inlined will be fixed up to reference // the original function, and the cloned function will be erased. struct FunctionCloner { - FunctionCloner(Function *F, FunctionOutliningInfo *OI); + // Two constructors, one for single region outlining, the other for + // multi-region outlining. + FunctionCloner(Function *F, FunctionOutliningInfo *OI, + OptimizationRemarkEmitter &ORE); + FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, + OptimizationRemarkEmitter &ORE); ~FunctionCloner(); // Prepare for function outlining: making sure there is only @@ -116,20 +235,34 @@ struct PartialInlinerImpl { // the return block. void NormalizeReturnBlock(); - // Do function outlining: - Function *doFunctionOutlining(); + // Do function outlining for cold regions. + bool doMultiRegionFunctionOutlining(); + // Do function outlining for region after early return block(s). + // NOTE: For vararg functions that do the vararg handling in the outlined + // function, we temporarily generate IR that does not properly + // forward varargs to the outlined function. Calling InlineFunction + // will update calls to the outlined functions to properly forward + // the varargs. + Function *doSingleRegionFunctionOutlining(); Function *OrigFunc = nullptr; Function *ClonedFunc = nullptr; - Function *OutlinedFunc = nullptr; - BasicBlock *OutliningCallBB = nullptr; + + typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; + // Keep track of Outlined Functions and the basic block they're called from. + SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; + // ClonedFunc is inlined in one of its callers after function // outlining. bool IsFunctionInlined = false; // The cost of the region to be outlined. int OutlinedRegionCost = 0; + // ClonedOI is specific to outlining non-early return blocks. std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; + // ClonedOMRI is specific to outlining cold regions. + std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; + OptimizationRemarkEmitter &ORE; }; private: @@ -138,6 +271,7 @@ private: std::function<TargetTransformInfo &(Function &)> *GetTTI; Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; ProfileSummaryInfo *PSI; + std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; // Return the frequency of the OutlininingBB relative to F's entry point. // The result is no larger than 1 and is represented using BP. @@ -148,8 +282,7 @@ private: // Return true if the callee of CS should be partially inlined with // profit. bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, - BlockFrequency WeightedOutliningRcost, - OptimizationRemarkEmitter &ORE); + BlockFrequency WeightedOutliningRcost); // Try to inline DuplicateFunction (cloned from F with call to // the OutlinedFunction into its callers. Return true @@ -196,17 +329,20 @@ private: // - The second value is the estimated size of the new call sequence in // basic block Cloner.OutliningCallBB; std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); + // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to // approximate both the size and runtime cost (Note that in the current // inline cost analysis, there is no clear distinction there either). static int computeBBInlineCost(BasicBlock *BB); std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); - + std::unique_ptr<FunctionOutliningMultiRegionInfo> + computeOutliningColdRegionsInfo(Function *F); }; struct PartialInlinerLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid + PartialInlinerLegacyPass() : ModulePass(ID) { initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -216,6 +352,7 @@ struct PartialInlinerLegacyPass : public ModulePass { AU.addRequired<ProfileSummaryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } + bool runOnModule(Module &M) override { if (skipModule(M)) return false; @@ -225,6 +362,7 @@ struct PartialInlinerLegacyPass : public ModulePass { &getAnalysis<TargetTransformInfoWrapperPass>(); ProfileSummaryInfo *PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + std::unique_ptr<OptimizationRemarkEmitter> UPORE; std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { @@ -236,9 +374,185 @@ struct PartialInlinerLegacyPass : public ModulePass { return TTIWP->getTTI(F); }; - return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); + std::function<OptimizationRemarkEmitter &(Function &)> GetORE = + [&UPORE](Function &F) -> OptimizationRemarkEmitter & { + UPORE.reset(new OptimizationRemarkEmitter(&F)); + return *UPORE.get(); + }; + + return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI, + &GetORE) + .run(M); } }; + +} // end anonymous namespace + +std::unique_ptr<FunctionOutliningMultiRegionInfo> +PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) { + BasicBlock *EntryBlock = &F->front(); + + DominatorTree DT(*F); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*F, LI); + std::unique_ptr<BlockFrequencyInfo> ScopedBFI; + BlockFrequencyInfo *BFI; + if (!GetBFI) { + ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); + BFI = ScopedBFI.get(); + } else + BFI = &(*GetBFI)(*F); + + auto &ORE = (*GetORE)(*F); + + // Return if we don't have profiling information. + if (!PSI->hasInstrumentationProfile()) + return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); + + std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = + llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + + auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { + BasicBlock *Dom = BlockList.front(); + return BlockList.size() > 1 && + std::distance(pred_begin(Dom), pred_end(Dom)) == 1; + }; + + auto IsSingleExit = + [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { + BasicBlock *ExitBlock = nullptr; + for (auto *Block : BlockList) { + for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { + if (!is_contained(BlockList, *SI)) { + if (ExitBlock) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", + &SI->front()) + << "Region dominated by " + << ore::NV("Block", BlockList.front()->getName()) + << " has more than one region exit edge."; + }); + return nullptr; + } else + ExitBlock = Block; + } + } + } + return ExitBlock; + }; + + auto BBProfileCount = [BFI](BasicBlock *BB) { + return BFI->getBlockProfileCount(BB) + ? BFI->getBlockProfileCount(BB).getValue() + : 0; + }; + + // Use the same computeBBInlineCost function to compute the cost savings of + // the outlining the candidate region. + int OverallFunctionCost = 0; + for (auto &BB : *F) + OverallFunctionCost += computeBBInlineCost(&BB); + +#ifndef NDEBUG + if (TracePartialInlining) + dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; +#endif + int MinOutlineRegionCost = + static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); + BranchProbability MinBranchProbability( + static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), + MinBlockCounterExecution); + bool ColdCandidateFound = false; + BasicBlock *CurrEntry = EntryBlock; + std::vector<BasicBlock *> DFS; + DenseMap<BasicBlock *, bool> VisitedMap; + DFS.push_back(CurrEntry); + VisitedMap[CurrEntry] = true; + // Use Depth First Search on the basic blocks to find CFG edges that are + // considered cold. + // Cold regions considered must also have its inline cost compared to the + // overall inline cost of the original function. The region is outlined only + // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or + // more. + while (!DFS.empty()) { + auto *thisBB = DFS.back(); + DFS.pop_back(); + // Only consider regions with predecessor blocks that are considered + // not-cold (default: part of the top 99.99% of all block counters) + // AND greater than our minimum block execution count (default: 100). + if (PSI->isColdBB(thisBB, BFI) || + BBProfileCount(thisBB) < MinBlockCounterExecution) + continue; + for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { + if (VisitedMap[*SI]) + continue; + VisitedMap[*SI] = true; + DFS.push_back(*SI); + // If branch isn't cold, we skip to the next one. + BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); + if (SuccProb > MinBranchProbability) + continue; +#ifndef NDEBUG + if (TracePartialInlining) { + dbgs() << "Found cold edge: " << thisBB->getName() << "->" + << (*SI)->getName() << "\nBranch Probability = " << SuccProb + << "\n"; + } +#endif + SmallVector<BasicBlock *, 8> DominateVector; + DT.getDescendants(*SI, DominateVector); + // We can only outline single entry regions (for now). + if (!IsSingleEntry(DominateVector)) + continue; + BasicBlock *ExitBlock = nullptr; + // We can only outline single exit regions (for now). + if (!(ExitBlock = IsSingleExit(DominateVector))) + continue; + int OutlineRegionCost = 0; + for (auto *BB : DominateVector) + OutlineRegionCost += computeBBInlineCost(BB); + +#ifndef NDEBUG + if (TracePartialInlining) + dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; +#endif + + if (OutlineRegionCost < MinOutlineRegionCost) { + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", + &SI->front()) + << ore::NV("Callee", F) << " inline cost-savings smaller than " + << ore::NV("Cost", MinOutlineRegionCost); + }); + continue; + } + // For now, ignore blocks that belong to a SISE region that is a + // candidate for outlining. In the future, we may want to look + // at inner regions because the outer region may have live-exit + // variables. + for (auto *BB : DominateVector) + VisitedMap[BB] = true; + // ReturnBlock here means the block after the outline call + BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); + // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( + DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); + RegInfo.Region = DominateVector; + OutliningInfo->ORI.push_back(RegInfo); +#ifndef NDEBUG + if (TracePartialInlining) { + dbgs() << "Found Cold Candidate starting at block: " + << DominateVector.front()->getName() << "\n"; + } +#endif + ColdCandidateFound = true; + NumColdRegionsFound++; + } + } + if (ColdCandidateFound) + return OutliningInfo; + else + return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); } std::unique_ptr<FunctionOutliningInfo> @@ -319,7 +633,6 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { OutliningInfo->Entries.push_back(CurrEntry); CurrEntry = OtherSucc; - } while (true); if (!CandidateFound) @@ -413,15 +726,19 @@ static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { - + BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; auto EntryFreq = Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); auto OutliningCallFreq = - Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); - - auto OutlineRegionRelFreq = - BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), - EntryFreq.getFrequency()); + Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); + // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE + // we outlined any regions, so we may encounter situations where the + // OutliningCallFreq is *slightly* bigger than the EntryFreq. + if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { + OutliningCallFreq = EntryFreq; + } + auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( + OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) return OutlineRegionRelFreq; @@ -448,10 +765,10 @@ PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { } bool PartialInlinerImpl::shouldPartialInline( - CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, - OptimizationRemarkEmitter &ORE) { - + CallSite CS, FunctionCloner &Cloner, + BlockFrequency WeightedOutliningRcost) { using namespace ore; + if (SkipCostAnalysis) return true; @@ -461,30 +778,37 @@ bool PartialInlinerImpl::shouldPartialInline( Function *Caller = CS.getCaller(); auto &CalleeTTI = (*GetTTI)(*Callee); + auto &ORE = (*GetORE)(*Caller); InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, - *GetAssumptionCache, GetBFI, PSI); + *GetAssumptionCache, GetBFI, PSI, &ORE); if (IC.isAlways()) { - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) << NV("Callee", Cloner.OrigFunc) - << " should always be fully inlined, not partially"); + << " should always be fully inlined, not partially"; + }); return false; } if (IC.isNever()) { - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " << NV("Caller", Caller) - << " because it should never be inlined (cost=never)"); + << " because it should never be inlined (cost=never)"; + }); return false; } if (!IC) { - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" - << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); + << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; + }); return false; } const DataLayout &DL = Caller->getParent()->getDataLayout(); @@ -495,23 +819,28 @@ bool PartialInlinerImpl::shouldPartialInline( // Weighted saving is smaller than weighted cost, return false if (NormWeightedSavings < WeightedOutliningRcost) { - ORE.emit( - OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) - << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " - << NV("Caller", Caller) << " runtime overhead (overhead=" - << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) - << ", savings=" - << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" - << " of making the outlined call is too high"); + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", + Call) + << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " + << NV("Caller", Caller) << " runtime overhead (overhead=" + << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) + << ", savings=" + << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) + << ")" + << " of making the outlined call is too high"; + }); return false; } - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) << " (threshold=" - << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); + << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; + }); return true; } @@ -566,23 +895,26 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { std::tuple<int, int> PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { + int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; + for (auto FuncBBPair : Cloner.OutlinedFunctions) { + Function *OutlinedFunc = FuncBBPair.first; + BasicBlock* OutliningCallBB = FuncBBPair.second; + // Now compute the cost of the call sequence to the outlined function + // 'OutlinedFunction' in BB 'OutliningCallBB': + OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); - // Now compute the cost of the call sequence to the outlined function - // 'OutlinedFunction' in BB 'OutliningCallBB': - int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB); - - // Now compute the cost of the extracted/outlined function itself: - int OutlinedFunctionCost = 0; - for (BasicBlock &BB : *Cloner.OutlinedFunc) { - OutlinedFunctionCost += computeBBInlineCost(&BB); + // Now compute the cost of the extracted/outlined function itself: + for (BasicBlock &BB : *OutlinedFunc) + OutlinedFunctionCost += computeBBInlineCost(&BB); } - assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); + // The code extractor introduces a new root and exit stub blocks with // additional unconditional branches. Those branches will be eliminated // later with bb layout. The cost should be adjusted accordingly: - OutlinedFunctionCost -= 2 * InlineConstants::InstrCost; + OutlinedFunctionCost -= + 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); int OutliningRuntimeOverhead = OutliningFuncCallCost + @@ -636,9 +968,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( } } -PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F, - FunctionOutliningInfo *OI) - : OrigFunc(F) { +PartialInlinerImpl::FunctionCloner::FunctionCloner( + Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE) + : OrigFunc(F), ORE(ORE) { ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); // Clone the function, so that we can hack away on it. @@ -659,8 +991,39 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F, F->replaceAllUsesWith(ClonedFunc); } -void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { +PartialInlinerImpl::FunctionCloner::FunctionCloner( + Function *F, FunctionOutliningMultiRegionInfo *OI, + OptimizationRemarkEmitter &ORE) + : OrigFunc(F), ORE(ORE) { + ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + + // Clone the function, so that we can hack away on it. + ValueToValueMapTy VMap; + ClonedFunc = CloneFunction(F, VMap); + + // Go through all Outline Candidate Regions and update all BasicBlock + // information. + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : + OI->ORI) { + SmallVector<BasicBlock *, 8> Region; + for (BasicBlock *BB : RegionInfo.Region) { + Region.push_back(cast<BasicBlock>(VMap[BB])); + } + BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); + BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); + BasicBlock *NewReturnBlock = nullptr; + if (RegionInfo.ReturnBlock) + NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( + Region, NewEntryBlock, NewExitBlock, NewReturnBlock); + ClonedOMRI->ORI.push_back(MappedRegionInfo); + } + // Go ahead and update all uses to the duplicate, so that we can just + // use the inliner functionality when we're done hacking. + F->replaceAllUsesWith(ClonedFunc); +} +void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { auto getFirstPHI = [](BasicBlock *BB) { BasicBlock::iterator I = BB->begin(); PHINode *FirstPhi = nullptr; @@ -676,6 +1039,11 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { return FirstPhi; }; + // Shouldn't need to normalize PHIs if we're not outlining non-early return + // blocks. + if (!ClonedOI) + return; + // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some @@ -726,16 +1094,90 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { DeadPhis.push_back(OldPhi); } ++I; - } - for (auto *DP : DeadPhis) - DP->eraseFromParent(); + } + for (auto *DP : DeadPhis) + DP->eraseFromParent(); + + for (auto E : ClonedOI->ReturnBlockPreds) { + E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + } +} + +bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { + + auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) { + int Cost = 0; + for (BasicBlock* BB : Region) + Cost += computeBBInlineCost(BB); + return Cost; + }; + + assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); + + if (ClonedOMRI->ORI.empty()) + return false; + + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); - for (auto E : ClonedOI->ReturnBlockPreds) { - E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + SetVector<Value *> Inputs, Outputs, Sinks; + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : + ClonedOMRI->ORI) { + int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); + + CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, + ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false); + + CE.findInputsOutputs(Inputs, Outputs, Sinks); + +#ifndef NDEBUG + if (TracePartialInlining) { + dbgs() << "inputs: " << Inputs.size() << "\n"; + dbgs() << "outputs: " << Outputs.size() << "\n"; + for (Value *value : Inputs) + dbgs() << "value used in func: " << *value << "\n"; + for (Value *output : Outputs) + dbgs() << "instr used in func: " << *output << "\n"; } +#endif + // Do not extract regions that have live exit variables. + if (Outputs.size() > 0 && !ForceLiveExit) + continue; + + Function *OutlinedFunc = CE.extractCodeRegion(); + + if (OutlinedFunc) { + CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); + BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); + assert(OutliningCallBB->getParent() == ClonedFunc); + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); + NumColdRegionsOutlined++; + OutlinedRegionCost += CurrentOutlinedRegionCost; + + if (MarkOutlinedColdCC) { + OutlinedFunc->setCallingConv(CallingConv::Cold); + OCS.setCallingConv(CallingConv::Cold); + } + } else + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", + &RegionInfo.Region.front()->front()) + << "Failed to extract region at block " + << ore::NV("Block", RegionInfo.Region.front()); + }); + } + + return !OutlinedFunctions.empty(); } -Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { +Function * +PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // Returns true if the block is to be partial inlined into the caller // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { @@ -744,6 +1186,16 @@ Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { ClonedOI->Entries.end()); }; + assert(ClonedOI && "Expecting OutlineInfo for single region outline"); + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Gather up the blocks that we're going to extract. std::vector<BasicBlock *> ToExtract; ToExtract.push_back(ClonedOI->NonReturnBlock); @@ -759,26 +1211,27 @@ Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { OutlinedRegionCost += computeBBInlineCost(&BB); } - // The CodeExtractor needs a dominator tree. - DominatorTree DT; - DT.recalculate(*ClonedFunc); - - // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. - LoopInfo LI(DT); - BranchProbabilityInfo BPI(*ClonedFunc, LI); - ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); - // Extract the body of the if. - OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, - ClonedFuncBFI.get(), &BPI) - .extractCodeRegion(); + Function *OutlinedFunc = + CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, + ClonedFuncBFI.get(), &BPI, + /* AllowVarargs */ true) + .extractCodeRegion(); if (OutlinedFunc) { - OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) - .getInstruction() - ->getParent(); + BasicBlock *OutliningCallBB = + PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) + .getInstruction() + ->getParent(); assert(OutliningCallBB->getParent() == ClonedFunc); - } + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); + } else + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", + &ToExtract.front()->front()) + << "Failed to extract region at block " + << ore::NV("Block", ToExtract.front()); + }); return OutlinedFunc; } @@ -789,76 +1242,133 @@ PartialInlinerImpl::FunctionCloner::~FunctionCloner() { ClonedFunc->replaceAllUsesWith(OrigFunc); ClonedFunc->eraseFromParent(); if (!IsFunctionInlined) { - // Remove the function that is speculatively created if there is no + // Remove each function that was speculatively created if there is no // reference. - if (OutlinedFunc) - OutlinedFunc->eraseFromParent(); + for (auto FuncBBPair : OutlinedFunctions) { + Function *Func = FuncBBPair.first; + Func->eraseFromParent(); + } } } -Function *PartialInlinerImpl::unswitchFunction(Function *F) { +std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { if (F->hasAddressTaken()) - return nullptr; + return {false, nullptr}; // Let inliner handle it if (F->hasFnAttribute(Attribute::AlwaysInline)) - return nullptr; + return {false, nullptr}; if (F->hasFnAttribute(Attribute::NoInline)) - return nullptr; + return {false, nullptr}; if (PSI->isFunctionEntryCold(F)) - return nullptr; + return {false, nullptr}; if (F->user_begin() == F->user_end()) - return nullptr; + return {false, nullptr}; - std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); + auto &ORE = (*GetORE)(*F); + + // Only try to outline cold regions if we have a profile summary, which + // implies we have profiling information. + if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && + !DisableMultiRegionPartialInline) { + std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = + computeOutliningColdRegionsInfo(F); + if (OMRI) { + FunctionCloner Cloner(F, OMRI.get(), ORE); + +#ifndef NDEBUG + if (TracePartialInlining) { + dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; + dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() + << "\n"; + } +#endif + bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); + + if (DidOutline) { +#ifndef NDEBUG + if (TracePartialInlining) { + dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; + Cloner.ClonedFunc->print(dbgs()); + dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; + } +#endif + if (tryPartialInline(Cloner)) + return {true, nullptr}; + } + } + } + + // Fall-thru to regular partial inlining if we: + // i) can't find any cold regions to outline, or + // ii) can't inline the outlined function anywhere. + std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); if (!OI) - return nullptr; + return {false, nullptr}; - FunctionCloner Cloner(F, OI.get()); + FunctionCloner Cloner(F, OI.get(), ORE); Cloner.NormalizeReturnBlock(); - Function *OutlinedFunction = Cloner.doFunctionOutlining(); + + Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); + + if (!OutlinedFunction) + return {false, nullptr}; bool AnyInline = tryPartialInline(Cloner); if (AnyInline) - return OutlinedFunction; + return {true, OutlinedFunction}; - return nullptr; + return {false, nullptr}; } bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { - int NonWeightedRcost; - int SizeCost; - - if (Cloner.OutlinedFunc == nullptr) + if (Cloner.OutlinedFunctions.empty()) return false; + int SizeCost = 0; + BlockFrequency WeightedRcost; + int NonWeightedRcost; std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); - auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); - auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; + // Only calculate RelativeToEntryFreq when we are doing single region + // outlining. + BranchProbability RelativeToEntryFreq; + if (Cloner.ClonedOI) { + RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); + } else + // RelativeToEntryFreq doesn't make sense when we have more than one + // outlined call because each call will have a different relative frequency + // to the entry block. We can consider using the average, but the + // usefulness of that information is questionable. For now, assume we never + // execute the calls to outlined functions. + RelativeToEntryFreq = BranchProbability(0, 1); + + WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; - // The call sequence to the outlined function is larger than the original - // outlined region size, it does not increase the chances of inlining - // the function with outlining (The inliner usies the size increase to + // The call sequence(s) to the outlined function(s) are larger than the sum of + // the original outlined region size(s), it does not increase the chances of + // inlining the function with outlining (The inliner uses the size increase to // model the cost of inlining a callee). if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { - OptimizationRemarkEmitter ORE(Cloner.OrigFunc); + auto &ORE = (*GetORE)(*Cloner.OrigFunc); DebugLoc DLoc; BasicBlock *Block; std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", + ORE.emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", DLoc, Block) << ore::NV("Function", Cloner.OrigFunc) << " not partially inlined into callers (Original Size = " << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) << ", Size of call sequence to outlined function = " - << ore::NV("NewSize", SizeCost) << ")"); + << ore::NV("NewSize", SizeCost) << ")"; + }); return false; } @@ -882,18 +1392,26 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (IsLimitReached()) continue; - OptimizationRemarkEmitter ORE(CS.getCaller()); - if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE)) + if (!shouldPartialInline(CS, Cloner, WeightedRcost)) continue; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) - << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " - << ore::NV("Caller", CS.getCaller())); + auto &ORE = (*GetORE)(*CS.getCaller()); + // Construct remark before doing the inlining, as after successful inlining + // the callsite is removed. + OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()); + OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " + << ore::NV("Caller", CS.getCaller()); InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); - InlineFunction(CS, IFI); + // We can only forward varargs when we outlined a single region, else we + // bail on vararg functions. + if (!InlineFunction(CS, IFI, nullptr, true, + (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first + : nullptr))) + continue; + + ORE.emit(OR); // Now update the entry count: if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { @@ -904,13 +1422,23 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { AnyInline = true; NumPartialInlining++; // Update the stats - NumPartialInlined++; + if (Cloner.ClonedOI) + NumPartialInlined++; + else + NumColdOutlinePartialInlined++; + } if (AnyInline) { Cloner.IsFunctionInlined = true; if (CalleeEntryCount) Cloner.OrigFunc->setEntryCount(CalleeEntryCountV); + auto &ORE = (*GetORE)(*Cloner.OrigFunc); + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) + << "Partially inlined into at least one caller"; + }); + } return AnyInline; @@ -944,8 +1472,10 @@ bool PartialInlinerImpl::run(Module &M) { if (Recursive) continue; - if (Function *NewFunc = unswitchFunction(CurrFunc)) { - Worklist.push_back(NewFunc); + std::pair<bool, Function * > Result = unswitchFunction(CurrFunc); + if (Result.second) + Worklist.push_back(Result.second); + if (Result.first) { Changed = true; } } @@ -954,6 +1484,7 @@ bool PartialInlinerImpl::run(Module &M) { } char PartialInlinerLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -985,9 +1516,15 @@ PreservedAnalyses PartialInlinerPass::run(Module &M, return FAM.getResult<TargetIRAnalysis>(F); }; + std::function<OptimizationRemarkEmitter &(Function &)> GetORE = + [&FAM](Function &F) -> OptimizationRemarkEmitter & { + return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + }; + ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); - if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M)) + if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE) + .run(M)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 0b319f6a488b..3855e6245d8e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -26,11 +26,9 @@ #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" @@ -94,15 +92,6 @@ static cl::opt<bool> EnableLoopInterchange( "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); -static cl::opt<bool> EnableNonLTOGlobalsModRef( - "enable-non-lto-gmr", cl::init(true), cl::Hidden, - cl::desc( - "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); - -static cl::opt<bool> EnableLoopLoadElim( - "enable-loop-load-elim", cl::init(true), cl::Hidden, - cl::desc("Enable the LoopLoadElimination Pass")); - static cl::opt<bool> EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -160,7 +149,6 @@ PassManagerBuilder::PassManagerBuilder() { SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; - DisableUnitAtATime = false; DisableUnrollLoops = false; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; @@ -251,6 +239,7 @@ void PassManagerBuilder::addInstructionCombiningPass( void PassManagerBuilder::populateFunctionPassManager( legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); + FPM.add(createEntryExitInstrumenterPass()); // Add LibraryInfo if we have some. if (LibraryInfo) @@ -428,6 +417,14 @@ void PassManagerBuilder::populateModulePassManager( else if (GlobalExtensionsNotEmpty() || !Extensions.empty()) MPM.add(createBarrierNoopPass()); + if (PerformThinLTO) { + // Drop available_externally and unreferenced globals. This is necessary + // with ThinLTO in order to avoid leaving undefined references to dead + // globals in the object file. + MPM.add(createEliminateAvailableExternallyPass()); + MPM.add(createGlobalDCEPass()); + } + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); // Rename anon globals to be able to export them in the summary. @@ -464,23 +461,25 @@ void PassManagerBuilder::populateModulePassManager( if (PrepareForThinLTOUsingPGOSampleProfile) DisableUnrollLoops = true; - if (!DisableUnitAtATime) { - // Infer attributes about declarations if possible. - MPM.add(createInferFunctionAttrsLegacyPass()); + // Infer attributes about declarations if possible. + MPM.add(createInferFunctionAttrsLegacyPass()); - addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); - MPM.add(createIPSCCPPass()); // IP SCCP - MPM.add(createGlobalOptimizerPass()); // Optimize out global vars - // Promote any localized global vars. - MPM.add(createPromoteMemoryToRegisterPass()); + if (OptLevel > 2) + MPM.add(createCallSiteSplittingPass()); - MPM.add(createDeadArgEliminationPass()); // Dead argument elimination + MPM.add(createIPSCCPPass()); // IP SCCP + MPM.add(createCalledValuePropagationPass()); + MPM.add(createGlobalOptimizerPass()); // Optimize out global vars + // Promote any localized global vars. + MPM.add(createPromoteMemoryToRegisterPass()); - addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE - addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE - } + MPM.add(createDeadArgEliminationPass()); // Dead argument elimination + + addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE + addExtensionsToPM(EP_Peephole, MPM); + MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE // For SamplePGO in ThinLTO compile phase, we do not want to do indirect // call promotion as it will change the CFG too much to make the 2nd @@ -490,21 +489,21 @@ void PassManagerBuilder::populateModulePassManager( if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); - if (EnableNonLTOGlobalsModRef) - // We add a module alias analysis pass here. In part due to bugs in the - // analysis infrastructure this "works" in that the analysis stays alive - // for the entire SCC pass run below. - MPM.add(createGlobalsAAWrapperPass()); + // We add a module alias analysis pass here. In part due to bugs in the + // analysis infrastructure this "works" in that the analysis stays alive + // for the entire SCC pass run below. + MPM.add(createGlobalsAAWrapperPass()); // Start of CallGraph SCC passes. - if (!DisableUnitAtATime) - MPM.add(createPruneEHPass()); // Remove dead EH info + MPM.add(createPruneEHPass()); // Remove dead EH info + bool RunInliner = false; if (Inliner) { MPM.add(Inliner); Inliner = nullptr; + RunInliner = true; } - if (!DisableUnitAtATime) - MPM.add(createPostOrderFunctionAttrsLegacyPass()); + + MPM.add(createPostOrderFunctionAttrsLegacyPass()); if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -515,11 +514,11 @@ void PassManagerBuilder::populateModulePassManager( // pass manager that we are specifically trying to avoid. To prevent this // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (RunPartialInlining) MPM.add(createPartialInliningPass()); - if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO && - !PrepareForThinLTO) + if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO) // Remove avail extern fns and globals definitions if we aren't // compiling an object file for later LTO. For LTO we want to preserve // these so they are eligible for inlining at link-time. Note if they @@ -531,15 +530,26 @@ void PassManagerBuilder::populateModulePassManager( // and saves running remaining passes on the eliminated functions. MPM.add(createEliminateAvailableExternallyPass()); - if (!DisableUnitAtATime) - MPM.add(createReversePostOrderFunctionAttrsPass()); + MPM.add(createReversePostOrderFunctionAttrsPass()); + + // The inliner performs some kind of dead code elimination as it goes, + // but there are cases that are not really caught by it. We might + // at some point consider teaching the inliner about them, but it + // is OK for now to run GlobalOpt + GlobalDCE in tandem as their + // benefits generally outweight the cost, making the whole pipeline + // faster. + if (RunInliner) { + MPM.add(createGlobalOptimizerPass()); + MPM.add(createGlobalDCEPass()); + } // If we are planning to perform ThinLTO later, let's not bloat the code with // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes // during ThinLTO and perform the rest of the optimizations afterward. if (PrepareForThinLTO) { - // Reduce the size of the IR as much as possible. - MPM.add(createGlobalOptimizerPass()); + // Ensure we perform any last passes, but do so before renaming anonymous + // globals in case the passes add any. + addExtensionsToPM(EP_OptimizerLast, MPM); // Rename anon globals to be able to export them in the summary. MPM.add(createNameAnonGlobalPass()); return; @@ -560,23 +570,22 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLICMPass()); // Hoist loop invariants } - if (EnableNonLTOGlobalsModRef) - // We add a fresh GlobalsModRef run at this point. This is particularly - // useful as the above will have inlined, DCE'ed, and function-attr - // propagated everything. We should at this point have a reasonably minimal - // and richly annotated call graph. By computing aliasing and mod/ref - // information for all local globals here, the late loop passes and notably - // the vectorizer will be able to use them to help recognize vectorizable - // memory operations. - // - // Note that this relies on a bug in the pass manager which preserves - // a module analysis into a function pass pipeline (and throughout it) so - // long as the first function pass doesn't invalidate the module analysis. - // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for - // this to work. Fortunately, it is trivial to preserve AliasAnalysis - // (doing nothing preserves it as it is required to be conservatively - // correct in the face of IR changes). - MPM.add(createGlobalsAAWrapperPass()); + // We add a fresh GlobalsModRef run at this point. This is particularly + // useful as the above will have inlined, DCE'ed, and function-attr + // propagated everything. We should at this point have a reasonably minimal + // and richly annotated call graph. By computing aliasing and mod/ref + // information for all local globals here, the late loop passes and notably + // the vectorizer will be able to use them to help recognize vectorizable + // memory operations. + // + // Note that this relies on a bug in the pass manager which preserves + // a module analysis into a function pass pipeline (and throughout it) so + // long as the first function pass doesn't invalidate the module analysis. + // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for + // this to work. Fortunately, it is trivial to preserve AliasAnalysis + // (doing nothing preserves it as it is required to be conservatively + // correct in the face of IR changes). + MPM.add(createGlobalsAAWrapperPass()); MPM.add(createFloat2IntPass()); @@ -597,8 +606,7 @@ void PassManagerBuilder::populateModulePassManager( // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. - if (EnableLoopLoadElim) - MPM.add(createLoopLoadEliminationPass()); + MPM.add(createLoopLoadEliminationPass()); // FIXME: Because of #pragma vectorize enable, the passes below are always // inserted in the pipeline, even when the vectorizer doesn't run (ex. when @@ -622,6 +630,13 @@ void PassManagerBuilder::populateModulePassManager( addInstructionCombiningPass(MPM); } + // Cleanup after loop vectorization, etc. Simplification passes like CVP and + // GVN, loop transforms, and others have already run, so it's now better to + // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. + MPM.add(createCFGSimplificationPass(1, true, true, false, true)); + if (RunSLPAfterLoopVectorization && SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. if (OptLevel > 1 && ExtraVectorizerPasses) { @@ -630,7 +645,6 @@ void PassManagerBuilder::populateModulePassManager( } addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables addInstructionCombiningPass(MPM); if (!DisableUnrollLoops) { @@ -650,16 +664,14 @@ void PassManagerBuilder::populateModulePassManager( // about pointer alignments. MPM.add(createAlignmentFromAssumptionsPass()); - if (!DisableUnitAtATime) { - // FIXME: We shouldn't bother with this anymore. - MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes + // FIXME: We shouldn't bother with this anymore. + MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes - // GlobalOpt already deletes dead functions and globals, at -O2 try a - // late pass of GlobalDCE. It is capable of deleting dead cycles. - if (OptLevel > 1) { - MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. - MPM.add(createConstantMergePass()); // Merge dup global constants - } + // GlobalOpt already deletes dead functions and globals, at -O2 try a + // late pass of GlobalDCE. It is capable of deleting dead cycles. + if (OptLevel > 1) { + MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. + MPM.add(createConstantMergePass()); // Merge dup global constants } if (MergeFunctions) @@ -673,6 +685,11 @@ void PassManagerBuilder::populateModulePassManager( // Get rid of LCSSA nodes. MPM.add(createInstructionSimplifierPass()); + // This hoists/decomposes div/rem ops. It should run after other sink/hoist + // passes to avoid re-sinking, but before SimplifyCFG because it can allow + // flattening of blocks. + MPM.add(createDivRemPairsPass()); + // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. MPM.add(createCFGSimplificationPass()); @@ -695,6 +712,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { + // Split call-site with more constrained arguments. + PM.add(createCallSiteSplittingPass()); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should @@ -706,6 +726,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. PM.add(createIPSCCPPass()); + + // Attach metadata to indirect call sites indicating the set of functions + // they may target at run-time. This should follow IPSCCP. + PM.add(createCalledValuePropagationPass()); } // Infer attributes about definitions. The readnone attribute in particular is @@ -936,8 +960,7 @@ LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB, void LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB, LLVMBool Value) { - PassManagerBuilder *Builder = unwrap(PMB); - Builder->DisableUnitAtATime = Value; + // NOTE: The DisableUnitAtATime switch has been removed. } void diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 3fd59847a005..46b088189040 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -24,7 +24,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 6baada2c1ae1..f0e781b9d923 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -23,39 +23,65 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/SampleProfile.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/SampleProf.h" #include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/Format.h" +#include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include <cctype> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <functional> +#include <limits> +#include <map> +#include <memory> +#include <string> +#include <system_error> +#include <utility> +#include <vector> using namespace llvm; using namespace sampleprof; @@ -67,34 +93,39 @@ using namespace sampleprof; static cl::opt<std::string> SampleProfileFile( "sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); + static cl::opt<unsigned> SampleProfileMaxPropagateIterations( "sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG.")); + static cl::opt<unsigned> SampleProfileRecordCoverage( "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of records in the input profile " "are matched to the IR.")); + static cl::opt<unsigned> SampleProfileSampleCoverage( "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), cl::desc("Emit a warning if less than N% of samples in the input profile " "are matched to the IR.")); + static cl::opt<double> SampleProfileHotThreshold( "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), cl::desc("Inlined functions that account for more than N% of all samples " "collected in the parent function, will be inlined again.")); namespace { -typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; -typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; -typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; -typedef DenseMap<Edge, uint64_t> EdgeWeightMap; -typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> - BlockEdgeMap; + +using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; +using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; +using Edge = std::pair<const BasicBlock *, const BasicBlock *>; +using EdgeWeightMap = DenseMap<Edge, uint64_t>; +using BlockEdgeMap = + DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; class SampleCoverageTracker { public: - SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {} + SampleCoverageTracker() = default; bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); @@ -103,15 +134,16 @@ public: unsigned countBodyRecords(const FunctionSamples *FS) const; uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } uint64_t countBodySamples(const FunctionSamples *FS) const; + void clear() { SampleCoverage.clear(); TotalUsedSamples = 0; } private: - typedef std::map<LineLocation, unsigned> BodySampleCoverageMap; - typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap> - FunctionSamplesCoverageMap; + using BodySampleCoverageMap = std::map<LineLocation, unsigned>; + using FunctionSamplesCoverageMap = + DenseMap<const FunctionSamples *, BodySampleCoverageMap>; /// Coverage map for sampling records. /// @@ -135,7 +167,7 @@ private: /// and all the inlined callsites. Strictly, we should have a map of counters /// keyed by FunctionSamples pointers, but these stats are cleared after /// every function, so we just need to keep a single counter. - uint64_t TotalUsedSamples; + uint64_t TotalUsedSamples = 0; }; /// \brief Sample profile pass. @@ -145,29 +177,31 @@ private: /// profile information found in that file. class SampleProfileLoader { public: - SampleProfileLoader(StringRef Name = SampleProfileFile) - : DT(nullptr), PDT(nullptr), LI(nullptr), ACT(nullptr), Reader(), - Samples(nullptr), Filename(Name), ProfileIsValid(false), - TotalCollectedSamples(0) {} + SampleProfileLoader( + StringRef Name, bool IsThinLTOPreLink, + std::function<AssumptionCache &(Function &)> GetAssumptionCache, + std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) + : GetAC(GetAssumptionCache), GetTTI(GetTargetTransformInfo), + Filename(Name), IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); - bool runOnModule(Module &M); - void setACT(AssumptionCacheTracker *A) { ACT = A; } + bool runOnModule(Module &M, ModuleAnalysisManager *AM); void dump() { Reader->dump(); } protected: - bool runOnFunction(Function &F); + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); ErrorOr<uint64_t> getInstWeight(const Instruction &I); ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; std::vector<const FunctionSamples *> - findIndirectCallFunctionSamples(const Instruction &I) const; + findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; const FunctionSamples *findFunctionSamples(const Instruction &I) const; + bool inlineCallInstruction(Instruction *I); bool inlineHotFunctions(Function &F, - DenseSet<GlobalValue::GUID> &ImportGUIDs); + DenseSet<GlobalValue::GUID> &InlinedGUIDs); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -222,7 +256,8 @@ protected: std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT; std::unique_ptr<LoopInfo> LI; - AssumptionCacheTracker *ACT; + std::function<AssumptionCache &(Function &)> GetAC; + std::function<TargetTransformInfo &(Function &)> GetTTI; /// \brief Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -236,19 +271,28 @@ protected: std::unique_ptr<SampleProfileReader> Reader; /// \brief Samples collected for the body of this function. - FunctionSamples *Samples; + FunctionSamples *Samples = nullptr; /// \brief Name of the profile file to load. std::string Filename; /// \brief Flag indicating whether the profile input loaded successfully. - bool ProfileIsValid; + bool ProfileIsValid = false; + + /// \brief Flag indicating if the pass is invoked in ThinLTO compile phase. + /// + /// In this phase, in annotation, we should not promote indirect calls. + /// Instead, we will mark GUIDs that needs to be annotated to the function. + bool IsThinLTOPreLink; /// \brief Total number of samples collected in this profile. /// /// This is the sum of all the samples collected in all the functions executed /// at runtime. - uint64_t TotalCollectedSamples; + uint64_t TotalCollectedSamples = 0; + + /// \brief Optimization Remark Emitter used to emit diagnostic remarks. + OptimizationRemarkEmitter *ORE = nullptr; }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -256,8 +300,15 @@ public: // Class identification, replacement for typeinfo static char ID; - SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile) - : ModulePass(ID), SampleLoader(Name) { + SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile, + bool IsThinLTOPreLink = false) + : ModulePass(ID), SampleLoader(Name, IsThinLTOPreLink, + [&](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }, + [&](Function &F) -> TargetTransformInfo & { + return TTIWP->getTTI(F); + }) { initializeSampleProfileLoaderLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -267,17 +318,23 @@ public: bool doInitialization(Module &M) override { return SampleLoader.doInitialization(M); } + StringRef getPassName() const override { return "Sample profile pass"; } bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: SampleProfileLoader SampleLoader; + AssumptionCacheTracker *ACT = nullptr; + TargetTransformInfoWrapperPass *TTIWP = nullptr; }; +} // end anonymous namespace + /// Return true if the given callsite is hot wrt to its caller. /// /// Functions that were inlined in the original binary will be represented @@ -292,8 +349,8 @@ private: /// /// If that fraction is larger than the default given by /// SampleProfileHotThreshold, the callsite will be inlined again. -bool callsiteIsHot(const FunctionSamples *CallerFS, - const FunctionSamples *CallsiteFS) { +static bool callsiteIsHot(const FunctionSamples *CallerFS, + const FunctionSamples *CallsiteFS) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. @@ -309,7 +366,6 @@ bool callsiteIsHot(const FunctionSamples *CallerFS, (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; return PercentSamples >= SampleProfileHotThreshold; } -} /// Mark as used the sample record for the given function samples at /// (LineOffset, Discriminator). @@ -423,6 +479,7 @@ unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const { 0xffff; } +#ifndef NDEBUG /// \brief Print the weight of edge \p E on stream \p OS. /// /// \param OS Stream to emit the output to. @@ -453,6 +510,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, uint64_t W = (I == BlockWeights.end() ? 0 : I->second); OS << "weight[" << BB->getName() << "]: " << W << "\n"; } +#endif /// \brief Get the weight for an instruction. /// @@ -480,10 +538,12 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst)) return std::error_code(); - // If a call/invoke instruction is inlined in profile, but not inlined here, + // If a direct call/invoke instruction is inlined in profile + // (findCalleeFunctionSamples returns non-empty result), but not inlined here, // it means that the inlined callsite has no sample, thus the call // instruction should have 0 count. if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) && + !ImmutableCallSite(&Inst).isIndirectCall() && findCalleeFunctionSamples(Inst)) return 0; @@ -495,13 +555,18 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { bool FirstMark = CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); if (FirstMark) { - const Function *F = Inst.getParent()->getParent(); - LLVMContext &Ctx = F->getContext(); - emitOptimizationRemark( - Ctx, DEBUG_TYPE, *F, DLoc, - Twine("Applied ") + Twine(*R) + - " samples from profile (offset: " + Twine(LineOffset) + - ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")"); + ORE->emit([&]() { + OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst); + Remark << "Applied " << ore::NV("NumSamples", *R); + Remark << " samples from profile (offset: "; + Remark << ore::NV("LineOffset", LineOffset); + if (Discriminator) { + Remark << "."; + Remark << ore::NV("Discriminator", Discriminator); + } + Remark << ")"; + return Remark; + }); } DEBUG(dbgs() << " " << DLoc.getLine() << "." << DIL->getBaseDiscriminator() << ":" << Inst @@ -588,10 +653,11 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { } /// Returns a vector of FunctionSamples that are the indirect call targets -/// of \p Inst. The vector is sorted by the total number of samples. +/// of \p Inst. The vector is sorted by the total number of samples. Stores +/// the total call count of the indirect call in \p Sum. std::vector<const FunctionSamples *> SampleProfileLoader::findIndirectCallFunctionSamples( - const Instruction &Inst) const { + const Instruction &Inst, uint64_t &Sum) const { const DILocation *DIL = Inst.getDebugLoc(); std::vector<const FunctionSamples *> R; @@ -603,16 +669,25 @@ SampleProfileLoader::findIndirectCallFunctionSamples( if (FS == nullptr) return R; + uint32_t LineOffset = getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); + + auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); + Sum = 0; + if (T) + for (const auto &T_C : T.get()) + Sum += T_C.second; if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt( LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) { - if (M->size() == 0) + if (M->empty()) return R; for (const auto &NameFS : *M) { + Sum += NameFS.second.getEntrySamples(); R.push_back(&NameFS.second); } std::sort(R.begin(), R.end(), [](const FunctionSamples *L, const FunctionSamples *R) { - return L->getTotalSamples() > R->getTotalSamples(); + return L->getEntrySamples() > R->getEntrySamples(); }); } return R; @@ -650,6 +725,39 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { return FS; } +bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { + assert(isa<CallInst>(I) || isa<InvokeInst>(I)); + CallSite CS(I); + Function *CalledFunction = CS.getCalledFunction(); + assert(CalledFunction); + DebugLoc DLoc = I->getDebugLoc(); + BasicBlock *BB = I->getParent(); + InlineParams Params = getInlineParams(); + Params.ComputeFullInlineCost = true; + // Checks if there is anything in the reachable portion of the callee at + // this callsite that makes this inlining potentially illegal. Need to + // set ComputeFullInlineCost, otherwise getInlineCost may return early + // when cost exceeds threshold without checking all IRs in the callee. + // The acutal cost does not matter because we only checks isNever() to + // see if it is legal to inline the callsite. + InlineCost Cost = getInlineCost(CS, Params, GetTTI(*CalledFunction), GetAC, + None, nullptr, nullptr); + if (Cost.isNever()) { + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) + << "incompatible inlining"); + return false; + } + InlineFunctionInfo IFI(nullptr, &GetAC); + if (InlineFunction(CS, IFI)) { + // The call to InlineFunction erases I, so we can't pass it here. + ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) + << "inlined hot callee '" << ore::NV("Callee", CalledFunction) + << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); + return true; + } + return false; +} + /// \brief Iteratively inline hot callsites of a function. /// /// Iteratively traverse all callsites of the function \p F, and find if @@ -659,17 +767,14 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { /// it to direct call. Each indirect call is limited with a single target. /// /// \param F function to perform iterative inlining. -/// \param ImportGUIDs a set to be updated to include all GUIDs that come -/// from a different module but inlined in the profiled binary. +/// \param InlinedGUIDs a set to be updated to include all GUIDs that are +/// inlined in the profiled binary. /// /// \returns True if there is any inline happened. bool SampleProfileLoader::inlineHotFunctions( - Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) { + Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { DenseSet<Instruction *> PromotedInsns; bool Changed = false; - LLVMContext &Ctx = F.getContext(); - std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&]( - Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; while (true) { bool LocalChanged = false; SmallVector<Instruction *, 10> CIS; @@ -690,57 +795,59 @@ bool SampleProfileLoader::inlineHotFunctions( } } for (auto I : CIS) { - InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr); Function *CalledFunction = CallSite(I).getCalledFunction(); // Do not inline recursive calls. if (CalledFunction == &F) continue; - Instruction *DI = I; - if (!CalledFunction && !PromotedInsns.count(I) && - CallSite(I).isIndirectCall()) - for (const auto *FS : findIndirectCallFunctionSamples(*I)) { + if (CallSite(I).isIndirectCall()) { + if (PromotedInsns.count(I)) + continue; + uint64_t Sum; + for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { + if (IsThinLTOPreLink) { + FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), + Samples->getTotalSamples() * + SampleProfileHotThreshold / 100); + continue; + } auto CalleeFunctionName = FS->getName(); // If it is a recursive call, we do not inline it as it could bloat // the code exponentially. There is way to better handle this, e.g. // clone the caller first, and inline the cloned caller if it is - // recursive. As llvm does not inline recursive calls, we will simply - // ignore it instead of handling it explicitly. + // recursive. As llvm does not inline recursive calls, we will + // simply ignore it instead of handling it explicitly. if (CalleeFunctionName == F.getName()) continue; + const char *Reason = "Callee function not available"; auto R = SymbolMap.find(CalleeFunctionName); - if (R == SymbolMap.end()) - continue; - CalledFunction = R->getValue(); - if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) { - // The indirect target was promoted and inlined in the profile, as a - // result, we do not have profile info for the branch probability. - // We set the probability to 80% taken to indicate that the static - // call is likely taken. - DI = dyn_cast<Instruction>( - promoteIndirectCall(I, CalledFunction, 80, 100, false) - ->stripPointerCasts()); + if (R != SymbolMap.end() && R->getValue() && + !R->getValue()->isDeclaration() && + R->getValue()->getSubprogram() && + isLegalToPromote(CallSite(I), R->getValue(), &Reason)) { + uint64_t C = FS->getEntrySamples(); + Instruction *DI = + pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE); + Sum -= C; PromotedInsns.insert(I); + // If profile mismatches, we should not attempt to inline DI. + if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) && + inlineCallInstruction(DI)) + LocalChanged = true; } else { - DEBUG(dbgs() << "\nFailed to promote indirect call to " - << CalleeFunctionName << " because " << Reason - << "\n"); - continue; + DEBUG(dbgs() + << "\nFailed to promote indirect call to " + << CalleeFunctionName << " because " << Reason << "\n"); } } - if (!CalledFunction || !CalledFunction->getSubprogram()) { - findCalleeFunctionSamples(*I)->findImportedFunctions( - ImportGUIDs, F.getParent(), + } else if (CalledFunction && CalledFunction->getSubprogram() && + !CalledFunction->isDeclaration()) { + if (inlineCallInstruction(I)) + LocalChanged = true; + } else if (IsThinLTOPreLink) { + findCalleeFunctionSamples(*I)->findInlinedFunctions( + InlinedGUIDs, F.getParent(), Samples->getTotalSamples() * SampleProfileHotThreshold / 100); - continue; - } - DebugLoc DLoc = I->getDebugLoc(); - if (InlineFunction(CallSite(DI), IFI)) { - LocalChanged = true; - emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, - Twine("inlined hot callee '") + - CalledFunction->getName() + "' into '" + - F.getName() + "'"); } } if (LocalChanged) { @@ -1076,24 +1183,20 @@ void SampleProfileLoader::buildEdges(Function &F) { } } -/// Sorts the CallTargetMap \p M by count in descending order and stores the -/// sorted result in \p Sorted. Returns the total counts. -static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted, - const SampleRecord::CallTargetMap &M) { - Sorted.clear(); - uint64_t Sum = 0; - for (auto I = M.begin(); I != M.end(); ++I) { - Sum += I->getValue(); - Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()}); - } - std::sort(Sorted.begin(), Sorted.end(), +/// Returns the sorted CallTargetMap \p M by count in descending order. +static SmallVector<InstrProfValueData, 2> SortCallTargets( + const SampleRecord::CallTargetMap &M) { + SmallVector<InstrProfValueData, 2> R; + for (auto I = M.begin(); I != M.end(); ++I) + R.push_back({Function::getGUID(I->getKey()), I->getValue()}); + std::sort(R.begin(), R.end(), [](const InstrProfValueData &L, const InstrProfValueData &R) { if (L.Count == R.Count) return L.Value > R.Value; else return L.Count > R.Count; }); - return Sum; + return R; } /// \brief Propagate weights into edges @@ -1184,10 +1287,12 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (!FS) continue; auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); - if (!T || T.get().size() == 0) + if (!T || T.get().empty()) continue; - SmallVector<InstrProfValueData, 2> SortedCallTargets; - uint64_t Sum = SortCallTargets(SortedCallTargets, T.get()); + SmallVector<InstrProfValueData, 2> SortedCallTargets = + SortCallTargets(T.get()); + uint64_t Sum; + findIndirectCallFunctionSamples(I, Sum); annotateValueSite(*I.getParent()->getParent()->getParent(), I, SortedCallTargets, Sum, IPVK_IndirectCallTarget, SortedCallTargets.size()); @@ -1211,7 +1316,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { << ".\n"); SmallVector<uint32_t, 4> Weights; uint32_t MaxWeight = 0; - DebugLoc MaxDestLoc; + Instruction *MaxDestInst; for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); @@ -1230,7 +1335,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (Weight != 0) { if (Weight > MaxWeight) { MaxWeight = Weight; - MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc(); + MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); } } } @@ -1243,15 +1348,13 @@ void SampleProfileLoader::propagateWeights(Function &F) { // weights, the second pass does not need to set it. if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) { DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); - TI->setMetadata(llvm::LLVMContext::MD_prof, + TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); - emitOptimizationRemark( - Ctx, DEBUG_TYPE, F, MaxDestLoc, - Twine("most popular destination for conditional branches at ") + - ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" + - Twine(BranchLoc.getLine()) + ":" + - Twine(BranchLoc.getCol())) - : Twine("<UNKNOWN LOCATION>"))); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) + << "most popular destination for conditional branches at " + << ore::NV("CondBranchesLoc", BranchLoc); + }); } else { DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); } @@ -1351,18 +1454,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() << ": " << getFunctionLoc(F) << "\n"); - DenseSet<GlobalValue::GUID> ImportGUIDs; - Changed |= inlineHotFunctions(F, ImportGUIDs); + DenseSet<GlobalValue::GUID> InlinedGUIDs; + Changed |= inlineHotFunctions(F, InlinedGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { // Add an entry count to the function using the samples gathered at the - // function entry. Also sets the GUIDs that comes from a different - // module but inlined in the profiled binary. This is aiming at making - // the IR match the profiled binary before annotation. - F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs); + // function entry. + // Sets the GUIDs that are inlined in the profiled binary. This is used + // for ThinLink to make correct liveness analysis, and also make the IR + // match the profiled binary before annotation. + F.setEntryCount(Samples->getHeadSamples() + 1, &InlinedGUIDs); // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); @@ -1404,9 +1508,11 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { } char SampleProfileLoaderLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) @@ -1431,7 +1537,7 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoaderLegacyPass(Name); } -bool SampleProfileLoader::runOnModule(Module &M) { +bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) { if (!ProfileIsValid) return false; @@ -1463,7 +1569,7 @@ bool SampleProfileLoader::runOnModule(Module &M) { for (auto &F : M) if (!F.isDeclaration()) { clearFunctionData(); - retval |= runOnFunction(F); + retval |= runOnFunction(F, AM); } if (M.getProfileSummary() == nullptr) M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); @@ -1471,13 +1577,23 @@ bool SampleProfileLoader::runOnModule(Module &M) { } bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { - // FIXME: pass in AssumptionCache correctly for the new pass manager. - SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>()); - return SampleLoader.runOnModule(M); + ACT = &getAnalysis<AssumptionCacheTracker>(); + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); + return SampleLoader.runOnModule(M, nullptr); } -bool SampleProfileLoader::runOnFunction(Function &F) { +bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { F.setEntryCount(0); + std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; + if (AM) { + auto &FAM = + AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent()) + .getManager(); + ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + } else { + OwnedORE = make_unique<OptimizationRemarkEmitter>(&F); + ORE = OwnedORE.get(); + } Samples = Reader->getSamplesFor(F); if (Samples && !Samples->empty()) return emitAnnotations(F); @@ -1486,13 +1602,23 @@ bool SampleProfileLoader::runOnFunction(Function &F) { PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + auto GetTTI = [&](Function &F) -> TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; SampleProfileLoader SampleLoader( - ProfileFileName.empty() ? SampleProfileFile : ProfileFileName); + ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, + IsThinLTOPreLink, GetAssumptionCache, GetTTI); SampleLoader.doInitialization(M); - if (!SampleLoader.runOnModule(M)) + if (!SampleLoader.runOnModule(M, &AM)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 8ef6bb652309..945133074059 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -19,7 +19,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" @@ -40,9 +39,17 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId, continue; auto Name = ExportGV.getName(); - GlobalValue *ImportGV = ImportM.getNamedValue(Name); - if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV)) - continue; + GlobalValue *ImportGV = nullptr; + if (!PromoteExtra.count(&ExportGV)) { + ImportGV = ImportM.getNamedValue(Name); + if (!ImportGV) + continue; + ImportGV->removeDeadConstantUsers(); + if (ImportGV->use_empty()) { + ImportGV->eraseFromParent(); + continue; + } + } std::string NewName = (Name + ModuleId).str(); @@ -141,7 +148,9 @@ void simplifyExternals(Module &M) { continue; } - if (!F.isDeclaration() || F.getFunctionType() == EmptyFT) + if (!F.isDeclaration() || F.getFunctionType() == EmptyFT || + // Changing the type of an intrinsic may invalidate the IR. + F.getName().startswith("llvm.")) continue; Function *NewF = @@ -376,15 +385,14 @@ void splitAndWriteThinLTOBitcode( W.writeStrtab(); OS << Buffer; - // If a minimized bitcode module was requested for the thin link, - // strip the debug info (the merged module was already stripped above) - // and write it to the given OS. + // If a minimized bitcode module was requested for the thin link, only + // the information that is needed by thin link will be written in the + // given OS (the merged module will be written as usual). if (ThinLinkOS) { Buffer.clear(); BitcodeWriter W2(Buffer); StripDebugInfo(M); - W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, - /*GenerateHash=*/false, &ModHash); + W2.writeThinLinkBitcode(&M, Index, ModHash); W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, &MergedMIndex); W2.writeSymtab(); @@ -420,14 +428,11 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, ModuleHash ModHash = {{0}}; WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index, /*GenerateHash=*/true, &ModHash); - // If a minimized bitcode module was requested for the thin link, - // strip the debug info and write it to the given OS. - if (ThinLinkOS) { - StripDebugInfo(M); - WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false, - Index, - /*GenerateHash=*/false, &ModHash); - } + // If a minimized bitcode module was requested for the thin link, only + // the information that is needed by thin link will be written in the + // given OS. + if (ThinLinkOS && Index) + WriteThinLinkBitcodeToFile(&M, *ThinLinkOS, *Index, ModHash); } class WriteThinLTOBitcode : public ModulePass { diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index 00769cd63229..ec56f0cde25d 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -51,14 +51,13 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" @@ -275,18 +274,39 @@ struct VirtualCallSite { // of that field for details. unsigned *NumUnsafeUses; - void emitRemark(const Twine &OptName, const Twine &TargetName) { + void + emitRemark(const StringRef OptName, const StringRef TargetName, + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { Function *F = CS.getCaller(); - emitOptimizationRemark( - F->getContext(), DEBUG_TYPE, *F, - CS.getInstruction()->getDebugLoc(), - OptName + ": devirtualized a call to " + TargetName); + DebugLoc DLoc = CS->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + + // In the new pass manager, we can request the optimization + // remark emitter pass on a per-function-basis, which the + // OREGetter will do for us. + // In the old pass manager, this is harder, so we just build + // a optimization remark emitter on the fly, when we need it. + std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; + OptimizationRemarkEmitter *ORE; + if (OREGetter) + ORE = &OREGetter(F); + else { + OwnedORE = make_unique<OptimizationRemarkEmitter>(F); + ORE = OwnedORE.get(); + } + + using namespace ore; + ORE->emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block) + << NV("Optimization", OptName) << ": devirtualized a call to " + << NV("FunctionName", TargetName)); } - void replaceAndErase(const Twine &OptName, const Twine &TargetName, - bool RemarksEnabled, Value *New) { + void replaceAndErase( + const StringRef OptName, const StringRef TargetName, bool RemarksEnabled, + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, + Value *New) { if (RemarksEnabled) - emitRemark(OptName, TargetName); + emitRemark(OptName, TargetName, OREGetter); CS->replaceAllUsesWith(New); if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) { BranchInst::Create(II->getNormalDest(), CS.getInstruction()); @@ -383,6 +403,7 @@ struct DevirtModule { IntegerType *IntPtrTy; bool RemarksEnabled; + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; MapVector<VTableSlot, VTableSlotInfo> CallSlots; @@ -397,6 +418,7 @@ struct DevirtModule { std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter, + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary), @@ -405,7 +427,7 @@ struct DevirtModule { Int32Ty(Type::getInt32Ty(M.getContext())), Int64Ty(Type::getInt64Ty(M.getContext())), IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)), - RemarksEnabled(areRemarksEnabled()) { + RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) { assert(!(ExportSummary && ImportSummary)); } @@ -444,16 +466,23 @@ struct DevirtModule { std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name); + bool shouldExportConstantsAsAbsoluteSymbols(); + // This function is called during the export phase to create a symbol // definition containing information about the given vtable slot and list of // arguments. void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name, Constant *C); + void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name, + uint32_t Const, uint32_t &Storage); // This function is called during the import phase to create a reference to // the symbol definition created during the export phase. Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, - StringRef Name, unsigned AbsWidth = 0); + StringRef Name); + Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, IntegerType *IntTy, + uint32_t Storage); void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne, Constant *UniqueMemberAddr); @@ -482,8 +511,9 @@ struct DevirtModule { // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. - static bool runForTesting(Module &M, - function_ref<AAResults &(Function &)> AARGetter); + static bool runForTesting( + Module &M, function_ref<AAResults &(Function &)> AARGetter, + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); }; struct WholeProgramDevirt : public ModulePass { @@ -508,9 +538,14 @@ struct WholeProgramDevirt : public ModulePass { bool runOnModule(Module &M) override { if (skipModule(M)) return false; + + auto OREGetter = function_ref<OptimizationRemarkEmitter &(Function *)>(); + if (UseCommandLine) - return DevirtModule::runForTesting(M, LegacyAARGetter(*this)); - return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary) + return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter); + + return DevirtModule(M, LegacyAARGetter(*this), OREGetter, ExportSummary, + ImportSummary) .run(); } @@ -542,13 +577,17 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M, auto AARGetter = [&](Function &F) -> AAResults & { return FAM.getResult<AAManager>(F); }; - if (!DevirtModule(M, AARGetter, nullptr, nullptr).run()) + auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { + return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); + }; + if (!DevirtModule(M, AARGetter, OREGetter, nullptr, nullptr).run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } bool DevirtModule::runForTesting( - Module &M, function_ref<AAResults &(Function &)> AARGetter) { + Module &M, function_ref<AAResults &(Function &)> AARGetter, + function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { ModuleSummaryIndex Summary; // Handle the command-line summary arguments. This code is for testing @@ -566,7 +605,7 @@ bool DevirtModule::runForTesting( bool Changed = DevirtModule( - M, AARGetter, + M, AARGetter, OREGetter, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr) .run(); @@ -684,7 +723,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, auto Apply = [&](CallSiteInfo &CSInfo) { for (auto &&VCallSite : CSInfo.CallSites) { if (RemarksEnabled) - VCallSite.emitRemark("single-impl", TheFn->getName()); + VCallSite.emitRemark("single-impl", TheFn->getName(), OREGetter); VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast( TheFn, VCallSite.CS.getCalledValue()->getType())); // This use is no longer unsafe. @@ -724,9 +763,24 @@ bool DevirtModule::trySingleImplDevirt( // to make it visible to thin LTO objects. We can only get here during the // ThinLTO export phase. if (TheFn->hasLocalLinkage()) { + std::string NewName = (TheFn->getName() + "$merged").str(); + + // Since we are renaming the function, any comdats with the same name must + // also be renamed. This is required when targeting COFF, as the comdat name + // must match one of the names of the symbols in the comdat. + if (Comdat *C = TheFn->getComdat()) { + if (C->getName() == TheFn->getName()) { + Comdat *NewC = M.getOrInsertComdat(NewName); + NewC->setSelectionKind(C->getSelectionKind()); + for (GlobalObject &GO : M.global_objects()) + if (GO.getComdat() == C) + GO.setComdat(NewC); + } + } + TheFn->setLinkage(GlobalValue::ExternalLinkage); TheFn->setVisibility(GlobalValue::HiddenVisibility); - TheFn->setName(TheFn->getName() + "$merged"); + TheFn->setName(NewName); } Res->TheKind = WholeProgramDevirtResolution::SingleImpl; @@ -769,7 +823,7 @@ void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, uint64_t TheRetVal) { for (auto Call : CSInfo.CallSites) Call.replaceAndErase( - "uniform-ret-val", FnName, RemarksEnabled, + "uniform-ret-val", FnName, RemarksEnabled, OREGetter, ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal)); CSInfo.markDevirt(); } @@ -808,6 +862,12 @@ std::string DevirtModule::getGlobalName(VTableSlot Slot, return OS.str(); } +bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() { + Triple T(M.getTargetTriple()); + return (T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) && + T.getObjectFormat() == Triple::ELF; +} + void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name, Constant *C) { GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage, @@ -815,27 +875,55 @@ void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, GA->setVisibility(GlobalValue::HiddenVisibility); } +void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, uint32_t Const, + uint32_t &Storage) { + if (shouldExportConstantsAsAbsoluteSymbols()) { + exportGlobal( + Slot, Args, Name, + ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy)); + return; + } + + Storage = Const; +} + Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, - StringRef Name, unsigned AbsWidth) { + StringRef Name) { Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty); auto *GV = dyn_cast<GlobalVariable>(C); + if (GV) + GV->setVisibility(GlobalValue::HiddenVisibility); + return C; +} + +Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, IntegerType *IntTy, + uint32_t Storage) { + if (!shouldExportConstantsAsAbsoluteSymbols()) + return ConstantInt::get(IntTy, Storage); + + Constant *C = importGlobal(Slot, Args, Name); + auto *GV = cast<GlobalVariable>(C->stripPointerCasts()); + C = ConstantExpr::getPtrToInt(C, IntTy); + // We only need to set metadata if the global is newly created, in which // case it would not have hidden visibility. - if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + if (GV->getMetadata(LLVMContext::MD_absolute_symbol)) return C; - GV->setVisibility(GlobalValue::HiddenVisibility); auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); GV->setMetadata(LLVMContext::MD_absolute_symbol, MDNode::get(M.getContext(), {MinC, MaxC})); }; + unsigned AbsWidth = IntTy->getBitWidth(); if (AbsWidth == IntPtrTy->getBitWidth()) SetAbsRange(~0ull, ~0ull); // Full set. - else if (AbsWidth) + else SetAbsRange(0, 1ull << AbsWidth); - return GV; + return C; } void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, @@ -843,10 +931,12 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, Constant *UniqueMemberAddr) { for (auto &&Call : CSInfo.CallSites) { IRBuilder<> B(Call.CS.getInstruction()); - Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, - Call.VTable, UniqueMemberAddr); + Value *Cmp = + B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, + B.CreateBitCast(Call.VTable, Int8PtrTy), UniqueMemberAddr); Cmp = B.CreateZExt(Cmp, Call.CS->getType()); - Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp); + Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter, + Cmp); } CSInfo.markDevirt(); } @@ -909,17 +999,19 @@ void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, for (auto Call : CSInfo.CallSites) { auto *RetType = cast<IntegerType>(Call.CS.getType()); IRBuilder<> B(Call.CS.getInstruction()); - Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte); + Value *Addr = + B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte); if (RetType->getBitWidth() == 1) { Value *Bits = B.CreateLoad(Addr); Value *BitsAndBit = B.CreateAnd(Bits, Bit); auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled, - IsBitSet); + OREGetter, IsBitSet); } else { Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); Value *Val = B.CreateLoad(RetType, ValAddr); - Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val); + Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, + OREGetter, Val); } } CSInfo.markDevirt(); @@ -1007,18 +1099,18 @@ bool DevirtModule::tryVirtualConstProp( for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; - Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte); - Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); if (CSByConstantArg.second.isExported()) { ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp; - exportGlobal(Slot, CSByConstantArg.first, "byte", - ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy)); - exportGlobal(Slot, CSByConstantArg.first, "bit", - ConstantExpr::getIntToPtr(BitConst, Int8PtrTy)); + exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte, + ResByArg->Byte); + exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit, + ResByArg->Bit); } // Rewrite each call to a load from OffsetByte/OffsetBit. + Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte); + Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); applyVirtualConstProp(CSByConstantArg.second, TargetsForSlot[0].Fn->getName(), ByteConst, BitConst); } @@ -1112,8 +1204,7 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, Value *Ptr = CI->getArgOperand(0)->stripPointerCasts(); if (SeenPtrs.insert(Ptr).second) { for (DevirtCallSite Call : DevirtCalls) { - CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0), - Call.CS, nullptr); + CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, nullptr); } } } @@ -1250,10 +1341,10 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { break; } case WholeProgramDevirtResolution::ByArg::VirtualConstProp: { - Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32); - Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty); - Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8); - Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty); + Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte", + Int32Ty, ResByArg.Byte); + Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty, + ResByArg.Bit); applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); } default: @@ -1406,9 +1497,24 @@ bool DevirtModule::run() { // Generate remarks for each devirtualized function. for (const auto &DT : DevirtTargets) { Function *F = DT.second; - DISubprogram *SP = F->getSubprogram(); - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP, - Twine("devirtualized ") + F->getName()); + + // In the new pass manager, we can request the optimization + // remark emitter pass on a per-function-basis, which the + // OREGetter will do for us. + // In the old pass manager, this is harder, so we just build + // a optimization remark emitter on the fly, when we need it. + std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; + OptimizationRemarkEmitter *ORE; + if (OREGetter) + ORE = &OREGetter(F); + else { + OwnedORE = make_unique<OptimizationRemarkEmitter>(F); + ORE = OwnedORE.get(); + } + + using namespace ore; + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) + << "devirtualized " << NV("FunctionName", F->getName())); } } |
