author     Dimitry Andric <dim@FreeBSD.org>    2017-06-16 21:03:24 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-16 21:03:24 +0000
commit     7c7aba6e5fef47a01a136be655b0a92cfd7090f6
tree       99ec531924f6078534b100ab9d7696abce848099    /lib/Transforms
parent     7ab83427af0f77b59941ceba41d509d7d097b065
Diffstat (limited to 'lib/Transforms')
25 files changed, 1342 insertions, 766 deletions
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index 1b111de061576..d94aa5da85601 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -95,6 +95,17 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
     }
   }
 
+  NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+  if (CfiFunctionsMD) {
+    for (auto Func : CfiFunctionsMD->operands()) {
+      assert(Func->getNumOperands() >= 2);
+      for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+        if (ConstantInt *TypeId =
+                extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+          TypeIds.insert(TypeId->getZExtValue());
+    }
+  }
+
   LLVMContext &Ctx = M.getContext();
   Constant *C = M.getOrInsertFunction(
       "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
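For reference, each "cfi.functions" operand has the shape {name, linkage, type[, type...]}. A minimal sketch of emitting one such entry, using the same APIs that ThinLTOBitcodeWriter.cpp uses later in this commit (the helper name is hypothetical; the usual LLVM IR headers are assumed):

  void addCfiFunctionEntry(Module &M, StringRef Name, MDNode *TypeId) {
    LLVMContext &Ctx = M.getContext();
    SmallVector<Metadata *, 4> Elts;
    Elts.push_back(MDString::get(Ctx, Name));        // operand 0: function name
    Elts.push_back(ConstantAsMetadata::get(          // operand 1: CfiFunctionLinkage
        ConstantInt::get(Type::getInt8Ty(Ctx), CFL_Definition)));
    Elts.push_back(TypeId);                          // operands 2..N: !type nodes
    M.getOrInsertNamedMetadata("cfi.functions")
        ->addOperand(MDTuple::get(Ctx, Elts));
  }

The loop added above simply walks operands 2..N of every entry and collects the numeric type ids.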
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index c0dfeede05c5a..ad89e40661c67 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -523,40 +523,47 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
       if (!Callee || Callee->isDeclaration())
         continue;
 
-      // If this call site is dead and it is to a readonly function, we should
-      // just delete the call instead of trying to inline it, regardless of
-      // size. This happens because IPSCCP propagates the result out of the
-      // call and then we're left with the dead call.
-      if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {
-        DEBUG(dbgs() << "    -> Deleting dead call: " << *CS.getInstruction()
-                     << "\n");
-        // Update the call graph by deleting the edge from Callee to Caller.
-        CG[Caller]->removeCallEdgeFor(CS);
-        CS.getInstruction()->eraseFromParent();
-        ++NumCallsDeleted;
-      } else {
+      Instruction *Instr = CS.getInstruction();
+
+      bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI);
+
+      int InlineHistoryID;
+      if (!IsTriviallyDead) {
         // If this call site was obtained by inlining another function, verify
         // that the include path for the function did not include the callee
         // itself. If so, we'd be recursively inlining the same function,
         // which would provide the same callsites, which would cause us to
         // infinitely inline.
-        int InlineHistoryID = CallSites[CSi].second;
+        InlineHistoryID = CallSites[CSi].second;
         if (InlineHistoryID != -1 &&
             InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
           continue;
+      }
 
+      // FIXME for new PM: because of the old PM we currently generate ORE and
+      // in turn BFI on demand. With the new PM, the ORE dependency should
+      // just become a regular analysis dependency.
+      OptimizationRemarkEmitter ORE(Caller);
+
+      // If the policy determines that we should inline this function,
+      // delete the call instead.
+      if (!shouldInline(CS, GetInlineCost, ORE))
+        continue;
+
+      // If this call site is dead and it is to a readonly function, we should
+      // just delete the call instead of trying to inline it, regardless of
+      // size. This happens because IPSCCP propagates the result out of the
+      // call and then we're left with the dead call.
+      if (IsTriviallyDead) {
+        DEBUG(dbgs() << "    -> Deleting dead call: " << *Instr << "\n");
+        // Update the call graph by deleting the edge from Callee to Caller.
+        CG[Caller]->removeCallEdgeFor(CS);
+        Instr->eraseFromParent();
+        ++NumCallsDeleted;
+      } else {
         // Get DebugLoc to report. CS will be invalid after Inliner.
-        DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+        DebugLoc DLoc = Instr->getDebugLoc();
         BasicBlock *Block = CS.getParent();
 
-        // FIXME for new PM: because of the old PM we currently generate ORE and
-        // in turn BFI on demand. With the new PM, the ORE dependency should
-        // just become a regular analysis dependency.
-        OptimizationRemarkEmitter ORE(Caller);
-
-        // If the policy determines that we should inline this function,
-        // try to do so.
-        if (!shouldInline(CS, GetInlineCost, ORE))
-          continue;
         // Attempt to inline the function.
         using namespace ore;
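The "dead call to a readonly function" case that this hunk reorders typically arises from IPSCCP. An illustration (hypothetical source, not from the patch):

  __attribute__((const)) static int answer(int X) { return X * 2; }

  int user() {
    int R = answer(21); // IPSCCP proves R == 42 and rewrites R's uses,
    return R;           // leaving the readonly call trivially dead.
  }

After this change, the inline-cost policy is consulted before such dead calls are deleted, so the remark emitter sees every candidate call site.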
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 90896d285f5af..b406c22c69d7a 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -206,17 +207,26 @@ struct ByteArrayInfo {
 class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
   GlobalObject *GO;
   size_t NTypes;
+  // For functions: true if this is a definition (either in the merged module
+  // or in one of the thinlto modules).
+  bool IsDefinition;
+  // For functions: true if this function is either defined or used in a
+  // thinlto module and its jumptable entry needs to be exported to thinlto
+  // backends.
+  bool IsExported;
 
   friend TrailingObjects;
   size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
 
 public:
   static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+                                  bool IsDefinition, bool IsExported,
                                   ArrayRef<MDNode *> Types) {
     auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
         totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
     GTM->GO = GO;
     GTM->NTypes = Types.size();
+    GTM->IsDefinition = IsDefinition;
+    GTM->IsExported = IsExported;
     std::uninitialized_copy(Types.begin(), Types.end(),
                             GTM->getTrailingObjects<MDNode *>());
     return GTM;
@@ -224,6 +234,12 @@ public:
   GlobalObject *getGlobal() const {
     return GO;
   }
+  bool isDefinition() const {
+    return IsDefinition;
+  }
+  bool isExported() const {
+    return IsExported;
+  }
   ArrayRef<MDNode *> types() const {
     return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
   }
@@ -294,6 +310,7 @@ class LowerTypeTestsModule {
   void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
   TypeIdLowering importTypeId(StringRef TypeId);
   void importTypeTest(CallInst *CI);
+  void importFunction(Function *F, bool isDefinition);
 
   BitSetInfo
   buildBitSet(Metadata *TypeId,
@@ -820,6 +837,41 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
   CI->eraseFromParent();
 }
 
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isDefinition describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
+  assert(F->getType()->getAddressSpace() == 0);
+
+  // Declaration of a local function - nothing to do.
+  if (F->isDeclarationForLinker() && isDefinition)
+    return;
+
+  GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+  std::string Name = F->getName();
+  Function *FDecl;
+
+  if (F->isDeclarationForLinker() && !isDefinition) {
+    // Declaration of an external function.
+    FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+                             Name + ".cfi_jt", &M);
+    FDecl->setVisibility(GlobalValue::HiddenVisibility);
+  } else {
+    // Definition.
+    assert(isDefinition);
+    F->setName(Name + ".cfi");
+    F->setLinkage(GlobalValue::ExternalLinkage);
+    F->setVisibility(GlobalValue::HiddenVisibility);
+    FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+                             Name, &M);
+    FDecl->setVisibility(Visibility);
+  }
+
+  if (F->isWeakForLinker())
+    replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+  else
+    F->replaceAllUsesWith(FDecl);
+}
+
 void LowerTypeTestsModule::lowerTypeTestCalls(
     ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
     const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
@@ -1143,7 +1195,6 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
   // arithmetic that we normally use for globals.
 
   // FIXME: find a better way to represent the jumptable in the IR.
-  assert(!Functions.empty());
 
   // Build a simple layout based on the regular layout of jump tables.
@@ -1167,6 +1218,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
   // references to the original functions with references to the aliases.
   for (unsigned I = 0; I != Functions.size(); ++I) {
     Function *F = cast<Function>(Functions[I]->getGlobal());
+    bool IsDefinition = Functions[I]->isDefinition();
 
     Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
         ConstantExpr::getInBoundsGetElementPtr(
@@ -1174,7 +1226,18 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
             ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
                                  ConstantInt::get(IntPtrTy, I)}),
         F->getType());
-    if (F->isDeclarationForLinker()) {
+    if (Functions[I]->isExported()) {
+      if (IsDefinition) {
+        ExportSummary->cfiFunctionDefs().insert(F->getName());
+      } else {
+        GlobalAlias *JtAlias = GlobalAlias::create(
+            F->getValueType(), 0, GlobalValue::ExternalLinkage,
+            F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+        JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+        ExportSummary->cfiFunctionDecls().insert(F->getName());
+      }
+    }
+    if (!IsDefinition) {
       if (F->isWeakForLinker())
         replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
       else
@@ -1182,9 +1245,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     } else {
       assert(F->getType()->getAddressSpace() == 0);
 
-      GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0,
-                                                F->getLinkage(), "",
-                                                CombinedGlobalElemPtr, &M);
+      GlobalAlias *FAlias = GlobalAlias::create(
+          F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
       FAlias->setVisibility(F->getVisibility());
       FAlias->takeName(F);
       if (FAlias->hasName())
@@ -1353,15 +1415,37 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
 bool LowerTypeTestsModule::lower() {
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
-  if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary)
+  if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
+      !ImportSummary)
     return false;
 
   if (ImportSummary) {
-    for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
-         UI != UE;) {
-      auto *CI = cast<CallInst>((*UI++).getUser());
-      importTypeTest(CI);
+    if (TypeTestFunc) {
+      for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+           UI != UE;) {
+        auto *CI = cast<CallInst>((*UI++).getUser());
+        importTypeTest(CI);
+      }
+    }
+
+    SmallVector<Function *, 8> Defs;
+    SmallVector<Function *, 8> Decls;
+    for (auto &F : M) {
+      // CFI functions are either external, or promoted. A local function may
+      // have the same name, but it's not the one we are looking for.
+      if (F.hasLocalLinkage())
+        continue;
+      if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+        Defs.push_back(&F);
+      else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+        Decls.push_back(&F);
     }
 
+    for (auto F : Defs)
+      importFunction(F, /*isDefinition*/ true);
+    for (auto F : Decls)
+      importFunction(F, /*isDefinition*/ false);
+
     return true;
   }
@@ -1387,6 +1471,58 @@ bool LowerTypeTestsModule::lower() {
   llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
   unsigned I = 0;
   SmallVector<MDNode *, 2> Types;
+
+  struct ExportedFunctionInfo {
+    CfiFunctionLinkage Linkage;
+    MDNode *FuncMD; // {name, linkage, type[, type...]}
+  };
+  DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
+  if (ExportSummary) {
+    NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+    if (CfiFunctionsMD) {
+      for (auto FuncMD : CfiFunctionsMD->operands()) {
+        assert(FuncMD->getNumOperands() >= 2);
+        StringRef FunctionName =
+            cast<MDString>(FuncMD->getOperand(0))->getString();
+        if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
+                GlobalValue::dropLLVMManglingEscape(FunctionName))))
+          continue;
+        CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+            cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+                ->getValue()
+                ->getUniqueInteger()
+                .getZExtValue());
+        auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+        if (!P.second && P.first->second.Linkage != CFL_Definition)
+          P.first->second = {Linkage, FuncMD};
+      }
+
+      for (const auto &P : ExportedFunctions) {
+        StringRef FunctionName = P.first;
+        CfiFunctionLinkage Linkage = P.second.Linkage;
+        MDNode *FuncMD = P.second.FuncMD;
+        Function *F = M.getFunction(FunctionName);
+        if (!F)
+          F = Function::Create(
+              FunctionType::get(Type::getVoidTy(M.getContext()), false),
+              GlobalVariable::ExternalLinkage, FunctionName, &M);
+
+        if (Linkage == CFL_Definition)
+          F->eraseMetadata(LLVMContext::MD_type);
+
+        if (F->isDeclaration()) {
+          if (Linkage == CFL_WeakDeclaration)
+            F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+          SmallVector<MDNode *, 2> Types;
+          for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+            F->addMetadata(LLVMContext::MD_type,
+                           *cast<MDNode>(FuncMD->getOperand(I).get()));
+        }
+      }
+    }
+  }
+
   for (GlobalObject &GO : M.global_objects()) {
     if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
       continue;
@@ -1396,7 +1532,15 @@ bool LowerTypeTestsModule::lower() {
     if (Types.empty())
       continue;
 
-    auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
+    bool IsDefinition = !GO.isDeclarationForLinker();
+    bool IsExported = false;
+    if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
+      IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
+      IsExported = true;
+    }
+
+    auto *GTM =
+        GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
    for (MDNode *Type : Types) {
      verifyTypeMDNode(&GO, Type);
      auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
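To summarize the symbol scheme above: for an imported definition f, the real body is renamed to f.cfi and the original name is rebound to the jump table entry, while an imported declaration is redirected to the hidden f.cfi_jt alias exported by the full-LTO module. A condensed restatement of the definition path of importFunction (the helper name is hypothetical; the weak-linkage case is omitted):

  void renameCfiDefinition(Module &M, Function *F) {
    std::string Name = F->getName();             // e.g. "f"
    F->setName(Name + ".cfi");                   // body now lives in "f.cfi"
    F->setLinkage(GlobalValue::ExternalLinkage);
    F->setVisibility(GlobalValue::HiddenVisibility);
    // "f" becomes a declaration that resolves to the jump table entry
    // produced by the full-LTO module:
    Function *FDecl = Function::Create(F->getFunctionType(),
                                       GlobalValue::ExternalLinkage, Name, &M);
    F->replaceAllUsesWith(FDecl);
  }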
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index ea805efc66b79..8840435af6421 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -103,6 +103,35 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
+  // This class speculatively clones the function to be partially inlined.
+  // At the end of partial inlining, the remaining callsites to the cloned
+  // function that are not partially inlined will be fixed up to reference
+  // the original function, and the cloned function will be erased.
+  struct FunctionCloner {
+    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+    ~FunctionCloner();
+
+    // Prepare for function outlining: making sure there is only
+    // one incoming edge from the extracted/outlined region to
+    // the return block.
+    void NormalizeReturnBlock();
+
+    // Do function outlining:
+    Function *doFunctionOutlining();
+
+    Function *OrigFunc = nullptr;
+    Function *ClonedFunc = nullptr;
+    Function *OutlinedFunc = nullptr;
+    BasicBlock *OutliningCallBB = nullptr;
+    // ClonedFunc is inlined in one of its callers after function
+    // outlining.
+    bool IsFunctionInlined = false;
+    // The cost of the region to be outlined.
+    int OutlinedRegionCost = 0;
+    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+  };
+
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -114,27 +143,18 @@ private:
   // The result is no larger than 1 and is represented using BP.
   // (Note that the outlined region's 'head' block can only have incoming
   // edges from the guarding entry blocks).
-  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
-                                                   FunctionOutliningInfo *OI,
-                                                   Function *DuplicateFunction,
-                                                   BlockFrequencyInfo *BFI,
-                                                   BasicBlock *OutliningCallBB);
+  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
 
   // Return true if the callee of CS should be partially inlined with
   // profit.
-  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
-                           BlockFrequencyInfo *CalleeBFI,
-                           BasicBlock *OutliningCallBB,
-                           int OutliningCallOverhead,
+  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+                           BlockFrequency WeightedOutliningRcost,
                            OptimizationRemarkEmitter &ORE);
 
   // Try to inline DuplicateFunction (cloned from F with a call to
   // the OutlinedFunction) into its callers. Return true
   // if there is any successful inlining.
-  bool tryPartialInline(Function *DuplicateFunction,
-                        Function *F, /*original function */
-                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
-                        BlockFrequencyInfo *CalleeBFI);
+  bool tryPartialInline(FunctionCloner &Cloner);
 
   // Compute the mapping from use site of DuplicationFunction to the enclosing
   // BB's profile count.
@@ -146,7 +166,7 @@ private:
                NumPartialInlining >= MaxNumPartialInlining);
   }
 
-  CallSite getCallSite(User *U) {
+  static CallSite getCallSite(User *U) {
     CallSite CS;
     if (CallInst *CI = dyn_cast<CallInst>(U))
       CS = CallSite(CI);
@@ -157,7 +177,7 @@ private:
     return CS;
   }
 
-  CallSite getOneCallSiteTo(Function *F) {
+  static CallSite getOneCallSiteTo(Function *F) {
     User *User = *F->user_begin();
     return getCallSite(User);
   }
@@ -171,20 +191,15 @@ private:
   // Returns the costs associated with function outlining:
   // - The first value is the non-weighted runtime cost for making the call
-  //   to the outlined function 'OutlinedFunction', including the additional
-  //   setup cost in the outlined function itself;
+  //   to the outlined function, including the additional setup cost in the
+  //   outlined function itself;
   // - The second value is the estimated size of the new call sequence in
-  //   basic block 'OutliningCallBB';
-  // - The third value is the estimated size of the original code from
-  //   function 'F' that is extracted into the outlined function.
-  std::tuple<int, int, int>
-  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
-                        Function *OutlinedFunction,
-                        BasicBlock *OutliningCallBB);
+  //   basic block Cloner.OutliningCallBB;
+  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
 
   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
   // approximate both the size and runtime cost (Note that in the current
   // inline cost analysis, there is no clear distinction there either).
-  int computeBBInlineCost(BasicBlock *BB);
+  static int computeBBInlineCost(BasicBlock *BB);
 
   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
@@ -396,19 +411,19 @@ static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
   return false;
 }
 
-BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
-    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
-    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+BranchProbability
+PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
   auto EntryFreq =
-      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
-  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+  auto OutliningCallFreq =
+      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
 
   auto OutlineRegionRelFreq =
       BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                               EntryFreq.getFrequency());
 
-  if (hasProfileData(F, OI))
+  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
     return OutlineRegionRelFreq;
 
   // When profile data is not available, we need to be conservative in
@@ -433,15 +448,17 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
 }
 
 bool PartialInlinerImpl::shouldPartialInline(
-    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
-    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
-    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
+    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+    OptimizationRemarkEmitter &ORE) {
+  using namespace ore;
   if (SkipCostAnalysis)
     return true;
 
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
+  assert(Callee == Cloner.ClonedFunc);
+
   Function *Caller = CS.getCaller();
   auto &CalleeTTI = (*GetTTI)(*Callee);
   InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
@@ -449,14 +466,14 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", F)
+             << NV("Callee", Cloner.OrigFunc)
              << " should always be fully inlined, not partially");
     return false;
   }
 
   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", F) << " not partially inlined into "
+             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
@@ -464,29 +481,25 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   if (!IC) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", F) << " not partially inlined into "
+             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
   }
   const DataLayout &DL = Caller->getParent()->getDataLayout();
+
   // The savings of eliminating the call:
   int NonWeightedSavings = getCallsiteCost(CS, DL);
   BlockFrequency NormWeightedSavings(NonWeightedSavings);
 
-  auto RelativeFreq =
-      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
-  auto NormWeightedRcost =
-      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
-
   // Weighted saving is smaller than weighted cost, return false
-  if (NormWeightedSavings < NormWeightedRcost) {
+  if (NormWeightedSavings < WeightedOutliningRcost) {
     ORE.emit(
         OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
-        << NV("Callee", F) << " not partially inlined into "
+        << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
         << NV("Caller", Caller) << " runtime overhead (overhead="
-        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
        << ", savings="
       << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
      << " of making the outlined call is too high");
 
     return false;
   }
@@ -495,7 +508,7 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", F) << " can be partially inlined into "
+           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
          << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
@@ -551,50 +564,32 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
   return InlineCost;
 }
 
-std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
-    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
-    BasicBlock *OutliningCallBB) {
-  // First compute the cost of the outlined region 'OI' in the original
-  // function 'F'.
-  // FIXME: The code extractor (outliner) can now do code sinking/hoisting
-  // to reduce outlining cost. The hoisted/sunk code currently do not
-  // incur any runtime cost so it is still OK to compare the outlined
-  // function cost with the outlined region in the original function.
-  // If this ever changes, we will need to introduce new extractor api
-  // to pass the information.
-  int OutlinedRegionCost = 0;
-  for (BasicBlock &BB : *F) {
-    if (&BB != OI->ReturnBlock &&
-        // Assuming Entry set is small -- do a linear search here:
-        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
-            OI->Entries.end()) {
-      OutlinedRegionCost += computeBBInlineCost(&BB);
-    }
-  }
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
   // Now compute the cost of the call sequence to the outlined function
   // 'OutlinedFunction' in BB 'OutliningCallBB':
-  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
 
   // Now compute the cost of the extracted/outlined function itself:
   int OutlinedFunctionCost = 0;
-  for (BasicBlock &BB : *OutlinedFunction) {
+  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
     OutlinedFunctionCost += computeBBInlineCost(&BB);
   }
 
-  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
          "Outlined function cost should be no less than the outlined region");
   // The code extractor introduces a new root and exit stub blocks with
   // additional unconditional branches. Those branches will be eliminated
   // later with bb layout. The cost should be adjusted accordingly:
   OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
 
-  int OutliningRuntimeOverhead = OutliningFuncCallCost +
-                                 (OutlinedFunctionCost - OutlinedRegionCost) +
-                                 ExtraOutliningPenalty;
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost +
+      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+      ExtraOutliningPenalty;
 
-  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
-                         OutlinedRegionCost);
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
 }
 
 // Create the callsite to profile count map which is
@@ -641,42 +636,30 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
   }
 }
 
-Function *PartialInlinerImpl::unswitchFunction(Function *F) {
-
-  if (F->hasAddressTaken())
-    return nullptr;
-
-  // Let inliner handle it
-  if (F->hasFnAttribute(Attribute::AlwaysInline))
-    return nullptr;
-
-  if (F->hasFnAttribute(Attribute::NoInline))
-    return nullptr;
-
-  if (PSI->isFunctionEntryCold(F))
-    return nullptr;
-
-  if (F->user_begin() == F->user_end())
-    return nullptr;
-
-  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
-
-  if (!OI)
-    return nullptr;
+PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
+                                                   FunctionOutliningInfo *OI)
+    : OrigFunc(F) {
+  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
 
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
-  Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
-  DenseSet<BasicBlock *> NewEntries;
+  ClonedFunc = CloneFunction(F, VMap);
+
+  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   for (BasicBlock *BB : OI->Entries) {
-    NewEntries.insert(cast<BasicBlock>(VMap[BB]));
+    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+  }
+  for (BasicBlock *E : OI->ReturnBlockPreds) {
+    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+    ClonedOI->ReturnBlockPreds.push_back(NewE);
   }
-
   // Go ahead and update all uses to the duplicate, so that we can just
   // use the inliner functionality when we're done hacking.
-  F->replaceAllUsesWith(DuplicateFunction);
+  F->replaceAllUsesWith(ClonedFunc);
+}
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
 
   auto getFirstPHI = [](BasicBlock *BB) {
     BasicBlock::iterator I = BB->begin();
@@ -692,14 +675,19 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
     }
     return FirstPhi;
   };
+
   // Special hackery is needed with PHI nodes that have inputs from more than
   // one extracted block. For simplicity, just split the PHIs into a two-level
   // sequence of PHIs, some of which will go in the extracted region, and some
   // of which will go outside.
-  BasicBlock *PreReturn = NewReturnBlock;
+  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+    return;
+
   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
     Value *CommonValue = PN->getIncomingValue(0);
     if (all_of(PN->incoming_values(),
@@ -708,143 +696,185 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
     return nullptr;
   };
 
-  if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
-
-    NewReturnBlock = NewReturnBlock->splitBasicBlock(
-        NewReturnBlock->getFirstNonPHI()->getIterator());
-    BasicBlock::iterator I = PreReturn->begin();
-    Instruction *Ins = &NewReturnBlock->front();
-    SmallVector<Instruction *, 4> DeadPhis;
-    while (I != PreReturn->end()) {
-      PHINode *OldPhi = dyn_cast<PHINode>(I);
-      if (!OldPhi)
-        break;
-
-      PHINode *RetPhi =
-          PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
-      OldPhi->replaceAllUsesWith(RetPhi);
-      Ins = NewReturnBlock->getFirstNonPHI();
+  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+  BasicBlock::iterator I = PreReturn->begin();
+  Instruction *Ins = &ClonedOI->ReturnBlock->front();
+  SmallVector<Instruction *, 4> DeadPhis;
+  while (I != PreReturn->end()) {
+    PHINode *OldPhi = dyn_cast<PHINode>(I);
+    if (!OldPhi)
+      break;
 
-      RetPhi->addIncoming(&*I, PreReturn);
-      for (BasicBlock *E : OI->ReturnBlockPreds) {
-        BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
-        RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
-        OldPhi->removeIncomingValue(NewE);
-      }
+    PHINode *RetPhi =
+        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+    OldPhi->replaceAllUsesWith(RetPhi);
+    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
 
-      // After incoming values splitting, the old phi may become trivial.
-      // Keeping the trivial phi can introduce definition inside the outline
-      // region which is live-out, causing necessary overhead (load, store
-      // arg passing etc).
-      if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
-        OldPhi->replaceAllUsesWith(OldPhiVal);
-        DeadPhis.push_back(OldPhi);
-      }
-
-      ++I;
+    RetPhi->addIncoming(&*I, PreReturn);
+    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+      OldPhi->removeIncomingValue(E);
     }
+
+    // After incoming values splitting, the old phi may become trivial.
+    // Keeping the trivial phi can introduce definition inside the outline
+    // region which is live-out, causing necessary overhead (load, store
+    // arg passing etc).
+    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+      OldPhi->replaceAllUsesWith(OldPhiVal);
+      DeadPhis.push_back(OldPhi);
+    }
+    ++I;
+  }
   for (auto *DP : DeadPhis)
     DP->eraseFromParent();
 
-    for (auto E : OI->ReturnBlockPreds) {
-      BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
-      NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
+  for (auto E : ClonedOI->ReturnBlockPreds) {
+    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
   }
-  }
+}
 
+Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
   // Returns true if the block is to be partial inlined into the caller
   // (i.e. not to be extracted to the out of line function)
-  auto ToBeInlined = [&](BasicBlock *BB) {
-    return BB == NewReturnBlock || NewEntries.count(BB);
+  auto ToBeInlined = [&, this](BasicBlock *BB) {
+    return BB == ClonedOI->ReturnBlock ||
+           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+            ClonedOI->Entries.end());
   };
 
+  // Gather up the blocks that we're going to extract.
   std::vector<BasicBlock *> ToExtract;
-  ToExtract.push_back(NewNonReturnBlock);
-  for (BasicBlock &BB : *DuplicateFunction)
-    if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
+  ToExtract.push_back(ClonedOI->NonReturnBlock);
+  OutlinedRegionCost +=
+      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+  for (BasicBlock &BB : *ClonedFunc)
+    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
       ToExtract.push_back(&BB);
+      // FIXME: the code extractor may hoist/sink more code
+      // into the outlined function which may make the outlining
+      // overhead (the difference of the outlined function cost
+      // and OutliningRegionCost) look larger.
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
 
   // The CodeExtractor needs a dominator tree.
   DominatorTree DT;
-  DT.recalculate(*DuplicateFunction);
+  DT.recalculate(*ClonedFunc);
 
   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
   LoopInfo LI(DT);
-  BranchProbabilityInfo BPI(*DuplicateFunction, LI);
-  BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
+  BranchProbabilityInfo BPI(*ClonedFunc, LI);
+  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
 
   // Extract the body of the if.
-  Function *OutlinedFunction =
-      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
-          .extractCodeRegion();
+  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                                ClonedFuncBFI.get(), &BPI)
+                     .extractCodeRegion();
+
+  if (OutlinedFunc) {
+    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+                          .getInstruction()
+                          ->getParent();
+    assert(OutliningCallBB->getParent() == ClonedFunc);
+  }
 
-  bool AnyInline =
-      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+  return OutlinedFunc;
+}
 
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
   // Ditch the duplicate, since we're done with it, and rewrite all remaining
   // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
+  ClonedFunc->replaceAllUsesWith(OrigFunc);
+  ClonedFunc->eraseFromParent();
+  if (!IsFunctionInlined) {
+    // Remove the function that is speculatively created if there is no
+    // reference.
+    if (OutlinedFunc)
+      OutlinedFunc->eraseFromParent();
+  }
+}
+
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+
+  if (F->hasAddressTaken())
+    return nullptr;
+
+  // Let inliner handle it
+  if (F->hasFnAttribute(Attribute::AlwaysInline))
+    return nullptr;
+
+  if (F->hasFnAttribute(Attribute::NoInline))
+    return nullptr;
+
+  if (PSI->isFunctionEntryCold(F))
+    return nullptr;
+
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+  if (!OI)
+    return nullptr;
+
+  FunctionCloner Cloner(F, OI.get());
+  Cloner.NormalizeReturnBlock();
+  Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+  bool AnyInline = tryPartialInline(Cloner);
 
   if (AnyInline)
     return OutlinedFunction;
 
-  // Remove the function that is speculatively created:
-  if (OutlinedFunction)
-    OutlinedFunction->eraseFromParent();
-
   return nullptr;
 }
 
-bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
-                                          Function *F,
-                                          FunctionOutliningInfo *OI,
-                                          Function *OutlinedFunction,
-                                          BlockFrequencyInfo *CalleeBFI) {
-  if (OutlinedFunction == nullptr)
-    return false;
-
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
   int NonWeightedRcost;
   int SizeCost;
-  int OutlinedRegionSizeCost;
 
-  auto OutliningCallBB =
-      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+  if (Cloner.OutlinedFunc == nullptr)
+    return false;
+
+  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
 
-  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
-      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
 
   // The call sequence to the outlined function is larger than the original
   // outlined region size, it does not increase the chances of inlining
-  // 'F' with outlining (The inliner uses the size increase to model the
-  // the cost of inlining a callee).
-  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
-    OptimizationRemarkEmitter ORE(F);
+  // the function with outlining (The inliner uses the size increase to
+  // model the cost of inlining a callee).
+  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
     DebugLoc DLoc;
     BasicBlock *Block;
-    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                         DLoc, Block)
-             << ore::NV("Function", F)
+             << ore::NV("Function", Cloner.OrigFunc)
              << " not partially inlined into callers (Original Size = "
-             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
            << ", Size of call sequence to outlined function = "
           << ore::NV("NewSize", SizeCost) << ")");
    return false;
   }
 
-  assert(F->user_begin() == F->user_end() &&
+  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
          "F's users should all be replaced!");
-  std::vector<User *> Users(DuplicateFunction->user_begin(),
-                            DuplicateFunction->user_end());
+
+  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+                            Cloner.ClonedFunc->user_end());
 
   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
-  if (F->getEntryCount())
-    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+  if (Cloner.OrigFunc->getEntryCount())
+    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
 
-  auto CalleeEntryCount = F->getEntryCount();
+  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
   uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
   bool AnyInline = false;
   for (User *User : Users) {
     CallSite CS = getCallSite(User);
@@ -854,13 +884,12 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
 
     OptimizationRemarkEmitter ORE(CS.getCaller());
 
-    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
-                             NonWeightedRcost, ORE))
+    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;
 
    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
-        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));
 
    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
@@ -878,8 +907,11 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
     NumPartialInlined++;
   }
 
-  if (AnyInline && CalleeEntryCount)
-    F->setEntryCount(CalleeEntryCountV);
+  if (AnyInline) {
+    Cloner.IsFunctionInlined = true;
+    if (CalleeEntryCount)
+      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+  }
 
   return AnyInline;
 }
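At the source level, the transformation this machinery implements looks roughly like the following (hypothetical function; the pass works on IR, and the outlined callee is produced by the CodeExtractor):

  struct Request;

  void handle(Request *R) {
    if (!R)
      return;   // cheap guarded early return: this test is what gets
                // inlined into callers
    // Everything below is the cold region: it is extracted into a separate
    // outlined function, and the inlined guard calls it on the slow path.
    // ... expensive processing ...
  }

The FunctionCloner keeps the clone, the outlined function, and their costs together so that the cost queries above no longer need five loose parameters.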
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 16fba32e98056..4bc64ab698ff9 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -141,6 +141,10 @@ static cl::opt<int> PreInlineThreshold(
     cl::desc("Control the amount of inlining in pre-instrumentation inliner "
              "(default = 75)"));
 
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+    "enable-earlycse-memssa", cl::init(false), cl::Hidden,
+    cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)"));
+
 static cl::opt<bool> EnableGVNHoist(
     "enable-gvn-hoist", cl::init(false), cl::Hidden,
     cl::desc("Enable the GVN hoisting pass (default = off)"));
@@ -308,7 +312,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   // Start of function pass.
   // Break up aggregate allocas, using SSAUpdater.
   MPM.add(createSROAPass());
-  MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+  MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies
   if (EnableGVNHoist)
     MPM.add(createGVNHoistPass());
   if (EnableGVNSink) {
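A hedged usage note: the new flag only changes which EarlyCSE variant the legacy pipeline builder schedules, so (assuming the standard opt driver) an invocation like

  opt -O2 -enable-earlycse-memssa input.ll -S -o output.ll

exercises the MemorySSA-backed EarlyCSE in place of the default one.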
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index a7bcc7cc55325..802f470ffe1fb 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -32,7 +32,8 @@ namespace {
 
 // Promote each local-linkage entity defined by ExportM and used by ImportM by
 // changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+                      SetVector<GlobalValue *> &PromoteExtra) {
   DenseMap<const Comdat *, Comdat *> RenamedComdats;
   for (auto &ExportGV : ExportM.global_values()) {
     if (!ExportGV.hasLocalLinkage())
@@ -40,7 +41,7 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
 
     auto Name = ExportGV.getName();
     GlobalValue *ImportGV = ImportM.getNamedValue(Name);
-    if (!ImportGV || ImportGV->use_empty())
+    if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
       continue;
 
     std::string NewName = (Name + ModuleId).str();
@@ -53,8 +54,10 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
     ExportGV.setLinkage(GlobalValue::ExternalLinkage);
     ExportGV.setVisibility(GlobalValue::HiddenVisibility);
 
-    ImportGV->setName(NewName);
-    ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+    if (ImportGV) {
+      ImportGV->setName(NewName);
+      ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+    }
   }
 
   if (!RenamedComdats.empty())
@@ -296,6 +299,11 @@ void splitAndWriteThinLTOBitcode(
       F.setComdat(nullptr);
     }
 
+  SetVector<GlobalValue *> CfiFunctions;
+  for (auto &F : M)
+    if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+      CfiFunctions.insert(&F);
+
   // Remove all globals with type metadata, globals with comdats that live in
   // MergedM, and aliases pointing to such globals from the thin LTO module.
   filterModule(&M, [&](const GlobalValue *GV) {
@@ -308,11 +316,39 @@ void splitAndWriteThinLTOBitcode(
     return true;
   });
 
-  promoteInternals(*MergedM, M, ModuleId);
-  promoteInternals(M, *MergedM, ModuleId);
+  promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+  promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+  SmallVector<MDNode *, 8> CfiFunctionMDs;
+  for (auto V : CfiFunctions) {
+    Function &F = *cast<Function>(V);
+    SmallVector<MDNode *, 2> Types;
+    F.getMetadata(LLVMContext::MD_type, Types);
+
+    auto &Ctx = MergedM->getContext();
+    SmallVector<Metadata *, 4> Elts;
+    Elts.push_back(MDString::get(Ctx, F.getName()));
+    CfiFunctionLinkage Linkage;
+    if (!F.isDeclarationForLinker())
+      Linkage = CFL_Definition;
+    else if (F.isWeakForLinker())
+      Linkage = CFL_WeakDeclaration;
+    else
+      Linkage = CFL_Declaration;
+    Elts.push_back(ConstantAsMetadata::get(
+        llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+    for (auto Type : Types)
+      Elts.push_back(Type);
+    CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+  }
 
-  simplifyExternals(*MergedM);
+  if (!CfiFunctionMDs.empty()) {
+    NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+    for (auto MD : CfiFunctionMDs)
+      NMD->addOperand(MD);
+  }
 
+  simplifyExternals(*MergedM);
 
   // FIXME: Try to re-use BSI and PFI from the original module here.
   ProfileSummaryInfo PSI(M);
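The linkage operand recorded for each "cfi.functions" entry is derived solely from how the function appears in the thin module. Restated as a standalone helper (same logic as the classification chain above; the helper name is hypothetical):

  static CfiFunctionLinkage classifyForCfi(const Function &F) {
    if (!F.isDeclarationForLinker())
      return CFL_Definition;        // the body is present in this module
    if (F.isWeakForLinker())
      return CFL_WeakDeclaration;   // may legitimately remain unresolved
    return CFL_Declaration;
  }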
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4fe3225a21722..a881bda5ba98d 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -763,8 +763,54 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
   return nullptr;
 }
 
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+                                                   bool JoinedByAnd,
+                                                   Instruction &CxtI) {
+  ICmpInst::Predicate Pred = LHS->getPredicate();
+  if (Pred != RHS->getPredicate())
+    return nullptr;
+  if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+    return nullptr;
+  if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+    return nullptr;
+
+  // TODO support vector splats
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero())
+    return nullptr;
+
+  Value *A, *B, *C, *D;
+  if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+      match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+    if (A == D || B == D)
+      std::swap(C, D);
+    if (B == C)
+      std::swap(A, B);
+
+    if (A == C &&
+        isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
+        isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+      Value *Mask = Builder->CreateOr(B, D);
+      Value *Masked = Builder->CreateAnd(A, Mask);
+      auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+      return Builder->CreateICmp(NewPred, Masked, Mask);
+    }
+  }
+
+  return nullptr;
+}
+
 /// Fold (icmp)&(icmp) if possible.
-Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+                                    Instruction &CxtI) {
+  // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+  // if K1 and K2 are a one-bit mask.
+  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+    return V;
+
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
   // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
@@ -1127,8 +1173,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
     ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
     ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
     if (ICmp0 && ICmp1) {
-      Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1)
-                                                : foldOrOfICmps(ICmp0, ICmp1, &I);
+      Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+                                                : foldOrOfICmps(ICmp0, ICmp1, I);
       if (Res)
         return CastInst::Create(CastOpcode, Res, DestTy);
       return nullptr;
@@ -1426,7 +1472,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
     ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
     if (LHS && RHS)
-      if (Value *Res = foldAndOfICmps(LHS, RHS))
+      if (Value *Res = foldAndOfICmps(LHS, RHS, I))
         return replaceInstUsesWith(I, Res);
 
     // TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -1434,18 +1480,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     Value *X, *Y;
     if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldAndOfICmps(LHS, Cmp))
+        if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldAndOfICmps(LHS, Cmp))
+        if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
     }
     if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldAndOfICmps(Cmp, RHS))
+        if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldAndOfICmps(Cmp, RHS))
+        if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
     }
   }
@@ -1591,41 +1637,16 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
 
 /// Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
-                                   Instruction *CxtI) {
-  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
+                                   Instruction &CxtI) {
   // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
   // if K1 and K2 are a one-bit mask.
-  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+    return V;
 
-  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero() &&
-      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
-
-    BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
-    BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
-    if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
-        LAnd->getOpcode() == Instruction::And &&
-        RAnd->getOpcode() == Instruction::And) {
-
-      Value *Mask = nullptr;
-      Value *Masked = nullptr;
-      if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
-          isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) &&
-          isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) {
-        Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
-        Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
-      } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
-                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) &&
-                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) {
-        Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
-        Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
-      }
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
-      if (Masked)
-        return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
-    }
-  }
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
 
   // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
   //                   -->  (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
@@ -2117,12 +2138,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   }
 
   // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+  // FIXME: The two hasOneUse calls here are the same call, maybe we were
+  // supposed to check Op1->operand(0)?
   if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
     if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
       if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
        return BinaryOperator::CreateOr(Op0, C);
 
   // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+  // FIXME: The two hasOneUse calls here are the same call, maybe we were
+  // supposed to check Op0->operand(0)?
   if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
     if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
       if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
@@ -2194,7 +2219,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
     ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
     if (LHS && RHS)
-      if (Value *Res = foldOrOfICmps(LHS, RHS, &I))
+      if (Value *Res = foldOrOfICmps(LHS, RHS, I))
        return replaceInstUsesWith(I, Res);
 
     // TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -2202,18 +2227,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     Value *X, *Y;
     if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+        if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+        if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
     }
     if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+        if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+        if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
     }
   }
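Restating the unified fold on plain integers: when K1 and K2 are single-bit masks, "at least one of the bits is clear" is the same condition as "not all of the bits are set". A quick C++ self-check (uint32_t and the power-of-two precondition on K1/K2 are assumptions of the example, not of the pass):

  #include <cstdint>

  bool orOfIsZero(uint32_t A, uint32_t K1, uint32_t K2) {
    return (A & K1) == 0 || (A & K2) == 0;   // original form
  }

  bool foldedForm(uint32_t A, uint32_t K1, uint32_t K2) {
    uint32_t Mask = K1 | K2;
    return (A & Mask) != Mask;               // folded form
  }

  // orOfIsZero(A, 4, 16) == foldedForm(A, 4, 16) for every A; the JoinedByAnd
  // variant is the same identity with both sides negated.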
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d29ed49eca0b0..c0830a5d21124 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -94,75 +94,80 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
   return ConstantVector::get(BoolVec);
 }
 
-Instruction *
-InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
+    ElementUnorderedAtomicMemCpyInst *AMI) {
   // Try to unfold this intrinsic into sequence of explicit atomic loads and
   // stores.
   // First check that number of elements is compile time constant.
-  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
-  if (!NumElementsCI)
+  auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
+  if (!LengthCI)
     return nullptr;
 
   // Check that there are not too many elements.
-  uint64_t NumElements = NumElementsCI->getZExtValue();
+  uint64_t LengthInBytes = LengthCI->getZExtValue();
+  uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
+  uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
   if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
     return nullptr;
 
-  // Don't unfold into illegal integers
-  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
-  if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
-    return nullptr;
+  // Only expand if there are elements to copy.
+  if (NumElements > 0) {
+    // Don't unfold into illegal integers
+    uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
+    if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+      return nullptr;
 
-  // Cast source and destination to the correct type. Intrinsic input arguments
-  // are usually represented as i8*.
-  // Often operands will be explicitly casted to i8* and we can just strip
-  // those casts instead of inserting new ones. However it's easier to rely on
-  // other InstCombine rules which will cover trivial cases anyway.
-  Value *Src = AMI->getRawSource();
-  Value *Dst = AMI->getRawDest();
-  Type *ElementPointerType = Type::getIntNPtrTy(
-      AMI->getContext(), ElementSizeInBytes,
-      Src->getType()->getPointerAddressSpace());
-
-  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
-                                                "memcpy_unfold.src_casted");
-  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
-                                                "memcpy_unfold.dst_casted");
-
-  for (uint64_t i = 0; i < NumElements; ++i) {
-    // Get current element addresses
-    ConstantInt *ElementIdxCI =
-        ConstantInt::get(AMI->getContext(), APInt(64, i));
-    Value *SrcElementAddr =
-        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
-    Value *DstElementAddr =
-        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
-    // Load from the source. Transfer alignment information and mark load as
-    // unordered atomic.
-    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
-    Load->setOrdering(AtomicOrdering::Unordered);
-    // We know alignment of the first element. It is also guaranteed by the
-    // verifier that element size is less than or equal to the first element
-    // alignment and both of these values are powers of two.
-    // This means that all subsequent accesses are at least element size
-    // aligned.
-    // TODO: We can infer better alignment but there is no evidence that this
-    // will matter.
-    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
-                              : AMI->getElementSizeInBytes());
-    Load->setDebugLoc(AMI->getDebugLoc());
-
-    // Store loaded value via unordered atomic store.
-    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
-    Store->setOrdering(AtomicOrdering::Unordered);
-    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
-                               : AMI->getElementSizeInBytes());
-    Store->setDebugLoc(AMI->getDebugLoc());
+    // Cast source and destination to the correct type. Intrinsic input
+    // arguments are usually represented as i8*. Often operands will be
+    // explicitly casted to i8* and we can just strip those casts instead of
+    // inserting new ones. However it's easier to rely on other InstCombine
+    // rules which will cover trivial cases anyway.
+    Value *Src = AMI->getRawSource();
+    Value *Dst = AMI->getRawDest();
+    Type *ElementPointerType =
+        Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+                           Src->getType()->getPointerAddressSpace());
+
+    Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                  "memcpy_unfold.src_casted");
+    Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                  "memcpy_unfold.dst_casted");
+
+    for (uint64_t i = 0; i < NumElements; ++i) {
+      // Get current element addresses
+      ConstantInt *ElementIdxCI =
+          ConstantInt::get(AMI->getContext(), APInt(64, i));
+      Value *SrcElementAddr =
+          Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+      Value *DstElementAddr =
+          Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+      // Load from the source. Transfer alignment information and mark load as
+      // unordered atomic.
+      LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+      Load->setOrdering(AtomicOrdering::Unordered);
+      // We know alignment of the first element. It is also guaranteed by the
+      // verifier that element size is less than or equal to the first element
+      // alignment and both of these values are powers of two. This means that
+      // all subsequent accesses are at least element size aligned.
+      // TODO: We can infer better alignment but there is no evidence that this
+      // will matter.
+      Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
                                 : ElementSizeInBytes);
+      Load->setDebugLoc(AMI->getDebugLoc());
+
+      // Store loaded value via unordered atomic store.
+      StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+      Store->setOrdering(AtomicOrdering::Unordered);
+      Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
                                  : ElementSizeInBytes);
+      Store->setDebugLoc(AMI->getDebugLoc());
+    }
   }
 
   // Set the number of elements of the copy to 0, it will be deleted on the
   // next iteration.
-  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  AMI->setLength(Constant::getNullValue(LengthCI->getType()));
 
   return AMI;
 }
@@ -1888,12 +1893,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (Changed) return II;
   }
 
-  if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
-    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+  if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
+    if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
       if (C->isNullValue())
         return eraseInstFromFunction(*AMI);
 
-    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+    if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
       return I;
  }
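What the unfold produces is a NumElements-long sequence of per-element atomic loads and stores. As a C++ analogy (C++ has no exact counterpart to LLVM's 'unordered' ordering; relaxed is the nearest spelling):

  #include <atomic>
  #include <cstdint>

  void unfoldedCopy(std::atomic<uint32_t> *Dst,
                    const std::atomic<uint32_t> *Src, uint64_t NumElements) {
    for (uint64_t I = 0; I != NumElements; ++I) {
      // One atomic load and one atomic store per element; no element is
      // ever torn, but there is no ordering between elements.
      uint32_t V = Src[I].load(std::memory_order_relaxed);
      Dst[I].store(V, std::memory_order_relaxed);
    }
  }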
+ // TODO: We can infer better alignment but there is no evidence that this + // will matter. + Load->setAlignment(i == 0 ? AMI->getParamAlignment(1) + : ElementSizeInBytes); + Load->setDebugLoc(AMI->getDebugLoc()); + + // Store loaded value via unordered atomic store. + StoreInst *Store = Builder->CreateStore(Load, DstElementAddr); + Store->setOrdering(AtomicOrdering::Unordered); + Store->setAlignment(i == 0 ? AMI->getParamAlignment(0) + : ElementSizeInBytes); + Store->setDebugLoc(AMI->getDebugLoc()); + } } // Set the number of elements of the copy to 0, it will be deleted on the // next iteration. - AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType())); + AMI->setLength(Constant::getNullValue(LengthCI->getType())); return AMI; } @@ -1888,12 +1893,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } - if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) { - if (Constant *C = dyn_cast<Constant>(AMI->getNumElements())) + if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) { + if (Constant *C = dyn_cast<Constant>(AMI->getLength())) if (C->isNullValue()) return eraseInstFromFunction(*AMI); - if (Instruction *I = SimplifyElementAtomicMemCpy(AMI)) + if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI)) return I; } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index fd0a64a5bbb56..1a7db146df426 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -447,12 +447,14 @@ private: Instruction::CastOps isEliminableCastPair(const CastInst *CI1, const CastInst *CI2); - Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS); - Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI); + Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS); Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, + bool JoinedByAnd, Instruction &CxtI); public: /// \brief Inserts an instruction \p New before instruction \p Old /// @@ -724,7 +726,8 @@ private: Instruction *MatchBSwap(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); - Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI); + Instruction * + SimplifyElementUnorderedAtomicMemCpy(ElementUnorderedAtomicMemCpyInst *AMI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 3f2ddcacce2b8..8cec865c6422a 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -682,11 +682,11 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); } - if (match(Op0, m_SExt(m_Value(X)))) { + if (match(Op0, m_SExt(m_Value(X))) && + (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { // Are we moving the sign bit to the low bit and widening with high zeros? 
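
The hunk below adds a second fold for the case where the shift amount equals the extension amount: lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN. A standalone check of that identity for M = 8, N = 32 (plain C++, not LLVM code; it assumes arithmetic right shift for signed values, which every mainstream compiler implements and C++20 guarantees):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = static_cast<int8_t>(V);
    uint32_t Sext = static_cast<uint32_t>(static_cast<int32_t>(X)); // sext to i32
    uint32_t LShr = Sext >> (32 - 8);                               // lshr by N-M
    // NewShAmt = min(N - M, M - 1) = min(24, 7) = 7 in this instance.
    uint32_t Folded = static_cast<uint8_t>(X >> 7);                 // zext(ashr X, 7)
    assert(LShr == Folded);
  }
  return 0;
}
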
unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits(); - if (ShAmt == BitWidth - 1 && - (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { + if (ShAmt == BitWidth - 1) { // lshr (sext i1 X to iN), N-1 --> zext X to iN if (SrcTyBitWidth == 1) return new ZExtInst(X, Ty); @@ -698,7 +698,13 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { } } - // TODO: Convert to ashr+zext if the shift equals the extension amount. + // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN + if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) { + // The new shift amount can't be more than the narrow source type. + unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1); + Value *AShr = Builder->CreateAShr(X, NewShAmt); + return new ZExtInst(AShr, Ty); + } } if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) { diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 7ff69b9eb7f42..f2806e278e6e1 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(LLVMInstrumentation Instrumentation.cpp InstrProfiling.cpp PGOInstrumentation.cpp + PGOMemOPSizeOpt.cpp SanitizerCoverage.cpp ThreadSanitizer.cpp EfficiencySanitizer.cpp diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 96027bc3d0a91..0d308810009d5 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -56,8 +56,6 @@ using namespace llvm; STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions."); STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites."); -STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); -STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); // Command line option to disable indirect-call promotion with the default as // false. This is for debug purpose. @@ -111,44 +109,6 @@ static cl::opt<bool> ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden, cl::desc("Dump IR after transformation happens")); -// The minimum call count to optimize memory intrinsic calls. -static cl::opt<unsigned> - MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, - cl::init(1000), - cl::desc("The minimum count to optimize memory " - "intrinsic calls")); - -// Command line option to disable memory intrinsic optimization. The default is -// false. This is for debug purpose. -static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), - cl::Hidden, cl::desc("Disable optimize")); - -// The percent threshold to optimize memory intrinsic calls. -static cl::opt<unsigned> - MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), - cl::Hidden, cl::ZeroOrMore, - cl::desc("The percentage threshold for the " - "memory intrinsic calls optimization")); - -// Maximum number of versions for optimizing memory intrinsic call. -static cl::opt<unsigned> - MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, - cl::ZeroOrMore, - cl::desc("The max version for the optimized memory " - " intrinsic calls")); - -// Scale the counts from the annotation using the BB count value. 
-static cl::opt<bool> - MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, - cl::desc("Scale the memop size counts using the basic " - " block count value")); - -// This option sets the rangge of precise profile memop sizes. -extern cl::opt<std::string> MemOPSizeRange; - -// This option sets the value that groups large memop sizes -extern cl::opt<unsigned> MemOPSizeLarge; - namespace { class PGOIndirectCallPromotionLegacyPass : public ModulePass { public: @@ -173,24 +133,6 @@ private: // the promoted direct call. bool SamplePGO; }; - -class PGOMemOPSizeOptLegacyPass : public FunctionPass { -public: - static char ID; - - PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { - initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "PGOMemOPSize"; } - -private: - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<BlockFrequencyInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } -}; } // end anonymous namespace char PGOIndirectCallPromotionLegacyPass::ID = 0; @@ -204,19 +146,6 @@ ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO, return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO); } -char PGOMemOPSizeOptLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) - -FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { - return new PGOMemOPSizeOptLegacyPass(); -} - namespace { // The class for main data structure to promote indirect calls to conditional // direct calls. @@ -749,285 +678,3 @@ PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, return PreservedAnalyses::none(); } - -namespace { -class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { -public: - MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI) - : Func(Func), BFI(BFI), Changed(false) { - ValueDataArray = - llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); - // Get the MemOPSize range information from option MemOPSizeRange, - getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, - PreciseRangeLast); - } - bool isChanged() const { return Changed; } - void perform() { - WorkList.clear(); - visit(Func); - - for (auto &MI : WorkList) { - ++NumOfPGOMemOPAnnotate; - if (perform(MI)) { - Changed = true; - ++NumOfPGOMemOPOpt; - DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName() - << "is Transformed.\n"); - } - } - } - - void visitMemIntrinsic(MemIntrinsic &MI) { - Value *Length = MI.getLength(); - // Not perform on constant length calls. - if (dyn_cast<ConstantInt>(Length)) - return; - WorkList.push_back(&MI); - } - -private: - Function &Func; - BlockFrequencyInfo &BFI; - bool Changed; - std::vector<MemIntrinsic *> WorkList; - // Start of the previse range. - int64_t PreciseRangeStart; - // Last value of the previse range. - int64_t PreciseRangeLast; - // The space to read the profile annotation. - std::unique_ptr<InstrProfValueData[]> ValueDataArray; - bool perform(MemIntrinsic *MI); - - // This kind shows which group the value falls in. For PreciseValue, we have - // the profile count for that value. LargeGroup groups the values that are in - // range [LargeValue, +inf). 
NonLargeGroup groups the rest of values. - enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; - - MemOPSizeKind getMemOPSizeKind(int64_t Value) const { - if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) - return LargeGroup; - if (Value == PreciseRangeLast + 1) - return NonLargeGroup; - return PreciseValue; - } -}; - -static const char *getMIName(const MemIntrinsic *MI) { - switch (MI->getIntrinsicID()) { - case Intrinsic::memcpy: - return "memcpy"; - case Intrinsic::memmove: - return "memmove"; - case Intrinsic::memset: - return "memset"; - default: - return "unknown"; - } -} - -static bool isProfitable(uint64_t Count, uint64_t TotalCount) { - assert(Count <= TotalCount); - if (Count < MemOPCountThreshold) - return false; - if (Count < TotalCount * MemOPPercentThreshold / 100) - return false; - return true; -} - -static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, - uint64_t Denom) { - if (!MemOPScaleCount) - return Count; - bool Overflowed; - uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); - return ScaleCount / Denom; -} - -bool MemOPSizeOpt::perform(MemIntrinsic *MI) { - assert(MI); - if (MI->getIntrinsicID() == Intrinsic::memmove) - return false; - - uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; - uint64_t TotalCount; - if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, - ValueDataArray.get(), NumVals, TotalCount)) - return false; - - uint64_t ActualCount = TotalCount; - uint64_t SavedTotalCount = TotalCount; - if (MemOPScaleCount) { - auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); - if (!BBEdgeCount) - return false; - ActualCount = *BBEdgeCount; - } - - ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); - DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount - << "\n"); - DEBUG( - for (auto &VD - : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); - - if (ActualCount < MemOPCountThreshold) - return false; - // Skip if the total value profiled count is 0, in which case we can't - // scale up the counts properly (and there is no profitable transformation). - if (TotalCount == 0) - return false; - - TotalCount = ActualCount; - if (MemOPScaleCount) - DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount - << " denominator = " << SavedTotalCount << "\n"); - - // Keeping track of the count of the default case: - uint64_t RemainCount = TotalCount; - SmallVector<uint64_t, 16> SizeIds; - SmallVector<uint64_t, 16> CaseCounts; - uint64_t MaxCount = 0; - unsigned Version = 0; - // Default case is in the front -- save the slot here. - CaseCounts.push_back(0); - for (auto &VD : VDs) { - int64_t V = VD.Value; - uint64_t C = VD.Count; - if (MemOPScaleCount) - C = getScaledCount(C, ActualCount, SavedTotalCount); - - // Only care precise value here. - if (getMemOPSizeKind(V) != PreciseValue) - continue; - - // ValueCounts are sorted on the count. Break at the first un-profitable - // value. 
- if (!isProfitable(C, RemainCount)) - break; - - SizeIds.push_back(V); - CaseCounts.push_back(C); - if (C > MaxCount) - MaxCount = C; - - assert(RemainCount >= C); - RemainCount -= C; - - if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) - break; - } - - if (Version == 0) - return false; - - CaseCounts[0] = RemainCount; - if (RemainCount > MaxCount) - MaxCount = RemainCount; - - uint64_t SumForOpt = TotalCount - RemainCount; - - DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version - << " Versions (covering " << SumForOpt << " out of " - << TotalCount << ")\n"); - - // mem_op(..., size) - // ==> - // switch (size) { - // case s1: - // mem_op(..., s1); - // goto merge_bb; - // case s2: - // mem_op(..., s2); - // goto merge_bb; - // ... - // default: - // mem_op(..., size); - // goto merge_bb; - // } - // merge_bb: - - BasicBlock *BB = MI->getParent(); - DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); - DEBUG(dbgs() << *BB << "\n"); - auto OrigBBFreq = BFI.getBlockFreq(BB); - - BasicBlock *DefaultBB = SplitBlock(BB, MI); - BasicBlock::iterator It(*MI); - ++It; - assert(It != DefaultBB->end()); - BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); - MergeBB->setName("MemOP.Merge"); - BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); - DefaultBB->setName("MemOP.Default"); - - auto &Ctx = Func.getContext(); - IRBuilder<> IRB(BB); - BB->getTerminator()->eraseFromParent(); - Value *SizeVar = MI->getLength(); - SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); - - // Clear the value profile data. - MI->setMetadata(LLVMContext::MD_prof, nullptr); - - DEBUG(dbgs() << "\n\n== Basic Block After==\n"); - - for (uint64_t SizeId : SizeIds) { - ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); - BasicBlock *CaseBB = BasicBlock::Create( - Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); - Instruction *NewInst = MI->clone(); - // Fix the argument. 
- dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId); - CaseBB->getInstList().push_back(NewInst); - IRBuilder<> IRBCase(CaseBB); - IRBCase.CreateBr(MergeBB); - SI->addCase(CaseSizeId, CaseBB); - DEBUG(dbgs() << *CaseBB << "\n"); - } - setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); - - DEBUG(dbgs() << *BB << "\n"); - DEBUG(dbgs() << *DefaultBB << "\n"); - DEBUG(dbgs() << *MergeBB << "\n"); - - emitOptimizationRemark(Func.getContext(), "memop-opt", Func, - MI->getDebugLoc(), - Twine("optimize ") + getMIName(MI) + " with count " + - Twine(SumForOpt) + " out of " + Twine(TotalCount) + - " for " + Twine(Version) + " versions"); - - return true; -} -} // namespace - -static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) { - if (DisableMemOPOPT) - return false; - - if (F.hasFnAttribute(Attribute::OptimizeForSize)) - return false; - MemOPSizeOpt MemOPSizeOpt(F, BFI); - MemOPSizeOpt.perform(); - return MemOPSizeOpt.isChanged(); -} - -bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { - BlockFrequencyInfo &BFI = - getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); - return PGOMemOPSizeOptImpl(F, BFI); -} - -namespace llvm { -char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; - -PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, - FunctionAnalysisManager &FAM) { - auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); - bool Changed = PGOMemOPSizeOptImpl(F, BFI); - if (!Changed) - return PreservedAnalyses::all(); - auto PA = PreservedAnalyses(); - PA.preserve<GlobalsAA>(); - return PA; -} -} // namespace llvm diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index f83c930ca61be..37f88d5f95f18 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -343,14 +343,24 @@ static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { static inline bool shouldRecordFunctionAddr(Function *F) { // Check the linkage + bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage(); if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && - !F->hasAvailableExternallyLinkage()) + !HasAvailableExternallyLinkage) return true; + + // A function marked 'alwaysinline' with available_externally linkage can't + // have its address taken. Doing so would create an undefined external ref to + // the function, which would fail to link. + if (HasAvailableExternallyLinkage && + F->hasFnAttribute(Attribute::AlwaysInline)) + return false; + // Prohibit function address recording if the function is both internal and // COMDAT. This avoids the profile data variable referencing internal symbols // in COMDAT. if (F->hasLocalLinkage() && F->hasComdat()) return false; + // Check uses of this function for other than direct calls or invokes to it. // Inline virtual functions have linkeOnceODR linkage. When a key method // exists, the vtable will only be emitted in the TU where the key method diff --git a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp new file mode 100644 index 0000000000000..0bc9ddfbe4d33 --- /dev/null +++ b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -0,0 +1,419 @@ +//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+                        cl::init(1000),
+                        cl::desc("The minimum count to optimize memory "
+                                 "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+                                     cl::Hidden, cl::desc("Disable optimize"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+                          cl::Hidden, cl::ZeroOrMore,
+                          cl::desc("The percentage threshold for the "
+                                   "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The max version for the optimized memory "
+                             "intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             "block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                      "Optimize memory intrinsic using its size value profile",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                    "Optimize memory intrinsic using its size value profile",
+                    false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+  return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+      : Func(Func), BFI(BFI), Changed(false) {
+    ValueDataArray =
+        llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from option MemOPSizeRange.
+    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+                                PreciseRangeLast);
+  }
+  bool isChanged() const { return Changed; }
+  void perform() {
+    WorkList.clear();
+    visit(Func);
+
+    for (auto &MI : WorkList) {
+      ++NumOfPGOMemOPAnnotate;
+      if (perform(MI)) {
+        Changed = true;
+        ++NumOfPGOMemOPOpt;
+        DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
+                     << " is transformed.\n");
+      }
+    }
+  }
+
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Don't perform this on constant-length calls.
+    if (dyn_cast<ConstantInt>(Length))
+      return;
+    WorkList.push_back(&MI);
+  }
+
+private:
+  Function &Func;
+  BlockFrequencyInfo &BFI;
+  bool Changed;
+  std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+  int64_t PreciseRangeLast;
+  // The space to read the profile annotation.
+  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+  bool perform(MemIntrinsic *MI);
+
+  // This kind shows which group the value falls in. For PreciseValue, we have
+  // the profile count for that value. LargeGroup groups the values that are in
+  // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
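
Before the enum declaration that follows, a toy rendering of this three-way classification may help (plain C++, not LLVM code). The numbers in main() are assumptions for illustration, a precise range ending at 8 and a large-size sentinel of 4096, not values taken from the pass:

#include <cassert>
#include <cstdint>

enum Kind { Precise, NonLarge, Large };

Kind classify(int64_t V, int64_t PreciseRangeLast, int64_t SizeLarge) {
  if (SizeLarge != 0 && V == SizeLarge)
    return Large;              // stands for all sizes >= SizeLarge
  if (V == PreciseRangeLast + 1)
    return NonLarge;           // everything between the two groups
  return Precise;              // an exact, individually profiled size
}

int main() {
  assert(classify(5, 8, 4096) == Precise);
  assert(classify(9, 8, 4096) == NonLarge);
  assert(classify(4096, 8, 4096) == Large);
  return 0;
}
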
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; + + MemOPSizeKind getMemOPSizeKind(int64_t Value) const { + if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) + return LargeGroup; + if (Value == PreciseRangeLast + 1) + return NonLargeGroup; + return PreciseValue; + } +}; + +static const char *getMIName(const MemIntrinsic *MI) { + switch (MI->getIntrinsicID()) { + case Intrinsic::memcpy: + return "memcpy"; + case Intrinsic::memmove: + return "memmove"; + case Intrinsic::memset: + return "memset"; + default: + return "unknown"; + } +} + +static bool isProfitable(uint64_t Count, uint64_t TotalCount) { + assert(Count <= TotalCount); + if (Count < MemOPCountThreshold) + return false; + if (Count < TotalCount * MemOPPercentThreshold / 100) + return false; + return true; +} + +static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, + uint64_t Denom) { + if (!MemOPScaleCount) + return Count; + bool Overflowed; + uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); + return ScaleCount / Denom; +} + +bool MemOPSizeOpt::perform(MemIntrinsic *MI) { + assert(MI); + if (MI->getIntrinsicID() == Intrinsic::memmove) + return false; + + uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; + uint64_t TotalCount; + if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, + ValueDataArray.get(), NumVals, TotalCount)) + return false; + + uint64_t ActualCount = TotalCount; + uint64_t SavedTotalCount = TotalCount; + if (MemOPScaleCount) { + auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); + if (!BBEdgeCount) + return false; + ActualCount = *BBEdgeCount; + } + + ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); + DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount + << "\n"); + DEBUG( + for (auto &VD + : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); + + if (ActualCount < MemOPCountThreshold) + return false; + // Skip if the total value profiled count is 0, in which case we can't + // scale up the counts properly (and there is no profitable transformation). + if (TotalCount == 0) + return false; + + TotalCount = ActualCount; + if (MemOPScaleCount) + DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount + << " denominator = " << SavedTotalCount << "\n"); + + // Keeping track of the count of the default case: + uint64_t RemainCount = TotalCount; + uint64_t SavedRemainCount = SavedTotalCount; + SmallVector<uint64_t, 16> SizeIds; + SmallVector<uint64_t, 16> CaseCounts; + uint64_t MaxCount = 0; + unsigned Version = 0; + // Default case is in the front -- save the slot here. + CaseCounts.push_back(0); + for (auto &VD : VDs) { + int64_t V = VD.Value; + uint64_t C = VD.Count; + if (MemOPScaleCount) + C = getScaledCount(C, ActualCount, SavedTotalCount); + + // Only care precise value here. + if (getMemOPSizeKind(V) != PreciseValue) + continue; + + // ValueCounts are sorted on the count. Break at the first un-profitable + // value. 
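
To make the cutoff concrete before the loop continues below: with the default thresholds (pgo-memop-count-threshold = 1000 and pgo-memop-percent-threshold = 40), isProfitable() behaves as in this small worked example. Plain C++ and illustrative only; note the real check compares each candidate against the remaining total, not the original one:

#include <cassert>
#include <cstdint>

bool profitable(uint64_t Count, uint64_t Total) {
  return Count >= 1000 && Count >= Total * 40 / 100;
}

int main() {
  assert(profitable(5000, 10000));  // 50% of a hot site: worth a version
  assert(!profitable(900, 1000));   // high fraction, but under the count floor
  assert(!profitable(3000, 10000)); // 30% < 40%: left to the default case
  return 0;
}
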
+ if (!isProfitable(C, RemainCount)) + break; + + SizeIds.push_back(V); + CaseCounts.push_back(C); + if (C > MaxCount) + MaxCount = C; + + assert(RemainCount >= C); + RemainCount -= C; + assert(SavedRemainCount >= VD.Count); + SavedRemainCount -= VD.Count; + + if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) + break; + } + + if (Version == 0) + return false; + + CaseCounts[0] = RemainCount; + if (RemainCount > MaxCount) + MaxCount = RemainCount; + + uint64_t SumForOpt = TotalCount - RemainCount; + + DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version + << " Versions (covering " << SumForOpt << " out of " + << TotalCount << ")\n"); + + // mem_op(..., size) + // ==> + // switch (size) { + // case s1: + // mem_op(..., s1); + // goto merge_bb; + // case s2: + // mem_op(..., s2); + // goto merge_bb; + // ... + // default: + // mem_op(..., size); + // goto merge_bb; + // } + // merge_bb: + + BasicBlock *BB = MI->getParent(); + DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); + DEBUG(dbgs() << *BB << "\n"); + auto OrigBBFreq = BFI.getBlockFreq(BB); + + BasicBlock *DefaultBB = SplitBlock(BB, MI); + BasicBlock::iterator It(*MI); + ++It; + assert(It != DefaultBB->end()); + BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); + MergeBB->setName("MemOP.Merge"); + BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); + DefaultBB->setName("MemOP.Default"); + + auto &Ctx = Func.getContext(); + IRBuilder<> IRB(BB); + BB->getTerminator()->eraseFromParent(); + Value *SizeVar = MI->getLength(); + SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); + + // Clear the value profile data. + MI->setMetadata(LLVMContext::MD_prof, nullptr); + // If all promoted, we don't need the MD.prof metadata. + if (SavedRemainCount > 0 || Version != NumVals) + // Otherwise we need update with the un-promoted records back. + annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), + SavedRemainCount, IPVK_MemOPSize, NumVals); + + DEBUG(dbgs() << "\n\n== Basic Block After==\n"); + + for (uint64_t SizeId : SizeIds) { + ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); + BasicBlock *CaseBB = BasicBlock::Create( + Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); + Instruction *NewInst = MI->clone(); + // Fix the argument. 
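
Stepping out of the loop for a moment: at the C level, the rewrite diagrammed above amounts to the sketch below. Sizes 8 and 32 are made-up stand-ins for whatever the profile says are the hottest lengths:

#include <cstring>

void memcpy_versioned(void *Dst, const void *Src, unsigned long Size) {
  switch (Size) {
  case 8:
    std::memcpy(Dst, Src, 8);     // constant size: expandable inline
    break;
  case 32:
    std::memcpy(Dst, Src, 32);    // another profiled-hot size
    break;
  default:
    std::memcpy(Dst, Src, Size);  // the original, variable-size operation
    break;
  }
}
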
+ dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId); + CaseBB->getInstList().push_back(NewInst); + IRBuilder<> IRBCase(CaseBB); + IRBCase.CreateBr(MergeBB); + SI->addCase(CaseSizeId, CaseBB); + DEBUG(dbgs() << *CaseBB << "\n"); + } + setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); + + DEBUG(dbgs() << *BB << "\n"); + DEBUG(dbgs() << *DefaultBB << "\n"); + DEBUG(dbgs() << *MergeBB << "\n"); + + emitOptimizationRemark(Func.getContext(), "memop-opt", Func, + MI->getDebugLoc(), + Twine("optimize ") + getMIName(MI) + " with count " + + Twine(SumForOpt) + " out of " + Twine(TotalCount) + + " for " + Twine(Version) + " versions"); + + return true; +} +} // namespace + +static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) { + if (DisableMemOPOPT) + return false; + + if (F.hasFnAttribute(Attribute::OptimizeForSize)) + return false; + MemOPSizeOpt MemOPSizeOpt(F, BFI); + MemOPSizeOpt.perform(); + return MemOPSizeOpt.isChanged(); +} + +bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { + BlockFrequencyInfo &BFI = + getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); + return PGOMemOPSizeOptImpl(F, BFI); +} + +namespace llvm { +char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; + +PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, + FunctionAnalysisManager &FAM) { + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + bool Changed = PGOMemOPSizeOptImpl(F, BFI); + if (!Changed) + return PreservedAnalyses::all(); + auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; +} +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 8aa40d1759de6..e3c36c98ab0db 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -61,7 +61,7 @@ static const char *const SanCov8bitCountersInitName = "__sanitizer_cov_8bit_counters_init"; static const char *const SanCovGuardsSectionName = "sancov_guards"; -static const char *const SanCovCountersSectionName = "sancov_counters"; +static const char *const SanCovCountersSectionName = "sancov_cntrs"; static cl::opt<int> ClCoverageLevel( "sanitizer-coverage-level", diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 7b625b9b136ec..2a4c9526dfcd9 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -551,7 +551,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { BBChanged = true; } } - }; + } FnChanged |= BBChanged; } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index c4f450949e6de..0f92760a874b5 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -506,7 +507,7 @@ private: if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) { // Optimize MemoryPhi nodes that may become redundant by having all the // same input values once MA is removed. 
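
The EarlyCSE hunk below swaps SmallVector for SmallSetVector so that each MemoryPhi is queued for the redundancy check at most once. A minimal standalone sketch of the container's semantics (assumes LLVM's ADT headers are available):

#include "llvm/ADT/SetVector.h"
#include <cassert>

int main() {
  llvm::SmallSetVector<int, 4> Phis;
  Phis.insert(1);
  Phis.insert(2);
  Phis.insert(1);                        // duplicate: ignored, unlike push_back
  assert(Phis.size() == 2);
  assert(Phis[0] == 1 && Phis[1] == 2);  // insertion order is preserved
  return 0;
}
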
- SmallVector<MemoryPhi *, 4> PhisToCheck; + SmallSetVector<MemoryPhi *, 4> PhisToCheck; SmallVector<MemoryAccess *, 8> WorkQueue; WorkQueue.push_back(MA); // Process MemoryPhi nodes in FIFO order using a ever-growing vector since @@ -517,7 +518,7 @@ private: for (auto *U : WI->users()) if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U)) - PhisToCheck.push_back(MP); + PhisToCheck.insert(MP); MSSAUpdater->removeMemoryAccess(WI); diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index 8634816e702fb..5fd2dfc118b4b 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -64,6 +64,17 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); +namespace llvm { +namespace GVNExpression { + +LLVM_DUMP_METHOD void Expression::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + +} +} + namespace { static bool isMemoryInst(const Instruction *I) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index b706152f30c81..8b435050ac769 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -983,21 +983,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); + if (StoreSize != 1) + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), + SCEV::FlagNUW); + + Value *NumBytes = + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must neccessarily be unordered // by previous checks. - if (!SI->isAtomic() && !LI->isAtomic()) { - if (StoreSize != 1) - NumBytesS = SE->getMulExpr( - NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); - - Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); - + if (!SI->isAtomic() && !LI->isAtomic()) NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); - } else { + else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. if (Align < StoreSize) @@ -1010,11 +1010,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) return false; - Value *NumElements = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + NewCall = Builder.CreateElementUnorderedAtomicMemCpy( + StoreBasePtr, LoadBasePtr, NumBytes, StoreSize); - NewCall = Builder.CreateElementAtomicMemCpy(StoreBasePtr, LoadBasePtr, - NumElements, StoreSize); // Propagate alignment info onto the pointer args. Note that unordered // atomic loads/stores are *required* by the spec to have an alignment // but non-atomic loads/stores may not. diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 6926aae37963f..cbbd55512c9f5 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -2195,7 +2195,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // For a given expression, mark the phi of ops instructions that could have // changed as a result. 
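
Stepping back to the LoopIdiomRecognize hunk above: the byte count fed to both memcpy forms is (backedge-taken count + 1) * store size, and the change hoists that computation so the plain and element-atomic paths share one NumBytes value. A small worked example in plain C++, ignoring the SCEV machinery and the no-wrap flags:

#include <cassert>
#include <cstdint>

uint64_t numBytes(uint64_t BECount, uint64_t StoreSize) {
  uint64_t TripCount = BECount + 1;   // trip count = backedge count + 1
  return StoreSize != 1 ? TripCount * StoreSize : TripCount;
}

int main() {
  // A loop that copies 100 i32 elements has a backedge-taken count of 99.
  assert(numBytes(99, 4) == 400);
  return 0;
}
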
void NewGVN::markPhiOfOpsChanged(const Expression *E) { - touchAndErase(ExpressionToPhiOfOps, E); + touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E)); } // Perform congruence finding on a given value numbering expression. @@ -3561,7 +3561,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in // the block and subtract the pred count, but it's more complicated. if (ReachablePredCount.lookup(BB) != - std::distance(pred_begin(BB), pred_end(BB))) { + unsigned(std::distance(pred_begin(BB), pred_end(BB)))) { for (auto II = BB->begin(); isa<PHINode>(II); ++II) { auto &PHI = cast<PHINode>(*II); ReplaceUnreachablePHIArgs(PHI, BB); diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index bae7911d222c9..a52739bb76f71 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidAttributes asserts that shouldRewriteStatepointsIn + // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. - stripNonValidAttributes(M); + stripNonValidAttributesAndMetadata(M); } return Changed; @@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass { AU.addRequired<TargetTransformInfoWrapperPass>(); } - /// The IR fed into RewriteStatepointsForGC may have had attributes implying - /// dereferenceability that are no longer valid/correct after - /// RewriteStatepointsForGC has run. This is because semantically, after + /// The IR fed into RewriteStatepointsForGC may have had attributes and + /// metadata implying dereferenceability that are no longer valid/correct after + /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributes (conservatively) restores correctness - /// by erasing all attributes in the module that externally imply - /// dereferenceability. - /// Similar reasoning also applies to the noalias attributes. gc.statepoint - /// can touch the entire heap including noalias objects. - void stripNonValidAttributes(Module &M); - - // Helpers for stripNonValidAttributes - void stripNonValidAttributesFromBody(Function &F); + /// heap. stripNonValidAttributesAndMetadata (conservatively) restores + /// correctness by erasing all attributes in the module that externally imply + /// dereferenceability. Similar reasoning also applies to the noalias + /// attributes and metadata. gc.statepoint can touch the entire heap including + /// noalias objects. + void stripNonValidAttributesAndMetadata(Module &M); + + // Helpers for stripNonValidAttributesAndMetadata + void stripNonValidAttributesAndMetadataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); + // Certain metadata on instructions are invalid after running RS4GC. + // Optimizations that run after RS4GC can incorrectly use this metadata to + // optimize functions. We drop such metadata on the instruction. 
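
The RewriteStatepointsForGC change above is built on a whitelist pattern: drop every metadata kind that is not explicitly known to stay valid. A standalone usage sketch of that pattern (assumes LLVM headers; the two retained kinds here are arbitrary examples, not the pass's actual list):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

// Keep only a fixed set of metadata kinds on an instruction; everything else
// except debug info is removed.
void keepOnlySafeMetadata(llvm::Instruction &I) {
  unsigned KnownIDs[] = {llvm::LLVMContext::MD_tbaa,
                         llvm::LLVMContext::MD_range};
  I.dropUnknownNonDebugMetadata(KnownIDs);
}
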
+ void stripInvalidMetadataFromInstruction(Instruction &I); }; } // namespace @@ -2306,13 +2310,44 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); } -void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { +void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) { + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return; + // These are the attributes that are still valid on loads and stores after + // RS4GC. + // The metadata implying dereferenceability and noalias are (conservatively) + // dropped. This is because semantically, after RewriteStatepointsForGC runs, + // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can + // touch the entire heap including noalias objects. Note: The reasoning is + // same as stripping the dereferenceability and noalias attributes that are + // analogous to the metadata counterparts. + // We also drop the invariant.load metadata on the load because that metadata + // implies the address operand to the load points to memory that is never + // changed once it became dereferenceable. This is no longer true after RS4GC. + // Similar reasoning applies to invariant.group metadata, which applies to + // loads within a group. + unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_alias_scope, + LLVMContext::MD_nontemporal, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_type}; + + // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC. + I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); + +} + +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); @@ -2333,6 +2368,8 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); } + stripInvalidMetadataFromInstruction(I); + if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) @@ -2357,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { #ifndef NDEBUG assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2366,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesFromBody(F); + stripNonValidAttributesAndMetadataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 815492ac354c0..c6929c33b3e9e 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -515,10 +515,6 @@ private: void visitCmpInst(CmpInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); - void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); } - void visitFuncletPadInst(FuncletPadInst &FPI) { - markOverdefined(&FPI); - } void visitCatchSwitchInst(CatchSwitchInst 
&CPI) { markOverdefined(&CPI); visitTerminatorInst(CPI); @@ -539,13 +535,6 @@ private: void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { - markOverdefined(&I); - } - void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } - void visitAllocaInst (Instruction &I) { markOverdefined(&I); } - void visitVAArgInst (Instruction &I) { markOverdefined(&I); } - void visitInstruction(Instruction &I) { // If a new instruction is added to LLVM that we don't handle. DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 24d28a6c28311..5d57ed9718fb6 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -142,8 +142,139 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { return false; } -void CodeExtractor::findAllocas(ValueSet &SinkCands) const { +static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { + BasicBlock *CommonExitBlock = nullptr; + auto hasNonCommonExitSucc = [&](BasicBlock *Block) { + for (auto *Succ : successors(Block)) { + // Internal edges, ok. + if (Blocks.count(Succ)) + continue; + if (!CommonExitBlock) { + CommonExitBlock = Succ; + continue; + } + if (CommonExitBlock == Succ) + continue; + + return true; + } + return false; + }; + + if (any_of(Blocks, hasNonCommonExitSucc)) + return nullptr; + + return CommonExitBlock; +} + +bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( + Instruction *Addr) const { + AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + for (Instruction &II : BB) { + + if (isa<DbgInfoIntrinsic>(II)) + continue; + + unsigned Opcode = II.getOpcode(); + Value *MemAddr = nullptr; + switch (Opcode) { + case Instruction::Store: + case Instruction::Load: { + if (Opcode == Instruction::Store) { + StoreInst *SI = cast<StoreInst>(&II); + MemAddr = SI->getPointerOperand(); + } else { + LoadInst *LI = cast<LoadInst>(&II); + MemAddr = LI->getPointerOperand(); + } + // Global variable can not be aliased with locals. + if (dyn_cast<Constant>(MemAddr)) + break; + Value *Base = MemAddr->stripInBoundsConstantOffsets(); + if (!dyn_cast<AllocaInst>(Base) || Base == AI) + return false; + break; + } + default: { + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); + if (IntrInst) { + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || + IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + break; + return false; + } + // Treat all the other cases conservatively if it has side effects. 
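
The scan in isLegalToShrinkwrapLifetimeMarkers visits every instruction outside the extraction region and bails on anything that might observe the alloca. A toy model of that conservative walk, with the instruction categories reduced to the three cases this function distinguishes (all types and fields here are invented for illustration):

#include <vector>

enum class Kind { LifetimeMarker, MemOp, Other };
struct Inst {
  Kind K;
  bool TouchesAlloca;    // for MemOp: may this access alias the alloca?
  bool HasSideEffects;   // for Other: conservative catch-all
};

bool legalToShrinkwrap(const std::vector<Inst> &OutsideRegion) {
  for (const Inst &I : OutsideRegion) {
    switch (I.K) {
    case Kind::LifetimeMarker:
      break;                        // markers themselves are fine
    case Kind::MemOp:
      if (I.TouchesAlloca)
        return false;               // the alloca escapes the region
      break;
    case Kind::Other:
      if (I.HasSideEffects)
        return false;               // anything else: treat conservatively
      break;
    }
  }
  return true;
}
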
+ if (II.mayHaveSideEffects()) + return false; + } + } + } + } + + return true; +} + +BasicBlock * +CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { + BasicBlock *SinglePredFromOutlineRegion = nullptr; + assert(!Blocks.count(CommonExitBlock) && + "Expect a block outside the region!"); + for (auto *Pred : predecessors(CommonExitBlock)) { + if (!Blocks.count(Pred)) + continue; + if (!SinglePredFromOutlineRegion) { + SinglePredFromOutlineRegion = Pred; + } else if (SinglePredFromOutlineRegion != Pred) { + SinglePredFromOutlineRegion = nullptr; + break; + } + } + + if (SinglePredFromOutlineRegion) + return SinglePredFromOutlineRegion; + +#ifndef NDEBUG + auto getFirstPHI = [](BasicBlock *BB) { + BasicBlock::iterator I = BB->begin(); + PHINode *FirstPhi = nullptr; + while (I != BB->end()) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + break; + if (!FirstPhi) { + FirstPhi = Phi; + break; + } + } + return FirstPhi; + }; + // If there are any phi nodes, the single pred either exists or has already + // be created before code extraction. + assert(!getFirstPHI(CommonExitBlock) && "Phi not expected"); +#endif + + BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock( + CommonExitBlock->getFirstNonPHI()->getIterator()); + + for (auto *Pred : predecessors(CommonExitBlock)) { + if (Blocks.count(Pred)) + continue; + Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock); + } + // Now add the old exit block to the outline region. + Blocks.insert(CommonExitBlock); + return CommonExitBlock; +} + +void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, + BasicBlock *&ExitBlock) const { Function *Func = (*Blocks.begin())->getParent(); + ExitBlock = getCommonExitBlock(Blocks); + for (BasicBlock &BB : *Func) { if (Blocks.count(&BB)) continue; @@ -152,49 +283,96 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands) const { if (!AI) continue; - // Returns true if matching life time markers are found within - // the outlined region. - auto GetLifeTimeMarkers = [&](Instruction *Addr) { + // Find the pair of life time markers for address 'Addr' that are either + // defined inside the outline region or can legally be shrinkwrapped into + // the outline region. If there are not other untracked uses of the + // address, return the pair of markers if found; otherwise return a pair + // of nullptr. + auto GetLifeTimeMarkers = + [&](Instruction *Addr, bool &SinkLifeStart, + bool &HoistLifeEnd) -> std::pair<Instruction *, Instruction *> { Instruction *LifeStart = nullptr, *LifeEnd = nullptr; - for (User *U : Addr->users()) { - if (!definedInRegion(Blocks, U)) - return false; + for (User *U : Addr->users()) { IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U); if (IntrInst) { - if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) { + // Do not handle the case where AI has multiple start markers. + if (LifeStart) + return std::make_pair<Instruction *>(nullptr, nullptr); LifeStart = IntrInst; - if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + } + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) { + if (LifeEnd) + return std::make_pair<Instruction *>(nullptr, nullptr); LifeEnd = IntrInst; + } + continue; } + // Find untracked uses of the address, bail. 
+ if (!definedInRegion(Blocks, U)) + return std::make_pair<Instruction *>(nullptr, nullptr); } - return LifeStart && LifeEnd; + + if (!LifeStart || !LifeEnd) + return std::make_pair<Instruction *>(nullptr, nullptr); + + SinkLifeStart = !definedInRegion(Blocks, LifeStart); + HoistLifeEnd = !definedInRegion(Blocks, LifeEnd); + // Do legality Check. + if ((SinkLifeStart || HoistLifeEnd) && + !isLegalToShrinkwrapLifetimeMarkers(Addr)) + return std::make_pair<Instruction *>(nullptr, nullptr); + + // Check to see if we have a place to do hoisting, if not, bail. + if (HoistLifeEnd && !ExitBlock) + return std::make_pair<Instruction *>(nullptr, nullptr); + + return std::make_pair(LifeStart, LifeEnd); }; - if (GetLifeTimeMarkers(AI)) { + bool SinkLifeStart = false, HoistLifeEnd = false; + auto Markers = GetLifeTimeMarkers(AI, SinkLifeStart, HoistLifeEnd); + + if (Markers.first) { + if (SinkLifeStart) + SinkCands.insert(Markers.first); SinkCands.insert(AI); + if (HoistLifeEnd) + HoistCands.insert(Markers.second); continue; } - // Follow the bitcast: + // Follow the bitcast. Instruction *MarkerAddr = nullptr; for (User *U : AI->users()) { - if (U->stripPointerCasts() == AI) { + + if (U->stripInBoundsConstantOffsets() == AI) { + SinkLifeStart = false; + HoistLifeEnd = false; Instruction *Bitcast = cast<Instruction>(U); - if (GetLifeTimeMarkers(Bitcast)) { + Markers = GetLifeTimeMarkers(Bitcast, SinkLifeStart, HoistLifeEnd); + if (Markers.first) { MarkerAddr = Bitcast; continue; } } + + // Found unknown use of AI. if (!definedInRegion(Blocks, U)) { MarkerAddr = nullptr; break; } } + if (MarkerAddr) { + if (SinkLifeStart) + SinkCands.insert(Markers.first); if (!definedInRegion(Blocks, MarkerAddr)) SinkCands.insert(MarkerAddr); SinkCands.insert(AI); + if (HoistLifeEnd) + HoistCands.insert(Markers.second); } } } @@ -780,7 +958,8 @@ Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) return nullptr; - ValueSet inputs, outputs, SinkingCands; + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; // Assumption: this is a single-entry code region, and the header is the first // block in the region. @@ -819,7 +998,8 @@ Function *CodeExtractor::extractCodeRegion() { "newFuncRoot"); newFuncRoot->getInstList().push_back(BranchInst::Create(header)); - findAllocas(SinkingCands); + findAllocas(SinkingCands, HoistingCands, CommonExit); + assert(HoistingCands.empty() || CommonExit); // Find inputs to, outputs from the code region. findInputsOutputs(inputs, outputs, SinkingCands); @@ -829,6 +1009,13 @@ Function *CodeExtractor::extractCodeRegion() { cast<Instruction>(II)->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt()); + if (!HoistingCands.empty()) { + auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit); + Instruction *TI = HoistToBlock->getTerminator(); + for (auto *II : HoistingCands) + cast<Instruction>(II)->moveBefore(TI); + } + // Calculate the exit blocks for the extracted region and the total exit // weights for each of those blocks. 
DenseMap<BasicBlock *, BlockFrequency> ExitWeights; diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 9e71cba4f1b7a..1260e35e934de 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -460,6 +460,9 @@ void PredicateInfo::buildPredicateInfo() { if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) { if (!BI->isConditional()) continue; + // Can't insert conditional information if they all go to the same place. + if (BI->getSuccessor(0) == BI->getSuccessor(1)) + continue; processBranch(BI, BranchBB, OpsToRename); } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) { processSwitch(SI, BranchBB, OpsToRename); diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 02a5d3dbeadfb..faa14046b1e3c 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -352,7 +352,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { return false; typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)( - const SCEV *, const SCEV *, SCEV::NoWrapFlags); + const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)( const SCEV *, Type *); @@ -406,10 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); const SCEV *A = - (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy); + (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u), + WideTy); const SCEV *B = (SE->*Operation)((SE->*Extension)(LHS, WideTy), - (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap); + (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u); if (A != B) return false; @@ -530,8 +531,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, return false; const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *, - SCEV::NoWrapFlags); - + SCEV::NoWrapFlags, unsigned); switch (BO->getOpcode()) { default: return false; @@ -560,7 +560,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy); const SCEV *OpAfterExtend = (SE->*GetExprForBO)( SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy), - SCEV::FlagAnyWrap); + SCEV::FlagAnyWrap, 0u); if (ExtendAfterOp == OpAfterExtend) { BO->setHasNoUnsignedWrap(); SE->forgetValue(BO); @@ -572,7 +572,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy); const SCEV *OpAfterExtend = (SE->*GetExprForBO)( SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy), - SCEV::FlagAnyWrap); + SCEV::FlagAnyWrap, 0u); if (ExtendAfterOp == OpAfterExtend) { BO->setHasNoSignedWrap(); SE->forgetValue(BO); |
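
One change in the SimplifyIndVar hunks above is easy to miss: the pointer-to-member typedefs for ScalarEvolution's expression builders gained a trailing unsigned parameter (a recursion depth), which is why every indirect call now passes an extra 0u. A minimal standalone analogue of that signature change (plain C++; the struct and names are invented):

#include <cassert>

struct SE {
  int add(int A, int B, int Flags, unsigned Depth) {
    (void)Flags; (void)Depth;   // unused in this toy version
    return A + B;
  }
};

// The member-pointer type must spell out the new parameter...
typedef int (SE::*OperationFn)(int, int, int, unsigned);

int main() {
  SE S;
  OperationFn Op = &SE::add;
  // ...and every indirect call site must supply it, e.g. as 0u.
  assert((S.*Op)(2, 3, 0, 0u) == 5);
  return 0;
}
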