Diffstat (limited to 'lib/Transforms')
39 files changed, 1845 insertions, 800 deletions
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74..4480220f2cd4 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
   // consume. Note, that crossing coro.save also requires a spill, as any code
   // between coro.save and coro.suspend may resume the coroutine and all of the
   // state needs to be saved by that time.
-  auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+  auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
     BasicBlock *SuspendBlock = BarrierInst->getParent();
     auto &B = getBlockData(SuspendBlock);
     B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
   return FramePtr;
 }
 
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+  if (auto *II = dyn_cast<InvokeInst>(TI))
+    II->setUnwindDest(Succ);
+  else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+    CS->setUnwindDest(Succ);
+  else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+    CR->setUnwindDest(Succ);
+  else
+    llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+                           BasicBlock *NewPred,
+                           PHINode *LandingPadReplacement) {
+  unsigned BBIdx = 0;
+  for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    // We manually update the LandingPadReplacement PHINode and it is the last
+    // PHI Node. So, if we find it, we are done.
+    if (LandingPadReplacement == PN)
+      break;
+
+    // Reuse the previous value of BBIdx if it lines up. In cases where we
+    // have multiple phi nodes with *lots* of predecessors, this is a speed
+    // win because we don't have to scan the PHI looking for TIBB. This
+    // happens because the BB list of PHI nodes are usually in the same
+    // order.
+    if (PN->getIncomingBlock(BBIdx) != OldPred)
+      BBIdx = PN->getBasicBlockIndex(OldPred);
+
+    assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+    PN->setIncomingBlock(BBIdx, NewPred);
+  }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case do EH
+// specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+                                    LandingPadInst *OriginalPad,
+                                    PHINode *LandingPadReplacement) {
+  auto *PadInst = Succ->getFirstNonPHI();
+  if (!LandingPadReplacement && !PadInst->isEHPad())
+    return SplitEdge(BB, Succ);
+
+  auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+  setUnwindEdgeTo(BB->getTerminator(), NewBB);
+  updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+  if (LandingPadReplacement) {
+    auto *NewLP = OriginalPad->clone();
+    auto *Terminator = BranchInst::Create(Succ, NewBB);
+    NewLP->insertBefore(Terminator);
+    LandingPadReplacement->addIncoming(NewLP, NewBB);
+    return NewBB;
+  }
+  Value *ParentPad = nullptr;
+  if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+    ParentPad = FuncletPad->getParentPad();
+  else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+    ParentPad = CatchSwitch->getParentPad();
+  else
+    llvm_unreachable("handling for other EHPads not implemented yet");
+
+  auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+  CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+  return NewBB;
+}
+
 static void rewritePHIs(BasicBlock &BB) {
   // For every incoming edge we will create a block holding all
   // incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
   // loop:
   //    %n.val = phi i32[%n, %entry], [%inc, %loop]
   //
-  // It will create: 
+  // It will create:
   //
   // loop.from.entry:
   //  %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
   // TODO: Simplify PHINodes in the basic block to remove duplicate
   // predecessors.
 
+  LandingPadInst *LandingPad = nullptr;
+  PHINode *ReplPHI = nullptr;
+  if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+    // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+    // We replace the original landing pad with a PHINode that will collect the
+    // results from all of them.
+    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+    ReplPHI->takeName(LandingPad);
+    LandingPad->replaceAllUsesWith(ReplPHI);
+    // We will erase the original landing pad at the end of this function after
+    // ehAwareSplitEdge cloned it in the transition blocks.
+  }
+
   SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
   for (BasicBlock *Pred : Preds) {
-    auto *IncomingBB = SplitEdge(Pred, &BB);
+    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
     auto *PN = cast<PHINode>(&BB.front());
     do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
       InputV->addIncoming(V, Pred);
       PN->setIncomingValue(Index, InputV);
       PN = dyn_cast<PHINode>(PN->getNextNode());
-    } while (PN);
+    } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+                             // the landing pad.
+  }
+
+  if (LandingPad) {
+    // Calls to ehAwareSplitEdge cloned the original landing pad.
+    // No longer need it.
+    LandingPad->eraseFromParent();
   }
 }
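The subtle step in rewritePHIs above is the landing-pad case: every split edge receives its own clone of the landingpad, and the original pad's uses are routed through a merge PHI. Distilled into a standalone helper, this is roughly the pattern (a sketch only; the helper name is ours, not the patch's):

```cpp
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: replace a landingpad with a PHI that will merge the cloned
// landingpads created in each split predecessor edge block.
static PHINode *replaceLandingPadWithPHI(LandingPadInst *LP) {
  // Create the PHI right before the landingpad; incoming values are added
  // later, one per edge block, by the edge-splitting code.
  PHINode *ReplPHI = PHINode::Create(LP->getType(), /*NumReservedValues=*/1,
                                     "", LP);
  ReplPHI->takeName(LP);
  LP->replaceAllUsesWith(ReplPHI);
  // The caller erases LP once every edge block holds a clone of it.
  return ReplPHI;
}
```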
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       return true;
 
     // Lookup the linkage recorded in the summaries during global analysis.
-    const auto &GS = DefinedGlobals.find(GV.getGUID());
-    GlobalValue::LinkageTypes Linkage;
+    auto GS = DefinedGlobals.find(GV.getGUID());
     if (GS == DefinedGlobals.end()) {
       // Must have been promoted (possibly conservatively). Find original
       // name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       std::string OrigId = GlobalValue::getGlobalIdentifier(
           OrigName, GlobalValue::InternalLinkage,
           TheModule.getSourceFileName());
-      const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+      GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
       if (GS == DefinedGlobals.end()) {
         // Also check the original non-promoted non-globalized name. In some
         // cases a preempted weak value is linked in as a local copy because
@@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
         // In that case, since it was originally not a local value, it was
         // recorded in the index using the original name.
         // FIXME: This may not be needed once PR27866 is fixed.
-        const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+        GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
         assert(GS != DefinedGlobals.end());
-        Linkage = GS->second->linkage();
-      } else {
-        Linkage = GS->second->linkage();
       }
-    } else
-      Linkage = GS->second->linkage();
-    return !GlobalValue::isLocalLinkage(Linkage);
+    }
+    return !GlobalValue::isLocalLinkage(GS->second->linkage());
   };
 
   // FIXME: See if we can just internalize directly here via linkage changes
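The pattern this hunk removes is worth spelling out: each nested `const auto &GS = ...find(...)` declared a new iterator that shadowed the outer one, so the lookup result had to be copied out through a separate `Linkage` variable on every path. Reusing one mutable iterator collapses all of that. A standalone illustration of the shadowing pitfall (hypothetical map and keys, plain standard C++):

```cpp
#include <cassert>
#include <map>
#include <string>

int lookupWithFallback(const std::map<std::string, int> &M,
                       const std::string &Key,
                       const std::string &FallbackKey) {
  auto It = M.find(Key);
  if (It == M.end()) {
    // Reuse the same iterator for the fallback probe. Writing
    // 'auto It = M.find(FallbackKey);' here would declare a *new* iterator
    // shadowing the outer one, and the outer 'It' would still be end().
    It = M.find(FallbackKey);
    assert(It != M.end() && "fallback key must exist");
  }
  return It->second;
}
```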
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
     std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
 
   InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
 
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       // Setup the data structure used to plumb customization into the
       // `InlineFunction` routine.
       InlineFunctionInfo IFI(
-          /*cg=*/nullptr, &GetAssumptionCache,
+          /*cg=*/nullptr, &GetAssumptionCache, PSI,
          &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
          &FAM.getResult<BlockFrequencyAnalysis>(Callee));
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
                            cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+                                      cl::init(false), cl::ZeroOrMore,
+                                      cl::ReallyHidden,
+                                      cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Max number of partial inlining. The default is unlimited"));
 
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<int>
+    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+                             cl::Hidden, cl::ZeroOrMore,
+                             cl::desc("Relative frequency of outline region to "
+                                      "the entry block"));
+
 namespace {
 
 struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
 
-  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutliningBB relative to F's entry point.
+  // The result is no larger than 1 and is represented using BP.
+  // (Note that the outlined region's 'head' block can only have incoming
+  // edges from the guarding entry blocks).
+  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+                                                   FunctionOutliningInfo *OI,
+                                                   Function *DuplicateFunction,
+                                                   BlockFrequencyInfo *BFI,
+                                                   BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined with
+  // profit.
+  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+                           BlockFrequencyInfo *CalleeBFI,
+                           BasicBlock *OutliningCallBB,
+                           int OutliningCallOverhead,
+                           OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to the
+  // OutlinedFunction) into its callers. Return true
+  // if there is any successful inlining.
+  bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /*original function*/
+                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
+                        BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from use sites of DuplicateFunction to the enclosing
+  // BB's profile count.
+  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+                                     DenseMap<User *, uint64_t> &SiteCountMap);
+
   bool IsLimitReached() {
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+
+  CallSite getCallSite(User *U) {
+    CallSite CS;
+    if (CallInst *CI = dyn_cast<CallInst>(U))
+      CS = CallSite(CI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+      CS = CallSite(II);
+    else
+      llvm_unreachable("All uses must be calls");
+    return CS;
+  }
+
+  CallSite getOneCallSiteTo(Function *F) {
+    User *User = *F->user_begin();
+    return getCallSite(User);
+  }
+
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite CS = getOneCallSiteTo(F);
+    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+    return std::make_tuple(DLoc, Block);
+  }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+  //   setup cost in the outlined function itself;
+  // - The second value is the estimated size of the new call sequence in
+  //   basic block 'OutliningCallBB';
+  // - The third value is the estimated size of the original code from
+  //   function 'F' that is extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+                        Function *OutlinedFunction,
+                        BasicBlock *OutliningCallBB);
+  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and runtime cost (Note that in the current
+  // inline cost analysis, there is no clear distinction there either).
+  int computeBBInlineCost(BasicBlock *BB);
+
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
     return isa<ReturnInst>(TI);
   };
 
-  auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsReturnBlock(Succ1))
       return std::make_tuple(Succ1, Succ2);
     if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   };
 
   // Detect a triangular shape:
-  auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsSuccessor(Succ1, Succ2))
       return std::make_tuple(Succ1, Succ2);
     if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   // Do sanity check of the entries: threre should not
   // be any successors (not in the entry set) other than
   // {ReturnBlock, NonReturnBlock}
-  assert(OutliningInfo->Entries[0] == &F->front());
+  assert(OutliningInfo->Entries[0] == &F->front() &&
+         "Function Entry must be the first in Entries vector");
   DenseSet<BasicBlock *> Entries;
   for (BasicBlock *E : OutliningInfo->Entries)
     Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   return OutliningInfo;
 }
 
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
-                                             OptimizationRemarkEmitter &ORE) {
-  // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+  if (F->getEntryCount())
+    return true;
+  // Now check if any of the entry blocks has MD_prof data:
+  for (auto *E : OI->Entries) {
+    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+    if (!BR || BR->isUnconditional())
+      continue;
+    uint64_t T, F;
+    if (BR->extractProfMetadata(T, F))
+      return true;
+  }
+  return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+  auto EntryFreq =
+      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+  auto OutlineRegionRelFreq =
+      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());
+
+  if (hasProfileData(F, OI))
+    return OutlineRegionRelFreq;
+
+  // When profile data is not available, we need to be very
+  // conservative in estimating the overall savings. We need to make sure
+  // the outline region relative frequency is not below the threshold
+  // specified by the option.
+  OutlineRegionRelFreq = std::max(
+      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+  return OutlineRegionRelFreq;
+}
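To make the clamp in getOutliningCallBBRelativeFreq concrete, here is a rough sketch with made-up frequencies, using the real llvm::BranchProbability API (all values hypothetical):

```cpp
#include "llvm/Support/BranchProbability.h"
#include <algorithm>
using namespace llvm;

int main() {
  // Hypothetical BFI output: the entry block runs 1000 times, the block
  // holding the outlined call only 100 times, so the measured relative
  // frequency is 10%.
  BranchProbability RelFreq =
      BranchProbability::getBranchProbability(/*Numerator=*/100,
                                              /*Denominator=*/1000);
  // Without PGO or branch metadata, clamp up to outline-region-freq-percent
  // (default 75%) so the estimated overhead of the outlined call stays
  // conservatively high.
  RelFreq = std::max(RelFreq, BranchProbability(75, 100));
  // RelFreq is now 75/100; with real profile data the 10% would be kept.
}
```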
+bool PartialInlinerImpl::shouldPartialInline(
+    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
 
   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", Callee)
+             << NV("Callee", F)
              << " should always be fully inlined, not partially");
     return false;
   }
 
   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
     return false;
   }
 
   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
     return false;
   }
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+  BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+  auto RelativeFreq =
+      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+  auto NormWeightedRcost =
+      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+  // Weighted saving is smaller than weighted cost, return false
+  if (NormWeightedSavings < NormWeightedRcost) {
+    ORE.emit(
+        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+        << NV("Callee", F) << " not partially inlined into "
+        << NV("Caller", Caller) << " runtime overhead (overhead="
+        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << ", savings="
+        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+        << " of making the outlined call is too high");
+
+    return false;
+  }
 
   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", Callee) << " can be partially inlined into "
+           << NV("Callee", F) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
   return true;
 }
 
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+  int InlineCost = 0;
+  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(CI), DL);
+      continue;
+    }
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(II), DL);
+      continue;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+      continue;
+    }
+    InlineCost += InlineConstants::InstrCost;
+  }
+  return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+    BasicBlock *OutliningCallBB) {
+  // First compute the cost of the outlined region 'OI' in the original
+  // function 'F':
+  int OutlinedRegionCost = 0;
+  for (BasicBlock &BB : *F) {
+    if (&BB != OI->ReturnBlock &&
+        // Assuming Entry set is small -- do a linear search here:
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+            OI->Entries.end()) {
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
+  }
+
+  // Now compute the cost of the call sequence to the outlined function
+  // 'OutlinedFunction' in BB 'OutliningCallBB':
+  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+  // Now compute the cost of the extracted/outlined function itself:
+  int OutlinedFunctionCost = 0;
+  for (BasicBlock &BB : *OutlinedFunction) {
+    OutlinedFunctionCost += computeBBInlineCost(&BB);
+  }
+
+  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+         "Outlined function cost should be no less than the outlined region");
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+                         OutlinedRegionCost);
+}
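A back-of-the-envelope run of computeBBInlineCost above: with InlineConstants::InstrCost at its usual value of 5, a block containing two ordinary instructions, one call, and a three-case switch is costed as below (the callsite cost is hypothetical; the per-instruction constant mirrors LLVM's):

```cpp
#include <cassert>

int main() {
  const int InstrCost = 5;  // mirrors llvm::InlineConstants::InstrCost
  const int CallCost = 30;  // hypothetical getCallsiteCost() result

  int Cost = 0;
  Cost += 2 * InstrCost;       // two ordinary instructions
  Cost += CallCost;            // one call instruction
  Cost += (3 + 1) * InstrCost; // switch: NumCases + 1 successors

  assert(Cost == 10 + 30 + 20);
  return 0;
}
```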
+// Create the callsite to profile count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+    Function *DuplicateFunction,
+    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+  std::vector<User *> Users(DuplicateFunction->user_begin(),
+                            DuplicateFunction->user_end());
+  Function *CurrentCaller = nullptr;
+  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+  auto ComputeCurrBFI = [&,this](Function *Caller) {
+    // For the old pass manager:
+    if (!GetBFI) {
+      if (CurrentCallerBFI)
+        delete CurrentCallerBFI;
+      DominatorTree DT(*Caller);
+      LoopInfo LI(DT);
+      BranchProbabilityInfo BPI(*Caller, LI);
+      CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+    } else {
+      // New pass manager:
+      CurrentCallerBFI = &(*GetBFI)(*Caller);
+    }
+  };
+
+  for (User *User : Users) {
+    CallSite CS = getCallSite(User);
+    Function *Caller = CS.getCaller();
+    if (CurrentCaller != Caller) {
+      CurrentCaller = Caller;
+      ComputeCurrBFI(Caller);
+    } else {
+      assert(CurrentCallerBFI && "CallerBFI is not set");
+    }
+    BasicBlock *CallBB = CS.getInstruction()->getParent();
+    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+    if (Count)
+      CallSiteToProfCountMap[User] = *Count;
+    else
+      CallSiteToProfCountMap[User] = 0;
+  }
+  if (!GetBFI) {
+    if (CurrentCallerBFI)
+      delete CurrentCallerBFI;
+  }
+}
+
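Under the legacy pass manager there is no cached BFI for an arbitrary caller, so the lambda above builds the whole analysis chain by hand; the dependency order is fixed. A minimal sketch of that chain, stack-allocated for simplicity (the helper name is ours):

```cpp
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Sketch: compute the profile count of one block in F from scratch.
uint64_t blockCountOf(Function &F, BasicBlock &BB) {
  DominatorTree DT(F);                // CFG dominators
  LoopInfo LI(DT);                    // loops need dominators
  BranchProbabilityInfo BPI(F, LI);   // edge probabilities need loops
  BlockFrequencyInfo BFI(F, BPI, LI); // frequencies need probabilities
  auto Count = BFI.getBlockProfileCount(&BB);
  return Count ? *Count : 0;          // no entry count -> no block counts
}
```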
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
 
   if (F->hasAddressTaken())
     return nullptr;
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   if (PSI->isFunctionEntryCold(F))
     return nullptr;
 
-  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
-      computeOutliningInfo(F);
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
 
-  if (!OutliningInfo)
+  if (!OI)
     return nullptr;
 
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
   Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   DenseSet<BasicBlock *> NewEntries;
-  for (BasicBlock *BB : OutliningInfo->Entries) {
+  for (BasicBlock *BB : OI->Entries) {
     NewEntries.insert(cast<BasicBlock>(VMap[BB]));
   }
@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BasicBlock *PreReturn = NewReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
 
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
       Ins = NewReturnBlock->getFirstNonPHI();
 
       RetPhi->addIncoming(&*I, PreReturn);
-      for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+      for (BasicBlock *E : OI->ReturnBlockPreds) {
         BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
       ++I;
     }
-  for (auto E : OutliningInfo->ReturnBlockPreds) {
+  for (auto E : OI->ReturnBlockPreds) {
     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
     NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
   }
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
 
   // Returns true if the block is to be partial inlined into the caller
   // (i.e. not to be extracted to the out of line function)
-  auto ToBeInlined = [=](BasicBlock *BB) {
+  auto ToBeInlined = [&](BasicBlock *BB) {
     return BB == NewReturnBlock || NewEntries.count(BB);
   };
   // Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
 
   // Extract the body of the if.
-  Function *ExtractedFunction =
+  Function *OutlinedFunction =
       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
           .extractCodeRegion();
 
-  // Inline the top-level if test into all callers.
+  bool AnyInline =
+      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+  // Ditch the duplicate, since we're done with it, and rewrite all remaining
+  // users (function pointers, etc.) back to the original function.
+  DuplicateFunction->replaceAllUsesWith(F);
+  DuplicateFunction->eraseFromParent();
+
+  if (AnyInline)
+    return OutlinedFunction;
+
+  // Remove the function that is speculatively created:
+  if (OutlinedFunction)
+    OutlinedFunction->eraseFromParent();
+
+  return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+                                          Function *F,
+                                          FunctionOutliningInfo *OI,
+                                          Function *OutlinedFunction,
+                                          BlockFrequencyInfo *CalleeBFI) {
+  if (OutlinedFunction == nullptr)
+    return false;
+
+  int NonWeightedRcost;
+  int SizeCost;
+  int OutlinedRegionSizeCost;
+
+  auto OutliningCallBB =
+      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the original
+  // outlined region, it does not increase the chances of inlining
+  // 'F' with outlining (the inliner uses the size increase to model
+  // the cost of inlining a callee).
+  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(F);
+    DebugLoc DLoc;
+    BasicBlock *Block;
+    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << ore::NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ", Size of call sequence to outlined function = "
+             << ore::NV("NewSize", SizeCost) << ")");
+    return false;
+  }
+
+  assert(F->user_begin() == F->user_end() &&
+         "F's users should all be replaced!");
   std::vector<User *> Users(DuplicateFunction->user_begin(),
                             DuplicateFunction->user_end());
+  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+  if (F->getEntryCount())
+    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+  auto CalleeEntryCount = F->getEntryCount();
+  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
+  bool AnyInline = false;
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);
 
     if (IsLimitReached())
       continue;
 
     OptimizationRemarkEmitter ORE(CS.getCaller());
-    if (!shouldPartialInline(CS, ORE))
+
+    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+                             NonWeightedRcost, ORE))
       continue;
 
-    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
-    BasicBlock *Block = CS.getParent();
-    ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
-             << ore::NV("Callee", F) << " partially inlined into "
-             << ore::NV("Caller", CS.getCaller()));
+    ORE.emit(
+        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Caller", CS.getCaller()));
 
-    InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
     InlineFunction(CS, IFI);
+
+    // Now update the entry count:
+    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+    }
+
+    AnyInline = true;
     NumPartialInlining++;
-    // update stats
+    // Update the stats
     NumPartialInlined++;
   }
 
-  // Ditch the duplicate, since we're done with it, and rewrite all remaining
-  // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
-
+  if (AnyInline && CalleeEntryCount)
+    F->setEntryCount(CalleeEntryCountV);
 
-  return ExtractedFunction;
+  return AnyInline;
 }
 
 bool PartialInlinerImpl::run(Module &M) {
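The entry-count bookkeeping above is a simple saturating subtraction: every callsite that receives an inlined copy of the body takes its own profile count out of the callee's entry count. A tiny standalone model (all counts hypothetical):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t CalleeEntryCount = 1000;         // callee entered 1000 times
  const uint64_t SiteCounts[] = {600, 500}; // per-callsite block counts

  for (uint64_t SiteCount : SiteCounts)
    // Saturate at zero: profile counts are approximate and may oversubtract.
    CalleeEntryCount -= std::min(CalleeEntryCount, SiteCount);

  assert(CalleeEntryCount == 0); // 1000 - 600 -> 400, then 400 - 500 -> 0
  return 0;
}
```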
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
     else
       GO = new GlobalVariable(
           *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
-          (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+          nullptr, "", nullptr,
           GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
     GO->takeName(GA);
     GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(
 
   // FIXME: Try to re-use BSI and PFI from the original module here.
-  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+  ProfileSummaryInfo PSI(M);
+  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
 
   SmallVector<char, 0> Buffer;
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
   return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
 }
 
-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown,
-                              const KnownBits &RHSKnown) {
-  // Addition of two 2's complement numbers having opposite signs will never
-  // overflow.
-  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
-      (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
-    return true;
-
-  // If either of the values is known to be non-negative, adding them can only
-  // overflow if the second is also non-negative, so we can assume that.
-  // Two non-negative numbers will only overflow if there is a carry to the
-  // sign bit, so we can check if even when the values are as big as possible
-  // there is no overflow to the sign bit.
-  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
-    APInt MaxLHS = ~LHSKnown.Zero;
-    MaxLHS.clearSignBit();
-    APInt MaxRHS = ~RHSKnown.Zero;
-    MaxRHS.clearSignBit();
-    APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
-    return Result.isSignBitClear();
-  }
-
-  // If either of the values is known to be negative, adding them can only
-  // overflow if the second is also negative, so we can assume that.
-  // Two negative number will only overflow if there is no carry to the sign
-  // bit, so we can check if even when the values are as small as possible
-  // there is overflow to the sign bit.
-  if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
-    APInt MinLHS = LHSKnown.One;
-    MinLHS.clearSignBit();
-    APInt MinRHS = RHSKnown.One;
-    MinRHS.clearSignBit();
-    APInt Result = std::move(MinLHS) + std::move(MinRHS);
-    return Result.isSignBitSet();
-  }
-
-  // If we reached here it means that we know nothing about the sign bits.
-  // In this case we can't know if there will be an overflow, since by
-  // changing the sign bits any two values can be made to overflow.
-  return false;
-}
-
-/// Return true if we can prove that:
-///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
-                                            Instruction &CxtI) {
-  // There are different heuristics we can use for this. Here are some simple
-  // ones.
-
-  // If LHS and RHS each have at least two sign bits, the addition will look
-  // like
-  //
-  //   XX.....
-  // + YY.....
-  //
-  // If the carry into the most significant position is 0, X and Y can't both
-  // be 1 and therefore the carry out of the addition is also 0.
-  //
-  // If the carry into the most significant position is 1, X and Y can't both
-  // be 0 and therefore the carry out of the addition is also 1.
-  //
-  // Since the carry into the most significant position is always equal to
-  // the carry out of the addition, there is no signed overflow.
-  if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
-      ComputeNumSignBits(RHS, 0, &CxtI) > 1)
-    return true;
-
-  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
-  KnownBits LHSKnown(BitWidth);
-  computeKnownBits(LHS, LHSKnown, 0, &CxtI);
-
-  KnownBits RHSKnown(BitWidth);
-  computeKnownBits(RHS, RHSKnown, 0, &CxtI);
-
-  // Check if carry bit of addition will not cause overflow.
-  if (checkRippleForAdd(LHSKnown, RHSKnown))
-    return true;
-
-  return false;
-}
-
 /// \brief Return true if we can prove that:
 ///    (sub LHS, RHS)  === (sub nsw LHS, RHS)
 /// This basically requires proving that the add in the original type would not
@@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
 bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
                                               Instruction &CxtI) {
   // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
-  bool LHSKnownNonNegative, LHSKnownNegative;
-  bool RHSKnownNonNegative, RHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
-                 &CxtI);
-  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
-                 &CxtI);
-  if (LHSKnownNegative && RHSKnownNonNegative)
+  KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+  KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+  if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
     return true;
 
   return false;
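The replacement logic in WillNotOverflowUnsignedSub is compact enough to check by hand: a known-negative LHS has its sign bit set, so LHS >= 2^(n-1); a known-non-negative RHS has it clear, so RHS < 2^(n-1); hence LHS - RHS can never wrap below zero. An exhaustive 8-bit demonstration in plain C++:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Any "negative" 8-bit value (top bit set) is at least 128; any
  // "non-negative" one (top bit clear) is at most 127, so the unsigned
  // subtraction can never wrap.
  for (unsigned L = 0x80; L <= 0xFF; ++L)
    for (unsigned R = 0x00; R <= 0x7F; ++R)
      assert(static_cast<uint8_t>(L - R) == L - R); // no wrap occurred
  return 0;
}
```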
@@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
   return nullptr;
 }
 
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+                                        InstCombiner::BuilderTy &Builder) {
+  Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+  const APInt *C;
+  if (!match(Op1, m_APInt(C)))
+    return nullptr;
+
+  if (C->isSignMask()) {
+    // If wrapping is not allowed, then the addition must set the sign bit:
+    // X + (signmask) --> X | signmask
+    if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+      return BinaryOperator::CreateOr(Op0, Op1);
+
+    // If wrapping is allowed, then the addition flips the sign bit of LHS:
+    // X + (signmask) --> X ^ signmask
+    return BinaryOperator::CreateXor(Op0, Op1);
+  }
+
+  Value *X;
+  const APInt *C2;
+  Type *Ty = Add.getType();
+
+  // Is this add the last step in a convoluted sext?
+  // add(zext(xor i16 X, -32768), -32768) --> sext X
+  if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+      C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+    return CastInst::Create(Instruction::SExt, X, Ty);
+
+  // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+  // FIXME: This should check hasOneUse to not increase the instruction count?
+  if (C->isNegative() &&
+      match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+      C->sge(-C2->sext(C->getBitWidth()))) {
+    Constant *NewC =
+        ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+    return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+  }
+
+  // Shifts and add used to flip and mask off the low bit:
+  // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+  const APInt *C3;
+  if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)),
+                                            m_APInt(C3)))) &&
+      C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+    Value *NotX = Builder.CreateNot(X);
+    return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+  }
+
+  return nullptr;
+}
+
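The sign-mask rewrite at the top of foldAddWithConstant is a pure bit identity: adding the sign mask with wrapping allowed can only flip the sign bit (carry-out is discarded), which is exactly xor; and with nuw/nsw the sign bit of X must have been clear, so it behaves as or. A quick exhaustive check at 8 bits:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t SignMask = 0x80;
  for (unsigned V = 0; V <= 0xFF; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    // X + signmask --> X ^ signmask (wrapping add just flips the sign bit).
    assert(static_cast<uint8_t>(X + SignMask) == (X ^ SignMask));
    // With nuw/nsw the sign bit of X must be clear; then or == xor == add.
    if ((X & SignMask) == 0)
      assert(static_cast<uint8_t>(X + SignMask) == (X | SignMask));
  }
  return 0;
}
```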
 Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   if (Value *V = SimplifyUsingDistributiveLaws(I))
     return replaceInstUsesWith(I, V);
 
-  const APInt *RHSC;
-  if (match(RHS, m_APInt(RHSC))) {
-    if (RHSC->isSignMask()) {
-      // If wrapping is not allowed, then the addition must set the sign bit:
-      // X + (signmask) --> X | signmask
-      if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
-        return BinaryOperator::CreateOr(LHS, RHS);
-
-      // If wrapping is allowed, then the addition flips the sign bit of LHS:
-      // X + (signmask) --> X ^ signmask
-      return BinaryOperator::CreateXor(LHS, RHS);
-    }
-
-    // Is this add the last step in a convoluted sext?
-    Value *X;
-    const APInt *C;
-    if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
-        C->isMinSignedValue() &&
-        C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
-      // add(zext(xor i16 X, -32768), -32768) --> sext X
-      return CastInst::Create(Instruction::SExt, X, LHS->getType());
-    }
-
-    if (RHSC->isNegative() &&
-        match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
-        RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
-      // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
-      Constant *NewC =
-          ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
-      return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
-    }
-  }
+  if (Instruction *X = foldAddWithConstant(I, *Builder))
+    return X;
 
-  // FIXME: Use the match above instead of dyn_cast to allow these transforms
-  // for splat vectors.
+  // FIXME: This should be moved into the above helper function to allow these
+  // transforms for splat vectors.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
     // zext(bool) + C -> bool ? C + 1 : C
     if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
@@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
       Constant *CI =
           ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
       if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
-          computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
-              OverflowResult::NeverOverflows) {
+          willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
         // Insert the new, smaller add.
         Value *NewAdd =
             Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
       if (LHSConv->getOperand(0)->getType() ==
               RHSConv->getOperand(0)->getType() &&
          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
-          computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
-                                        RHSConv->getOperand(0),
-                                        &I) == OverflowResult::NeverOverflows) {
+          willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+                                     RHSConv->getOperand(0), I)) {
         // Insert the new integer add.
         Value *NewAdd = Builder->CreateNUWAdd(
             LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
@@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   }
 
   // TODO(jingyue): Consider WillNotOverflowSignedAdd and
-  // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+  // willNotOverflowUnsignedAdd to reduce the number of invocations of
   // computeKnownBits.
   if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
     Changed = true;
     I.setHasNoSignedWrap(true);
   }
-  if (!I.hasNoUnsignedWrap() &&
-      computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
-          OverflowResult::NeverOverflows) {
+  if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
     Changed = true;
     I.setHasNoUnsignedWrap(true);
   }
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b114801cc1c0..82dc88f1b3ad 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
-static inline Value *dyn_castNotVal(Value *V) {
-  // If this is not(not(x)) don't return that this is a not: we want the two
-  // not's to be folded first.
-  if (BinaryOperator::isNot(V)) {
-    Value *Operand = BinaryOperator::getNotArgument(V);
-    if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
-      return Operand;
-  }
-
-  // Constants can be considered to be not'ed values...
-  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
-    return ConstantInt::get(C->getType(), ~C->getValue());
-  return nullptr;
-}
-
 /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
 /// a four bit mask.
 static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
   }
 
   // This simplification is only valid if the upper range is not negative.
-  bool IsNegative, IsNotNegative;
-  ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
-  if (!IsNotNegative)
+  KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+  if (!Known.isNonNegative())
     return nullptr;
 
   if (Inverted)
@@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
 ///   (~A & ~B) == (~(A | B))
 ///   (~A | ~B) == (~(A & B))
 static Instruction *matchDeMorgansLaws(BinaryOperator &I,
-                                       InstCombiner::BuilderTy *Builder) {
+                                       InstCombiner::BuilderTy &Builder) {
   auto Opcode = I.getOpcode();
   assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
          "Trying to match De Morgan's Laws with something other than and/or");
+
   // Flip the logic operation.
-  if (Opcode == Instruction::And)
-    Opcode = Instruction::Or;
-  else
-    Opcode = Instruction::And;
+  Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
 
-  Value *Op0 = I.getOperand(0);
-  Value *Op1 = I.getOperand(1);
-  // TODO: Use pattern matchers instead of dyn_cast.
-  if (Value *Op0NotVal = dyn_castNotVal(Op0))
-    if (Value *Op1NotVal = dyn_castNotVal(Op1))
-      if (Op0->hasOneUse() && Op1->hasOneUse()) {
-        Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
-                                              I.getName() + ".demorgan");
-        return BinaryOperator::CreateNot(LogicOp);
-      }
+  Value *A, *B;
+  if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+      match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+      !IsFreeToInvert(A, A->hasOneUse()) &&
+      !IsFreeToInvert(B, B->hasOneUse())) {
+    Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+    return BinaryOperator::CreateNot(AndOr);
+  }
 
   return nullptr;
 }
@@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
       return FoldedLogic;
 
-  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
     return DeMorgan;
 
   {
@@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (Value *V = SimplifyBSwap(I))
     return replaceInstUsesWith(I, V);
 
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
-    ConstantInt *C1 = nullptr; Value *X = nullptr;
-    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
-    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
-        Op0->hasOneUse()) {
-      Value *Or = Builder->CreateOr(X, RHS);
-      Or->takeName(Op0);
-      return BinaryOperator::CreateXor(Or,
-          Builder->getInt(C1->getValue() & ~RHS->getValue()));
-    }
-  }
-
   if (isa<Constant>(Op1))
     if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
       return FoldedLogic;
@@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
     return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
 
-  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
     return DeMorgan;
 
   // Canonicalize xor to the RHS.
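The rewritten matcher expresses De Morgan's laws directly: both operands must be one-use nots whose operands are not already free to invert (otherwise other folds should fire first). The identities themselves are easy to spot-check exhaustively at byte width:

```cpp
#include <cassert>

int main() {
  for (unsigned A = 0; A <= 0xFF; ++A)
    for (unsigned B = 0; B <= 0xFF; ++B) {
      unsigned char a = A, b = B;
      // ~A & ~B == ~(A | B) and ~A | ~B == ~(A & B): De Morgan's laws.
      assert(static_cast<unsigned char>(~a & ~b) ==
             static_cast<unsigned char>(~(a | b)));
      assert(static_cast<unsigned char>(~a | ~b) ==
             static_cast<unsigned char>(~(a & b)));
    }
  return 0;
}
```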
@@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
   }
 
   // Is this a 'not' (~) fed by a binary operator?
-  BinaryOperator *NotOp;
-  if (match(&I, m_Not(m_BinOp(NotOp)))) {
-    if (NotOp->getOpcode() == Instruction::And ||
-        NotOp->getOpcode() == Instruction::Or) {
+  BinaryOperator *NotVal;
+  if (match(&I, m_Not(m_BinOp(NotVal)))) {
+    if (NotVal->getOpcode() == Instruction::And ||
+        NotVal->getOpcode() == Instruction::Or) {
       // Apply DeMorgan's Law when inverts are free:
       //  ~(X & Y) --> (~X | ~Y)
       //  ~(X | Y) --> (~X & ~Y)
-      if (IsFreeToInvert(NotOp->getOperand(0),
-                         NotOp->getOperand(0)->hasOneUse()) &&
-          IsFreeToInvert(NotOp->getOperand(1),
-                         NotOp->getOperand(1)->hasOneUse())) {
-        Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs");
-        Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs");
-        if (NotOp->getOpcode() == Instruction::And)
+      if (IsFreeToInvert(NotVal->getOperand(0),
+                         NotVal->getOperand(0)->hasOneUse()) &&
+          IsFreeToInvert(NotVal->getOperand(1),
+                         NotVal->getOperand(1)->hasOneUse())) {
+        Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs");
+        Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs");
+        if (NotVal->getOpcode() == Instruction::And)
           return BinaryOperator::CreateOr(NotX, NotY);
         return BinaryOperator::CreateAnd(NotX, NotY);
       }
-    } else if (NotOp->getOpcode() == Instruction::AShr) {
-      // ~(~X >>s Y) --> (X >>s Y)
-      if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0)))
-        return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1));
+    }
+
+    // ~(~X >>s Y) --> (X >>s Y)
+    if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+      return BinaryOperator::CreateAShr(X, Y);
+
+    // If we are inverting a right-shifted constant, we may be able to eliminate
+    // the 'not' by inverting the constant and using the opposite shift type.
+    // Canonicalization rules ensure that only a negative constant uses 'ashr',
+    // but we must check that in case that transform has not fired yet.
+    const APInt *C;
+    if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+      // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+      Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+      return BinaryOperator::CreateLShr(NotC, Y);
+    }
+
+    if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+      // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+      Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+      return BinaryOperator::CreateAShr(NotC, Y);
+    }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6989d67f0060..face7abcc95f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
 
   // Create a mask for bits above (ctlz) or below (cttz) the first known one.
   bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
-  unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros()
-                                : Known.One.countLeadingZeros();
-  unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes()
-                                : Known.Zero.countLeadingOnes();
+  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+                                : Known.countMaxLeadingZeros();
+  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+                                : Known.countMinLeadingZeros();
 
   // If all bits above (ctlz) or below (cttz) the first known one are known
   // zero, this value is constant.
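The new KnownBits accessors are thin renames of the raw APInt gymnastics they replace, which is what makes the foldCttzCtlz hunk readable: the minimum count comes from bits proven zero, the maximum from where the first proven one sits. A sketch of the correspondence against the real llvm::KnownBits API (the example bit patterns are ours):

```cpp
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

int main() {
  KnownBits Known(8);
  Known.Zero = APInt(8, 0b00000011); // low two bits proven zero
  Known.One  = APInt(8, 0b00010000); // bit 4 proven one

  // At least 2 trailing zeros (proven), at most 4 (bit 4 is a known one).
  assert(Known.countMinTrailingZeros() == Known.Zero.countTrailingOnes());
  assert(Known.countMaxTrailingZeros() == Known.One.countTrailingZeros());
  assert(Known.countMinTrailingZeros() == 2);
  assert(Known.countMaxTrailingZeros() == 4);
  return 0;
}
```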
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 312d9baae43a..001a4bcf16f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
     return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
   }
 
+  // FIXME: Maybe combine the next two transforms to handle the no cast case
+  // more efficiently. Support vector types. Cleanup code by using m_OneUse.
+
   // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
   Value *A = nullptr; ConstantInt *Cst = nullptr;
   if (Src->hasOneUse() &&
@@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // the sign bit of the original value; performing ashr instead of lshr
   // generates bits of the same value as the sign bit.
   if (Src->hasOneUse() &&
-      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
-      cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+    Value *SExt = cast<Instruction>(Src)->getOperand(0);
+    const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
     const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+    unsigned ShiftAmt = Cst->getZExtValue();
     // This optimization can be only performed when zero bits generated by
     // the original lshr aren't pulled into the value after truncation, so we
-    // can only shift by values smaller than the size of destination type (in
-    // bits).
-    if (Cst->getValue().ult(ASize)) {
-      Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
+    // can only shift by values no larger than the number of extension bits.
+    // FIXME: Instead of bailing when the shift is too large, use and to clear
+    // the extra bits.
+    if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
+      // If shifting by the size of the original value in bits or more, it is
+      // being filled with the sign bit, so shift by ASize-1 to avoid ub.
+      Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize - 1));
       Shift->takeName(Src);
       return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
     }
@@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
 
   // If we know that the value being extended is positive, we can use a zext
   // instead.
-  bool KnownZero, KnownOne;
-  ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
-  if (KnownZero) {
+  KnownBits Known = computeKnownBits(Src, 0, &CI);
+  if (Known.isNonNegative()) {
     Value *ZExt = Builder->CreateZExt(Src, DestTy);
     return replaceInstUsesWith(CI, ZExt);
   }
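The shift-clamp in the trunc(lshr(sext A)) hunk above can be verified directly: as long as the shift amount stays within the extension bits, the truncated result equals an arithmetic shift of the narrow value, clamped to ASize-1. A brute-force model in plain C++ (i8 extended to i32, relying on the usual arithmetic behavior of >> on signed values):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Model trunc.i8(lshr(sext.i32(A), ShiftAmt)) for every i8 value and every
  // shift amount allowed by the fold (ShiftAmt <= SExtSize - ASize = 24).
  for (int V = -128; V <= 127; ++V)
    for (unsigned ShiftAmt = 0; ShiftAmt <= 24; ++ShiftAmt) {
      int8_t A = static_cast<int8_t>(V);
      uint32_t Wide = static_cast<uint32_t>(static_cast<int32_t>(A)); // sext
      int8_t Truncated = static_cast<int8_t>(Wide >> ShiftAmt); // lshr + trunc
      // The fold: ashr on the narrow value, clamped to ASize-1 = 7 so the
      // shift amount stays defined for i8.
      int8_t Folded = static_cast<int8_t>(A >> std::min(ShiftAmt, 7u));
      assert(Truncated == Folded);
    }
  return 0;
}
```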
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 34ce235b3fe2..60ed4057cedd 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
 }
 
 /// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
 Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
@@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
   if (!BO0 && !BO1)
     return nullptr;
 
-  CmpInst::Predicate Pred = I.getPredicate();
+  const CmpInst::Predicate Pred = I.getPredicate();
   bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
   if (BO0 && isa<OverflowingBinaryOperator>(BO0))
     NoOp0WrapProblem =
@@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
     case Instruction::Sub:
     case Instruction::Xor:
       if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
-        return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
-                            BO1->getOperand(0));
+        return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
 
       // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
       if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
         if (CI->getValue().isSignMask()) {
-          ICmpInst::Predicate Pred =
+          ICmpInst::Predicate NewPred =
               I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
-          return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
         }
 
         if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
-          ICmpInst::Predicate Pred =
+          ICmpInst::Predicate NewPred =
               I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
-          Pred = I.getSwappedPredicate(Pred);
-          return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+          NewPred = I.getSwappedPredicate(NewPred);
+          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
         }
       }
       break;
@@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
                                          AP.getBitWidth() - AP.countTrailingZeros()));
           Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
           Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
-          return new ICmpInst(I.getPredicate(), And1, And2);
+          return new ICmpInst(Pred, And1, And2);
         }
       }
       break;
+
     case Instruction::UDiv:
     case Instruction::LShr:
-      if (I.isSigned())
+      if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
         break;
-      LLVM_FALLTHROUGH;
+      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+    case Instruction::SDiv:
+      if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+        break;
+      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
     case Instruction::AShr:
       if (!BO0->isExact() || !BO1->isExact())
         break;
-      return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
-                          BO1->getOperand(0));
+      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
     case Instruction::Shl: {
       bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
       bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
         break;
       if (!NSW && I.isSigned())
         break;
-      return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
-                          BO1->getOperand(0));
+      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
     }
     }
   }
@@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
     auto BitwiseAnd =
         m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
 
-    if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+    if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
       auto *Zero = Constant::getNullValue(BO0->getType());
       return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
    }
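Why exactness matters for the new UDiv/LShr case above: an exact division has no remainder, so X equals (X /u C) * C and quotients compare exactly like the original values; without the exact flag, distinct dividends can collapse to the same quotient. A small plain-C++ check with hypothetical values and C = 8:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 8;
  // Exact divisions (no remainder): comparing quotients is the same as
  // comparing the dividends, which is what the icmp fold relies on.
  uint32_t X = 64, Y = 128; // both exact multiples of 8
  assert((X / C < Y / C) == (X < Y));
  // Without exactness the fold would be wrong: different values can share
  // a quotient once the remainder is discarded.
  uint32_t A = 64, B = 71; // A / 8 == B / 8 == 8, yet A != B
  assert(A / C == B / C && A != B);
  return 0;
}
```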
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3be6419a129a..1424f61fe701 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
 #include "llvm/Transforms/Utils/Local.h"
 
@@ -388,10 +389,21 @@ private:
                              bool DoTransform = true);
   Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  };
+  bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  };
   bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
   bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+  bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+           OverflowResult::NeverOverflows;
+  };
   Value *EmitGEPOffset(User *GEP);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -492,7 +504,11 @@ public:
   void computeKnownBits(Value *V, KnownBits &Known, unsigned Depth,
                         Instruction *CxtI) const {
-    return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+    llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+  }
+  KnownBits computeKnownBits(Value *V, unsigned Depth,
+                             Instruction *CxtI) const {
+    return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
   }
 
   bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
@@ -503,11 +519,6 @@ public:
                           Instruction *CxtI = nullptr) const {
     return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
   }
-  void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
-                      unsigned Depth = 0, Instruction *CxtI = nullptr) const {
-    return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
-                                &DT);
-  }
   OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
                                                const Instruction *CxtI) {
     return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
@@ -516,6 +527,11 @@ public:
                                                const Instruction *CxtI) {
     return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
   }
+  OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+                                             const Value *RHS,
+                                             const Instruction *CxtI) const {
+    return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+  }
 
   /// Maximum size of array considered when transforming.
   uint64_t MaxArraySizeForCombine;
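The value-returning computeKnownBits overload added above is what lets the callers throughout this commit drop the old ComputeSignBit out-parameters: sign queries become isNegative()/isNonNegative() on the returned KnownBits. A small runnable sketch of that mapping (real llvm::KnownBits API, example bits ours):

```cpp
#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

int main() {
  // A value whose sign bit is proven zero is non-negative; the two old
  // ComputeSignBit out-parameters map onto these two predicates.
  KnownBits Known(8);
  Known.Zero.setSignBit(); // sign bit known clear
  assert(Known.isNonNegative() && !Known.isNegative());

  Known = KnownBits(8);
  Known.One.setSignBit();  // sign bit known set
  assert(Known.isNegative() && !Known.isNonNegative());
  return 0;
}
```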
auto IsAllNonNegative = [&]() { for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { - bool KnownNonNegative, KnownNegative; - IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative, - KnownNegative, 0, MemI); - if (KnownNonNegative) + KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI); + if (Known.isNonNegative()) continue; return false; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index face9d9237ae..2a35259f2103 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, // product is exactly the minimum negative number. // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000 // For simplicity we just check if at least one side is not negative. - bool LHSNonNegative, LHSNegative; - bool RHSNonNegative, RHSNegative; - ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI); - ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI); - if (LHSNonNegative || RHSNonNegative) + KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); + if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) return true; } return false; @@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { Constant *CI = ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType()); if (ConstantExpr::getZExt(CI, I.getType()) == Op1C && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) == - OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) { // Insert the new, smaller mul. Value *NewMul = Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv"); @@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Op0Conv->getOperand(0)->getType() == Op1Conv->getOperand(0)->getType() && (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), - Op1Conv->getOperand(0), - &I) == OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), + Op1Conv->getOperand(0), I)) { // Insert the new integer mul. 
Value *NewMul = Builder->CreateNUWMul( Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv"); @@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && - computeOverflowForUnsignedMul(Op0, Op1, &I) == - OverflowResult::NeverOverflows) { + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 05b01774cd5e..4028a92771a4 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) return I; - unsigned Leaders = Known2.Zero.countLeadingOnes(); + unsigned Leaders = Known2.countMinLeadingZeros(); Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; break; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 1792cb585f87..65b1148cb03b 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { + // TODO: Why are we only transforming these 3 predicates? if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); @@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } // Canonicalize icmp_ne -> icmp_eq ICmpInst::Predicate IPred; - if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE || IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE || IPred == ICmpInst::ICMP_SGE) { @@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } return nullptr; } @@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth(); KnownBits Known(BitWidth); computeKnownBits(Cond, Known, 0, &SI); - unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes(); - unsigned LeadingKnownOnes = Known.One.countLeadingOnes(); + unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); + unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); // Compute the number of leading bits we can ignore. // TODO: A better way to determine this would use ComputeNumSignBits(). @@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, // Lower dbg.declare intrinsics otherwise their value may be clobbered // by instcombiner. 
- bool DbgDeclaresChanged = LowerDbgDeclare(F); + bool MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. int Iteration = 0; @@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); - bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist); + MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines, AA, AC, TLI, DT, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; - Changed |= IC.run(); - if (!Changed) + if (!IC.run()) break; } - return DbgDeclaresChanged || Iteration > 1; + return MadeIRChange || Iteration > 1; } PreservedAnalyses InstCombinePass::run(Function &F, diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b034ccc46933..7eea44d6aca0 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -613,7 +613,15 @@ public: bool UseGlobalsGC = true) : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), Recover(Recover || ClRecover), - UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {} + UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + // Not a typo: ClWithComdat is almost completely pointless without + // ClUseGlobalsGC (because then it only works on modules without + // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; + // and both suffer from gold PR19002, for which the UseGlobalsGC + // constructor argument is designed as a workaround. Therefore, disable + // both ClWithComdat and ClUseGlobalsGC unless the frontend says it's + // ok to do globals-gc. + UseCtorComdat(UseGlobalsGC && ClWithComdat) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid StringRef getPassName() const override { return "AddressSanitizerModule"; } @@ -656,6 +664,7 @@ private: bool CompileKernel; bool Recover; bool UseGlobalsGC; + bool UseCtorComdat; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer, : GlobalVariable::PrivateLinkage; GlobalVariable *Metadata = new GlobalVariable( M, Initializer->getType(), false, Linkage, Initializer, - Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName)); + Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName)); Metadata->setSection(getGlobalMetadataSection()); return Metadata; } @@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, use a structure which binds the liveness of // the global variable to the metadata struct. Keep the list of "Liveness" GV // created to be added to llvm.compiler.used - StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr); + StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy); SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size()); for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
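Editor's note: this patch-set also migrates away from the old sentinel-terminated StructType::get / ConstantStruct::get / FunctionType::get varargs overloads -- the trailing nullptr disappears in the hunks above and below. A hedged sketch of why the variadic-template shape is safer, using hypothetical names rather than the real LLVM signatures:

  #include <cstdarg>

  struct Type {}; // stand-in for llvm::Type

  // Old shape: C varargs must be nullptr-terminated, because the callee
  // has no other way to find the end of the argument list.
  static int countMembersOld(Type *First, ...) {
    int N = First ? 1 : 0;
    va_list Ap;
    va_start(Ap, First);
    while (va_arg(Ap, Type *)) // stops only at the nullptr sentinel
      ++N;
    va_end(Ap);
    return N;
  }

  // New shape: the arity is known statically, so the easy-to-forget
  // sentinel argument is gone and mistakes fail to compile instead.
  template <typename... Ts> static int countMembersNew(Ts *...Members) {
    return sizeof...(Members);
  }

  int main() {
    Type A, B;
    // The old call needed an explicitly typed sentinel pointer.
    int Old = countMembersOld(&A, &B, static_cast<Type *>(nullptr));
    int New = countMembersNew(&A, &B);
    return Old == New ? 0 : 1;
  }

@@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, we emit the global metadata in a way that // allows the linker to properly strip dead globals.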
- auto LivenessBinder = ConstantStruct::get( - LivenessTy, Initializer->getAggregateElement(0u), - ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr); + auto LivenessBinder = + ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u), + ConstantExpr::getPointerCast(Metadata, IntptrTy)); GlobalVariable *Liveness = new GlobalVariable( M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder, Twine("__asan_binder_") + G->getName()); @@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool // We initialize an array of such structures and pass it to a run-time call. StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, - IntptrTy, IntptrTy, IntptrTy, nullptr); + IntptrTy, IntptrTy, IntptrTy); SmallVector<GlobalVariable *, 16> NewGlobals(n); SmallVector<Constant *, 16> Initializers(n); @@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); - StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); - Constant *NewInitializer = - ConstantStruct::get(NewTy, G->getInitializer(), - Constant::getNullValue(RightRedZoneTy), nullptr); + StructType *NewTy = StructType::get(Ty, RightRedZoneTy); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy)); // Create a new global variable with enough space for a redzone. GlobalValue::LinkageTypes Linkage = G->getLinkage(); @@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool ConstantExpr::getPointerCast(Name, IntptrTy), ConstantExpr::getPointerCast(ModuleName, IntptrTy), ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, - ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr); + ConstantExpr::getPointerCast(ODRIndicator, IntptrTy)); if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; @@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { + if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, AsanCtorFunction); diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 8786781933ea..e2e3cbdbc295 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) - RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); + RetType = StructType::get(RetType, ShadowTy); return FunctionType::get(RetType, ArgTypes, T->isVarArg()); } @@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) { GetArgTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ArgTLSTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ArgTLSTy), false))); } if (GetRetvalTLSPtr) { RetvalTLS = nullptr; GetRetvalTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ShadowTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ShadowTy), false))); } ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 7dea1dee756a..e89384c559fe 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u64 *ArrayCounter; // }; auto *StructInfoTy = - StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, - Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr); + StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, + Int8PtrPtrTy, Int64PtrTy, Int64PtrTy); auto *StructInfoPtrTy = StructInfoTy->getPointerTo(); // This structure should be kept consistent with the CacheFragInfo struct // in the runtime library. @@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u32 NumStructs; // StructInfo *Structs; // }; - auto *CacheFragInfoTy = - StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr); + auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy); std::vector<StructType *> Vec = M.getIdentifiedStructTypes(); unsigned NumStructs = 0; @@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0); ArrayCounterIdx[1] = ConstantInt::get(Int32Ty, getArrayCounterIdx(StructTy)); - Initializers.push_back( - ConstantStruct::get( - StructInfoTy, - ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), - ConstantInt::get(Int32Ty, - DL.getStructLayout(StructTy)->getSizeInBytes()), - ConstantInt::get(Int32Ty, StructTy->getNumElements()), - Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Offset, Int32PtrTy), - Size == nullptr ? 
ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Size, Int32PtrTy), - TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) : - ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - FieldCounterIdx), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - ArrayCounterIdx), - nullptr)); + Initializers.push_back(ConstantStruct::get( + StructInfoTy, + ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), + ConstantInt::get(Int32Ty, + DL.getStructLayout(StructTy)->getSizeInBytes()), + ConstantInt::get(Int32Ty, StructTy->getNumElements()), + Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Offset, Int32PtrTy), + Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Size, Int32PtrTy), + TypeName == nullptr + ? ConstantPointerNull::get(Int8PtrPtrTy) + : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + FieldCounterIdx), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + ArrayCounterIdx))); } // Structs. Constant *StructInfo; @@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( auto *CacheFragInfoGV = new GlobalVariable( M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage, - ConstantStruct::get(CacheFragInfoTy, - UnitName, - ConstantInt::get(Int32Ty, NumStructs), - StructInfo, - nullptr)); + ConstantStruct::get(CacheFragInfoTy, UnitName, + ConstantInt::get(Int32Ty, NumStructs), StructInfo)); return CacheFragInfoGV; } diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 15333a5317dd..ff753c20a94a 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy, bool Signed = false) { Type *srcTy = V->getType(); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + if (srcSizeInBits > 1 && dstSizeInBits == 1) + return IRB.CreateICmpNE(V, getCleanShadow(V)); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) return IRB.CreateIntCast(V, dstTy, Signed); if (dstTy->isVectorTy() && srcTy->isVectorTy() && dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) return IRB.CreateIntCast(V, dstTy, Signed); - size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); - size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); Value *V2 = IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 3f1a77b49a44..ee493a8ec7e1 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { bool Changed = false; if (!NUW) { - ConstantRange NUWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoUnsignedWrap); + ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoUnsignedWrap); if (!NUWRange.isEmptySet()) { bool NewNUW = 
NUWRange.contains(LazyRRange()); AddOp->setHasNoUnsignedWrap(NewNUW); @@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { } } if (!NSW) { - ConstantRange NSWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoSignedWrap); + ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoSignedWrap); if (!NSWRange.isEmptySet()) { bool NewNSW = NSWRange.contains(LazyRRange()); AddOp->setHasNoSignedWrap(NewNSW); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 48d5ae88cda9..6693a26e8890 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -144,6 +144,10 @@ private: bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); + bool recognizeAndInsertCTLZ(); + void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var, const DebugLoc DL, + bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); /// @} }; @@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, } bool LoopIdiomRecognize::runOnNoncountableLoop() { - return recognizePopcount(); + return recognizePopcount() || recognizeAndInsertCTLZ(); } /// Check if the given conditional branch is based on the comparison between @@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, return true; } +/// Return true if the CTLZ idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting leading zeros (CTLZ), +/// or nullptr if there is none. +/// 2) \p CntPhi is set to the corresponding phi node, +/// or nullptr if there is none. +/// 3) \p Var is set to the value whose CTLZ could be used. +/// 4) \p DefX is set to the instruction calculating the loop exit condition. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 == 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x = phi (x0, x.next); //PhiX +/// cnt = phi(cnt0, cnt.next); +/// +/// cnt.next = cnt + 1; +/// ... +/// x.next = x >> 1; // DefX +/// ... +/// } while(x.next != 0); +/// +/// loop-exit: +/// \endcode +static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, + Instruction *&CntInst, PHINode *&CntPhi, + Instruction *&DefX) { + BasicBlock *LoopEntry; + Value *VarX = nullptr; + + DefX = nullptr; + PhiX = nullptr; + CntInst = nullptr; + CntPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in a desirable form. + if (Value *T = matchCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) + DefX = dyn_cast<Instruction>(T); + else + return false; + + // step 2: Detect instructions corresponding to "x.next = x >> 1". Note that + // the shift amount must actually be a ConstantInt; a guarded dyn_cast here + // would silently accept non-constant shifts. + if (!DefX || DefX->getOpcode() != Instruction::AShr) + return false; + ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; + VarX = DefX->getOperand(0); + + // step 3: Check the recurrence of variable X. + PhiX = dyn_cast<PHINode>(VarX); + if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX)) + return false; + + // step 4: Find the instruction which computes the count: cnt.next = cnt + 1. + // TODO: We can skip this step. If the loop trip count is known (via CTLZ), + // then all uses of "cnt.next" could be optimized to the trip count + // plus "cnt0". Currently this is not optimized. + // This step could be used to detect the POPCNT instruction: + // cnt.next = cnt + (x.next & 1) + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); + if (!Phi || Phi->getParent() != LoopEntry) + continue; + + CntInst = Inst; + CntPhi = Phi; + break; + } + if (!CntInst) + return false; + + return true; +} + +/// Recognize the CTLZ idiom in a non-countable loop and convert the loop +/// to a countable one (with a CTLZ trip count). +/// Returns true if a CTLZ intrinsic was inserted as the new trip count; +/// otherwise, returns false. +bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + Instruction *CntInst, *DefX; + PHINode *CntPhi, *PhiX; + if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX)) + return false; + + bool IsCntPhiUsedOutsideLoop = false; + for (User *U : CntPhi->users()) + if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + IsCntPhiUsedOutsideLoop = true; + break; + } + bool IsCntInstUsedOutsideLoop = false; + for (User *U : CntInst->users()) + if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + IsCntInstUsedOutsideLoop = true; + break; + } + // If both CntInst and CntPhi are used outside the loop, the profitability + // is questionable. + if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop) + return false; + + // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is + // 0. If we cannot guarantee X != 0, we need to check this when expanding + // the intrinsic. + bool ZeroCheck = false; + // It is safe to assume the preheader exists, as this was checked in the + // parent function runOnLoop. + BasicBlock *PH = CurLoop->getLoopPreheader(); + Value *InitX = PhiX->getIncomingValueForBlock(PH); + // If we check X != 0 before entering the loop, we don't need a zero check + // in the CTLZ intrinsic. + if (BasicBlock *PreCondBB = PH->getSinglePredecessor()) + if (BranchInst *PreCondBr = + dyn_cast<BranchInst>(PreCondBB->getTerminator())) { + if (matchCondition(PreCondBr, PH) == InitX) + ZeroCheck = true; + } + + // Check if the CTLZ intrinsic is profitable. Assume it is always profitable + // if we delete the loop (the loop has only 6 instructions): + // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] + // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] + // %shr = ashr %n.addr.0, 1 + // %tobool = icmp eq %shr, 0 + // %inc = add nsw %i.0, 1 + // br i1 %tobool + + IRBuilder<> Builder(PH->getTerminator()); + SmallVector<const Value *, 2> Ops = + {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()}; + ArrayRef<const Value *> Args(Ops); + if (CurLoop->getHeader()->size() != 6 && + TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > + TargetTransformInfo::TCC_Basic) + return false; + + const DebugLoc DL = DefX->getDebugLoc(); + transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck, + IsCntPhiUsedOutsideLoop); + return true; +} +
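Editor's note: the key arithmetic behind recognizeAndInsertCTLZ is that the detected do-while shift loop executes exactly BitWidth - CTLZ(InitX) times for nonzero InitX; the ZeroCheck flag exists precisely because ctlz of 0 may be undefined on some targets. A quick standalone check, using the GCC/Clang builtin __builtin_clz as a stand-in for llvm.ctlz and an unsigned logical shift for simplicity:

  #include <cassert>
  #include <cstdint>

  // Iteration count of the idiom: do { ++cnt; x >>= 1; } while (x != 0);
  static unsigned loopCount(uint32_t X) {
    unsigned Cnt = 0;
    do {
      ++Cnt;
      X >>= 1;
    } while (X != 0);
    return Cnt;
  }

  int main() {
    // For nonzero X, the trip count equals BitWidth - CTLZ(X).
    for (uint32_t X : {1u, 5u, 12345u, 0x80000000u})
      assert(loopCount(X) == 32u - static_cast<unsigned>(__builtin_clz(X)));
    return 0;
  }

/// Recognizes a population count idiom in a non-countable loop.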
/// /// If detected, transforms the relevant code to issue the popcount intrinsic @@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, return CI; } +static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL, bool ZeroCheck) { + Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// Transform the following loop: +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +/// LOOP_BODY +/// Br: loop if (DefX != 0) +/// Use(CntPhi) or Use(CntInst) +/// +/// Into: +/// If CntPhi is used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// PhiCount = PHI [Count, Dec] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +/// Dec = PhiCount - 1 +/// LOOP_BODY +/// Br: loop if (Dec != 0) +/// Use(CountPrev + Cnt0) // Use(CntPhi) +/// or +/// Use(Count + Cnt0) // Use(CntInst) +/// +/// If LOOP_BODY is empty, the loop will be deleted. +/// If CntInst and DefX are not used in LOOP_BODY, they will be removed. +void LoopIdiomRecognize::transformLoopToCountable( + BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, + const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator()); + + // Step 1: Insert the CTLZ intrinsic at the end of the preheader block. + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi, create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + IRBuilder<> Builder(PreheaderBr); + Builder.SetCurrentDebugLocation(DL); + Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; + + if (IsCntPhiUsedOutsideLoop) + InitXNext = Builder.CreateAShr(InitX, + ConstantInt::get(InitX->getType(), 1)); + else + InitXNext = InitX; + CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); + Count = Builder.CreateSub( + ConstantInt::get(CTLZ->getType(), + CTLZ->getType()->getIntegerBitWidth()), + CTLZ); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } + if (IsCntPhiUsedOutsideLoop) + NewCount = Builder.CreateZExtOrTrunc(CountPrev, + cast<IntegerType>(CntInst->getType())); + else + NewCount = Builder.CreateZExtOrTrunc(Count, + cast<IntegerType>(CntInst->getType())); + + // If the CTLZ counter's initial value is not zero, insert an Add instruction. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + + // Step 2: Insert the new IV and loop condition: + // loop: + // ... + // PhiCount = PHI [Count, Dec] + // ... + // Dec = PhiCount - 1 + // ... + // Br: loop if (Dec != 0) + BasicBlock *Body = *(CurLoop->block_begin()); + auto *LbBr = cast<BranchInst>(Body->getTerminator()); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = Count->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast<Instruction>( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(Count, Preheader); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + + // Step 3: All the references to the original counter outside the loop are + // replaced with the NewCount -- the value returned from __builtin_ctlz(x). + if (IsCntPhiUsedOutsideLoop) + CntPhi->replaceUsesOutsideBlock(NewCount, Body); + else + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // Step 4: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 3c9850b156ac..5e0a705782ea 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -283,7 +283,6 @@ public: // Forward propagation info const Expression *getDefiningExpr() const { return DefiningExpr; } - void setDefiningExpr(const Expression *E) { DefiningExpr = E; } // Value member set bool empty() const { return Members.empty(); } @@ -317,6 +316,9 @@ public: --StoreCount; } + // True if this class has no memory members. + bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } + // Return true if two congruence classes are equivalent to each other. This // means that every field but the ID number and the dead field are equivalent. @@ -401,9 +403,12 @@ class NewGVN { MemorySSAWalker *MSSAWalker; const DataLayout &DL; std::unique_ptr<PredicateInfo> PredInfo; - BumpPtrAllocator ExpressionAllocator; - ArrayRecycler<Value *> ArgRecycler; - TarjanSCC SCCFinder; + + // These are the only things the create* functions should have + // side-effects on due to allocating memory. + mutable BumpPtrAllocator ExpressionAllocator; + mutable ArrayRecycler<Value *> ArgRecycler; + mutable TarjanSCC SCCFinder; const SimplifyQuery SQ; // Number of function arguments, used by ranking @@ -430,11 +435,12 @@ class NewGVN { // In order to correctly ensure propagation, we must keep track of what // comparisons we used, so that when the values of the comparisons change, we // propagate the information to the places we used the comparison. - DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers; + mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> + PredicateToUsers;
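Editor's note: the NewGVN changes in this hunk and the ones that follow are a const-correctness pass: the create*/performSymbolic* helpers become const, and the side tables they touch (the expression allocator, the user maps, the cycle-state cache) become mutable. A small sketch of the idiom with illustrative names:

  #include <map>
  #include <string>

  class Analysis {
    // Bookkeeping, not logical state: a const query may still memoize.
    mutable std::map<int, std::string> Cache;

  public:
    // Logically const: callers cannot observe whether the cache was warm.
    const std::string &nameFor(int ID) const {
      auto It = Cache.find(ID);
      if (It == Cache.end())
        It = Cache.emplace(ID, "expr#" + std::to_string(ID)).first;
      return It->second;
    }
  };

// Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for // stores, we no longer can rely solely on the def-use chains of MemorySSA.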
- DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers; + mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> + MemoryToUsers; // A table storing which memorydefs/phis represent a memory state provably // equivalent to another memory state. @@ -457,7 +463,7 @@ class NewGVN { DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState; enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle }; - DenseMap<const PHINode *, PhiCycleState> PhiCycleState; + mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState; // Expression to class mapping. using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>; ExpressionClassMap ExpressionToClass; @@ -511,21 +517,24 @@ public: private: // Expression handling. - const Expression *createExpression(Instruction *); - const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *); - PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge, - bool &AllConstant); - const VariableExpression *createVariableExpression(Value *); - const ConstantExpression *createConstantExpression(Constant *); - const Expression *createVariableOrConstant(Value *V); - const UnknownExpression *createUnknownExpression(Instruction *); + const Expression *createExpression(Instruction *) const; + const Expression *createBinaryExpression(unsigned, Type *, Value *, + Value *) const; + PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + bool &AllConstant) const; + const VariableExpression *createVariableExpression(Value *) const; + const ConstantExpression *createConstantExpression(Constant *) const; + const Expression *createVariableOrConstant(Value *V) const; + const UnknownExpression *createUnknownExpression(Instruction *) const; const StoreExpression *createStoreExpression(StoreInst *, - const MemoryAccess *); + const MemoryAccess *) const; LoadExpression *createLoadExpression(Type *, Value *, LoadInst *, - const MemoryAccess *); - const CallExpression *createCallExpression(CallInst *, const MemoryAccess *); - const AggregateValueExpression *createAggregateValueExpression(Instruction *); - bool setBasicExpressionInfo(Instruction *, BasicExpression *); + const MemoryAccess *) const; + const CallExpression *createCallExpression(CallInst *, + const MemoryAccess *) const; + const AggregateValueExpression * + createAggregateValueExpression(Instruction *) const; + bool setBasicExpressionInfo(Instruction *, BasicExpression *) const; // Congruence class handling. CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) { @@ -560,17 +569,18 @@ private: // Symbolic evaluation. 
const Expression *checkSimplificationResults(Expression *, Instruction *, - Value *); - const Expression *performSymbolicEvaluation(Value *); + Value *) const; + const Expression *performSymbolicEvaluation(Value *) const; const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *, - Instruction *, MemoryAccess *); - const Expression *performSymbolicLoadEvaluation(Instruction *); - const Expression *performSymbolicStoreEvaluation(Instruction *); - const Expression *performSymbolicCallEvaluation(Instruction *); - const Expression *performSymbolicPHIEvaluation(Instruction *); - const Expression *performSymbolicAggrValueEvaluation(Instruction *); - const Expression *performSymbolicCmpEvaluation(Instruction *); - const Expression *performSymbolicPredicateInfoEvaluation(Instruction *); + Instruction *, + MemoryAccess *) const; + const Expression *performSymbolicLoadEvaluation(Instruction *) const; + const Expression *performSymbolicStoreEvaluation(Instruction *) const; + const Expression *performSymbolicCallEvaluation(Instruction *) const; + const Expression *performSymbolicPHIEvaluation(Instruction *) const; + const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; + const Expression *performSymbolicCmpEvaluation(Instruction *) const; + const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -620,8 +630,8 @@ private: void markPredicateUsersTouched(Instruction *); void markValueLeaderChangeTouched(CongruenceClass *CC); void markMemoryLeaderChangeTouched(CongruenceClass *CC); - void addPredicateUsers(const PredicateBase *, Instruction *); - void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U); + void addPredicateUsers(const PredicateBase *, Instruction *) const; + void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const; // Main loop of value numbering void iterateTouchedInstructions(); @@ -634,7 +644,7 @@ private: void verifyIterationSettled(Function &F); bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const; BasicBlock *getBlockForValue(Value *V) const; - void deleteExpression(const Expression *E); + void deleteExpression(const Expression *E) const; unsigned InstrToDFSNum(const Value *V) const { assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses"); return InstrDFS.lookup(V); @@ -654,7 +664,7 @@ private: ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst()) : InstrDFS.lookup(MA); } - bool isCycleFree(const PHINode *PN); + bool isCycleFree(const PHINode *PN) const; template <class T, class Range> T *getMinDFSOfRange(const Range &) const; // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. @@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const { // Delete a definitely dead expression, so it can be reused by the expression // allocator. Some of these are not in creation functions, so we have to accept // const versions. 
-void NewGVN::deleteExpression(const Expression *E) { +void NewGVN::deleteExpression(const Expression *E) const { assert(isa<BasicExpression>(E)); auto *BE = cast<BasicExpression>(E); const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler); @@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) { } PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, - bool &AllConstant) { + bool &AllConstant) const { BasicBlock *PHIBlock = I->getParent(); auto *PN = cast<PHINode>(I); auto *E = @@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock)); + // NewGVN assumes the operands of a PHI node are in a consistent order across + // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix + // this in LLVM at some point, we don't want GVN to find wrong congruences. + // Therefore, here we sort uses in predecessor order. + // We're sorting the values by pointer. In theory this might be a cause of + // non-determinism, but here we don't rely on the ordering for anything + // significant, e.g. we don't create new instructions based on it, so we're + // fine. + SmallVector<const Use *, 4> PHIOperands; + for (const Use &U : PN->operands()) + PHIOperands.push_back(&U); + std::sort(PHIOperands.begin(), PHIOperands.end(), + [&](const Use *U1, const Use *U2) { + return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); + }); + // Filter out unreachable phi operands. - auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) { - return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock}); + auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { + return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}); }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use &U) -> Value * { - auto *BB = PN->getIncomingBlock(U); + [&](const Use *U) -> Value * { + auto *BB = PN->getIncomingBlock(*U); auto *DTN = DT->getNode(BB); if (RPOOrdering.lookup(DTN) >= PHIRPO) HasBackedge = true; - AllConstant &= isa<UndefValue>(U) || isa<Constant>(U); + AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U); // Don't try to transform self-defined phis. - if (U == PN) + if (*U == PN) return PN; - return lookupOperandLeader(U); + return lookupOperandLeader(*U); }); return E; } // Set basic expression info (Arguments, type, opcode) for Expression // E from Instruction I in block B. -bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { +bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const { bool AllConstant = true; if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) E->setType(GEP->getSourceElementType()); @@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { } const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, - Value *Arg1, Value *Arg2) { + Value *Arg1, + Value *Arg2) const { auto *E = new (ExpressionAllocator) BasicExpression(2); E->setType(T);
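Editor's note: the sorting hunk above canonicalizes PHI operand order by incoming block so that structurally identical PHIs produce equal expressions. A toy illustration of the idea, with blocks modeled as opaque addresses (as the patch's own comment notes, pointer order is arbitrary but stable within a run):

  #include <algorithm>
  #include <cassert>
  #include <utility>
  #include <vector>

  int main() {
    int BB1, BB2; // stand-ins for two basic blocks
    using Op = std::pair<const int *, char>; // (incoming block, value)
    std::vector<Op> PhiA = {{&BB1, 'x'}, {&BB2, 'y'}};
    std::vector<Op> PhiB = {{&BB2, 'y'}, {&BB1, 'x'}}; // same PHI, permuted
    auto ByBlock = [](const Op &L, const Op &R) { return L.first < R.first; };
    std::sort(PhiA.begin(), PhiA.end(), ByBlock);
    std::sort(PhiB.begin(), PhiB.end(), ByBlock);
    assert(PhiA == PhiB); // after canonicalization, the PHIs compare equal
    return 0;
  }

@@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression, // TODO: Once finished, this should not take an Instruction; we only // use it for printing.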
const Expression *NewGVN::checkSimplificationResults(Expression *E, - Instruction *I, Value *V) { + Instruction *I, + Value *V) const { if (!V) return nullptr; if (auto *C = dyn_cast<Constant>(V)) { @@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, return nullptr; } -const Expression *NewGVN::createExpression(Instruction *I) { +const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); bool AllConstant = setBasicExpressionInfo(I, E); @@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) { } const AggregateValueExpression * -NewGVN::createAggregateValueExpression(Instruction *I) { +NewGVN::createAggregateValueExpression(Instruction *I) const { if (auto *II = dyn_cast<InsertValueInst>(I)) { auto *E = new (ExpressionAllocator) AggregateValueExpression(I->getNumOperands(), II->getNumIndices()); @@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) { llvm_unreachable("Unhandled type of aggregate value operation"); } -const VariableExpression *NewGVN::createVariableExpression(Value *V) { +const VariableExpression *NewGVN::createVariableExpression(Value *V) const { auto *E = new (ExpressionAllocator) VariableExpression(V); E->setOpcode(V->getValueID()); return E; } -const Expression *NewGVN::createVariableOrConstant(Value *V) { +const Expression *NewGVN::createVariableOrConstant(Value *V) const { if (auto *C = dyn_cast<Constant>(V)) return createConstantExpression(C); return createVariableExpression(V); } -const ConstantExpression *NewGVN::createConstantExpression(Constant *C) { +const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const { auto *E = new (ExpressionAllocator) ConstantExpression(C); E->setOpcode(C->getValueID()); return E; } -const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) { +const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) UnknownExpression(I); E->setOpcode(I->getOpcode()); return E; } -const CallExpression *NewGVN::createCallExpression(CallInst *CI, - const MemoryAccess *MA) { +const CallExpression * +NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { // FIXME: Add operand bundles for calls. 
auto *E = new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA); @@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const { const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const { auto *CC = getMemoryClass(MA); assert(CC->getMemoryLeader() && - "Every MemoryAccess should be mapped to a " - "congruence class with a represenative memory " - "access"); + "Every MemoryAccess should be mapped to a congruence class with a " + "representative memory access"); return CC->getMemoryLeader(); } @@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const { LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, LoadInst *LI, - const MemoryAccess *MA) { + const MemoryAccess *MA) const { auto *E = new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA)); E->allocateOperands(ArgRecycler, ExpressionAllocator); @@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, return E; } -const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, - const MemoryAccess *MA) { +const StoreExpression * +NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const { auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand()); auto *E = new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA); @@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } -const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // Unlike loads, we never try to eliminate stores, so we do not check if they // are simple and avoid value numbering them. auto *SI = cast<StoreInst>(I); @@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { const Expression * NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, LoadInst *LI, Instruction *DepInst, - MemoryAccess *DefiningAccess) { + MemoryAccess *DefiningAccess) const { assert((!LI || LI->isSimple()) && "Not a simple load"); if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. @@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, return nullptr; } -const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { auto *LI = cast<LoadInst>(I); // We can eliminate in favor of non-simple loads, but we won't be able to @@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { } const Expression * -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { +NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return nullptr; @@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { return nullptr; if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { - DEBUG(dbgs() << "Copy is not of any condition operands!"); + DEBUG(dbgs() << "Copy is not of any condition operands!\n"); return nullptr; } Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); @@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { } // Evaluate read only and pure calls, and create an expression result. 
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast<CallInst>(I); if (auto *II = dyn_cast<IntrinsicInst>(I)) { // Intrinsics with the returned attribute are copies of arguments. @@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, DEBUG(dbgs() << "Setting " << *From); DEBUG(dbgs() << " equivalent to congruence class "); DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); - DEBUG(dbgs() << *NewClass->getMemoryLeader()); - DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); auto LookupResult = MemoryAccessToClass.find(From); bool Changed = false; @@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, NewClass->memory_insert(MP); // This may have killed the class if it had no non-memory members if (OldClass->getMemoryLeader() == From) { - if (OldClass->memory_empty()) { + if (OldClass->definesNoMemory()) { OldClass->setMemoryLeader(nullptr); } else { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); @@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, // Determine if a phi is cycle-free. That means the values in the phi don't // depend on any expressions that can change value as a result of the phi. // For example, a non-cycle free phi would be v = phi(0, v+1). -bool NewGVN::isCycleFree(const PHINode *PN) { +bool NewGVN::isCycleFree(const PHINode *PN) const { // In order to compute cycle-freeness, we do SCC finding on the phi, and see // what kind of SCC it ends up in. If it is a singleton, it is cycle-free. // If it is not in a singleton, it is only cycle free if the other members are @@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) { } // Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands @@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { return E; } -const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { +const Expression * +NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { if (auto *EI = dyn_cast<ExtractValueInst>(I)) { auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand()); if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) { @@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { return createAggregateValueExpression(I); } -const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { auto *CI = dyn_cast<CmpInst>(I); // See if our operands are equal to those of a previous predicate, and if so, // if it implies true or false. @@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { } // Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) { +const Expression *NewGVN::performSymbolicEvaluation(Value *V) const { const Expression *E = nullptr; if (auto *C = dyn_cast<Constant>(V)) E = createConstantExpression(C); @@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) { } } -void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) { +void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const { DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); MemoryToUsers[To].insert(U); } @@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) { } // Add I to the set of users of a given predicate. -void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) { +void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const { if (auto *PBranch = dyn_cast<PredicateBranch>(PB)) PredicateToUsers[PBranch->Condition].insert(I); else if (auto *PAssume = dyn_cast<PredicateAssume>(PB)) @@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const { // TODO: If this ends up too slow, we can maintain a next memory leader like we // do for regular leaders. // Make sure there will be a leader to find - assert((CC->getStoreCount() > 0 || !CC->memory_empty()) && - "Can't get next leader if there is none"); + assert(!CC->definesNoMemory() && "Can't get next leader if there is none"); if (CC->getStoreCount() > 0) { if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first)) return MSSA->getMemoryAccess(NL); @@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, setMemoryClass(InstMA, NewClass); // Now, fixup the old class if necessary if (OldClass->getMemoryLeader() == InstMA) { - if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) { + if (!OldClass->definesNoMemory()) { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); DEBUG(dbgs() << "Memory class leader change for class " << OldClass->getID() << " to " @@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) { // If it's a store expression we are using, it means we are not equivalent // to something earlier. - if (isa<StoreExpression>(E)) { - assert(lookupOperandLeader(SI->getValueOperand()) != - NewClass->getLeader()); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + if (auto *SE = dyn_cast<StoreExpression>(E)) { + assert(SE->getStoredValue() != NewClass->getLeader()); + NewClass->setStoredValue(SE->getStoredValue()); markValueLeaderChangeTouched(NewClass); // Shift the new class leader to be the store DEBUG(dbgs() << "Changing leader of congruence class " @@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
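Editor's note: one genuine bug is fixed in passing in the addPredicateUsers hunk above: the second arm cast to PredicateBranch while naming the result PAssume, so users of assume-based predicates were silently never registered; the dyn_cast must target PredicateAssume. A minimal sketch of the failure mode with an LLVM-style dyn_cast (illustrative types, not the real PredicateInfo classes):

  #include <cassert>

  struct PredicateBase { enum Kind { Branch, Assume } K; };
  struct PredicateBranch : PredicateBase { static constexpr Kind ClassKind = Branch; };
  struct PredicateAssume : PredicateBase { static constexpr Kind ClassKind = Assume; };

  template <typename T> static T *dyn_cast(PredicateBase *PB) {
    return PB->K == T::ClassKind ? static_cast<T *>(PB) : nullptr;
  }

  int main() {
    PredicateAssume PA;
    PA.K = PredicateBase::Assume;
    // Buggy shape: both arms test PredicateBranch, so assumes fall through.
    assert(dyn_cast<PredicateBranch>(&PA) == nullptr);
    // Fixed shape: the second arm tests PredicateAssume and now fires.
    assert(dyn_cast<PredicateAssume>(&PA) != nullptr);
    return 0;
  }

// See if we destroyed the class or need to swap leaders.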
if (OldClass->empty() && OldClass != TOPClass) { if (OldClass->getDefiningExpr()) { - DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr() + DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() << " from table\n"); ExpressionToClass.erase(OldClass->getDefiningExpr()); } @@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { } else if (const auto *SE = dyn_cast<StoreExpression>(E)) { StoreInst *SI = SE->getStoreInst(); NewClass->setLeader(SI); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + NewClass->setStoredValue(SE->getStoredValue()); // The RepMemoryAccess field will be filled in properly by the // moveValueToNewCongruenceClass call. } else { @@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const { return false; if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first)) return !isInstructionTriviallyDead(MemDef->getMemoryInst()); + + // We could have phi nodes whose operands are all trivially dead, + // so we don't process them. + if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) { + for (auto &U : MemPHI->incoming_values()) { + if (Instruction *I = dyn_cast<Instruction>(U.get())) { + if (!isInstructionTriviallyDead(I)) + return true; + } + } + return false; + } + return true; }; diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index fb1b47c48276..4f608c97147d 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, /// Update the dominator tree after removing one exiting predecessor of a loop /// exit block. static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, - DominatorTree &DT) { + DominatorTree &DT) { assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && "Cannot have empty predecessors of the loop exit block if we split " "off a block to unswitch!"); @@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, } } +/// Check that all the LCSSA PHI nodes in the loop exit block have trivial +/// incoming values along this edge. +static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, + BasicBlock &ExitBB) { + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + return true; + + // If the incoming value for this edge isn't loop invariant, the unswitch + // won't be trivial. + if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) + return false; + } + llvm_unreachable("Basic blocks should never be empty!"); +} + +/// Rewrite the PHI nodes in an unswitched loop exit basic block. +/// +/// Requires that the loop exit and unswitched basic block are the same, and +/// that the exiting block was a unique predecessor of that block. Rewrites the +/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial +/// PHI nodes from the old preheader that now contains the unswitched +/// terminator. +static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + for (Instruction &I : UnswitchedBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + break; + + // When the loop exit is directly unswitched, we just need to update the + // incoming basic block.
We loop to handle weird cases with repeated + // incoming blocks, but expect to typically only have one operand here. + for (auto i : llvm::seq<int>(0, PN->getNumOperands())) { + assert(PN->getIncomingBlock(i) == &OldExitingBB && + "Found incoming block different from unique predecessor!"); + PN->setIncomingBlock(i, &OldPH); + } + } +} + +/// Rewrite the PHI nodes in the loop exit basic block and the split off +/// unswitched block. +/// +/// Because the exit block remains an exit from the loop, this rewrites the +/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI +/// nodes into the unswitched basic block to select between the value in the +/// old preheader and the loop exit. +static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, + BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + assert(&ExitBB != &UnswitchedBB && + "Must have different loop exit and unswitched blocks!"); + Instruction *InsertPt = &*UnswitchedBB.begin(); + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + break; + + auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, + PN->getName() + ".split", InsertPt); + + // Walk backwards over the old PHI node's inputs to minimize the cost of + // removing each one. We have to do this weird loop manually so that we + // create the same number of new incoming edges in the new PHI as we expect + // each case-based edge to be included in the unswitched switch in some + // cases. + // FIXME: This is really, really gross. It would be much cleaner if LLVM + // allowed us to create a single entry for a predecessor block without + // having separate entries for each "edge" even though these edges are + // required to produce identical results. + for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { + if (PN->getIncomingBlock(i) != &OldExitingBB) + continue; + + Value *Incoming = PN->removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); + } + + // Now replace the old PHI with the new one and wire the old one in as an + // input to the new one. + PN->replaceAllUsesWith(NewPN); + NewPN->addIncoming(PN, &ExitBB); + } +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, assert(L.contains(ContinueBB) && "Cannot have both successors exit and still be in the loop!"); - // If the loop exit block contains phi nodes, this isn't trivial. - // FIXME: We should examine the PHI to determine whether or not we can handle - // it trivially. 
- if (isa<PHINode>(LoopExitBB->begin())) + auto *ParentBB = BI.getParent(); + if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) return false; DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal @@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, BasicBlock *UnswitchedBB; if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { (void)PredBB; - assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!"); + assert(PredBB == BI.getParent() && + "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); } - BasicBlock *ParentBB = BI.getParent(); - // Now splice the branch to gate reaching the new preheader and re-point its // successors. OldPH->getInstList().splice(std::prev(OldPH->end()), @@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // terminator. BranchInst::Create(ContinueBB, ParentBB); + // Rewrite the relevant PHI nodes. + if (UnswitchedBB == LoopExitBB) + rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); + else + rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, + *ParentBB, *OldPH); + // Now we need to update the dominator tree. updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); // But if we split something off of the loop exit block then we also removed @@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!L.isLoopInvariant(LoopCond)) return false; + auto *ParentBB = SI.getParent(); + // FIXME: We should compute this once at the start and update it! SmallVector<BasicBlock *, 16> ExitBlocks; L.getExitBlocks(ExitBlocks); @@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SmallVector<int, 4> ExitCaseIndices; for (auto Case : SI.cases()) { auto *SuccBB = Case.getCaseSuccessor(); - if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin())) + if (ExitBlockSet.count(SuccBB) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) ExitCaseIndices.push_back(Case.getCaseIndex()); } BasicBlock *DefaultExitBB = nullptr; if (ExitBlockSet.count(SI.getDefaultDest()) && - !isa<PHINode>(SI.getDefaultDest()->begin()) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator())) DefaultExitBB = SI.getDefaultDest(); else if (ExitCaseIndices.empty()) @@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (CommonSuccBB) { SI.setDefaultDest(CommonSuccBB); } else { - BasicBlock *ParentBB = SI.getParent(); BasicBlock *UnreachableBB = BasicBlock::Create( ParentBB->getContext(), Twine(ParentBB->getName()) + ".unreachable_default", @@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // Now add the unswitched switch. auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); - // Split any exit blocks with remaining in-loop predecessors. We walk in - // reverse so that we split in the same order as the cases appeared. This is - // purely for convenience of reading the resulting IR, but it doesn't cost - // anything really. + // Rewrite the IR for the unswitched basic blocks. This requires two steps. + // First, we split any exit blocks with remaining in-loop predecessors. Then + // we update the PHIs in one of two ways depending on if there was a split. 
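For intuition, here is the simpler of the two rewrites performed by the helpers above, sketched on its own. The helper name and block names are illustrative, not part of the patch:

// Before unswitching, the exit block is reached only from the exiting block,
// so its LCSSA PHI looks like:
//   exit:
//     %lcssa = phi i32 [ %v, %exiting ]
// After unswitching, the old preheader holds the branch instead, so each
// matching incoming block is simply retargeted:
static void retargetIncomingEdges(PHINode &PN, BasicBlock &OldExitingBB,
                                  BasicBlock &OldPH) {
  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
    if (PN.getIncomingBlock(i) == &OldExitingBB)
      PN.setIncomingBlock(i, &OldPH);
}

The second helper handles the harder case where the exit block keeps other in-loop predecessors, so the unswitched copy gets a fresh ".split" PHI instead.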
@@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
   // Now add the unswitched switch.
   auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
 
-  // Split any exit blocks with remaining in-loop predecessors. We walk in
-  // reverse so that we split in the same order as the cases appeared. This is
-  // purely for convenience of reading the resulting IR, but it doesn't cost
-  // anything really.
+  // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+  // First, we split any exit blocks with remaining in-loop predecessors. Then
+  // we update the PHIs in one of two ways depending on whether there was a
+  // split.
+  // We walk in reverse so that we split in the same order as the cases
+  // appeared. This is purely for convenience of reading the resulting IR, but
+  // it doesn't cost anything really.
+  SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
   SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
   // Handle the default exit if necessary.
   // FIXME: It'd be great if we could merge this with the loop below but LLVM's
   // ranges aren't quite powerful enough yet.
-  if (DefaultExitBB && !pred_empty(DefaultExitBB)) {
-    auto *SplitBB =
-        SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
-    updateLoopExitIDom(DefaultExitBB, L, DT);
-    DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+  if (DefaultExitBB) {
+    if (pred_empty(DefaultExitBB)) {
+      UnswitchedExitBBs.insert(DefaultExitBB);
+      rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+    } else {
+      auto *SplitBB =
+          SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+      rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+                                                *ParentBB, *OldPH);
+      updateLoopExitIDom(DefaultExitBB, L, DT);
+      DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+    }
   }
   // Note that we must use a reference in the for loop so that we update the
   // container.
   for (auto &CasePair : reverse(ExitCases)) {
     // Grab a reference to the exit block in the pair so that we can update it.
-    BasicBlock *&ExitBB = CasePair.second;
+    BasicBlock *ExitBB = CasePair.second;
     // If this case is the last edge into the exit block, we can simply reuse it
     // as it will no longer be a loop exit. No mapping necessary.
-    if (pred_empty(ExitBB))
+    if (pred_empty(ExitBB)) {
+      // Only rewrite once.
+      if (UnswitchedExitBBs.insert(ExitBB).second)
+        rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
       continue;
+    }
 
     // Otherwise we need to split the exit block so that we retain an exit
     // block from the loop and a target for the unswitched condition.
@@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
     if (!SplitExitBB) {
       // If this is the first time we see this, do the split and remember it.
       SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+      rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+                                                *ParentBB, *OldPH);
       updateLoopExitIDom(ExitBB, L, DT);
     }
-    ExitBB = SplitExitBB;
+    // Update the case pair to point to the split block.
+    CasePair.second = SplitExitBB;
   }
 
   // Now add the unswitched cases. We do this in reverse order as we built them
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a0fc966cee2c..a7c308b59877 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
   return false;
 }
 
+static unsigned ComputeSpeculationCost(const Instruction *I,
+                                       const TargetTransformInfo &TTI) {
+  switch (Operator::getOpcode(I)) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Select:
+  case Instruction::Shl:
+  case Instruction::Sub:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::Xor:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::Call:
+  case Instruction::BitCast:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::AddrSpaceCast:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return TTI.getUserCost(I);
+
+  default:
+    return UINT_MAX; // Disallow anything not whitelisted.
+  }
+}
+
 bool SpeculativeExecutionPass::considerHoistingFromTo(
     BasicBlock &FromBlock, BasicBlock &ToBlock) {
   SmallSet<const Instruction *, 8> NotHoisted;
@@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
 
   unsigned TotalSpeculationCost = 0;
   for (auto& I : FromBlock) {
-    const unsigned Cost = TTI->getUserCost(&I);
+    const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
     if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
         AllPrecedingUsesFromBlockHoisted(&I)) {
       TotalSpeculationCost += Cost;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7ffdad597a9b..83ec7f55d1af 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
 
   computeKnownBits(V, Known, DL);
 
-  if (Known.Zero.countLeadingOnes() >= HiBits)
+  if (Known.countMinLeadingZeros() >= HiBits)
     return VALRNG_KNOWN_SHORT;
 
-  if (Known.One.countLeadingZeros() < HiBits)
+  if (Known.countMaxLeadingZeros() < HiBits)
     return VALRNG_LIKELY_LONG;
 
   // Long integer divisions are often used in hashtable implementations. It's
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index d5124ac89016..4aa26fd14fee 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                                   const Twine &NameSuffix, Function *F,
                                   ClonedCodeInfo *CodeInfo) {
+  DenseMap<const MDNode *, MDNode *> Cache;
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
   if (BB->hasName())
     NewBB->setName(BB->getName()+NameSuffix);
@@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
   for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
        II != IE; ++II) {
     Instruction *NewInst = II->clone();
+    if (F && F->getSubprogram())
+      DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
+                                  F->getSubprogram(), Cache);
     if (II->hasName())
       NewInst->setName(II->getName()+NameSuffix);
     NewBB->getInstList().push_back(NewInst);
@@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
   SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
   OldFunc->getAllMetadata(MDs);
-  for (auto MD : MDs)
-    NewFunc->addMetadata(
-        MD.first,
-        *MapMetadata(MD.second, VMap,
-                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
-                     TypeMapper, Materializer));
+  for (auto MD : MDs) {
+    MDNode *NewMD;
+    bool MustCloneSP =
+        (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
+         OldFunc->getParent() == NewFunc->getParent());
+    if (MustCloneSP) {
+      auto *SP = cast<DISubprogram>(MD.second);
+      NewMD = DISubprogram::getDistinct(
+          NewFunc->getContext(), SP->getScope(), SP->getName(),
+          NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(),
+          SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
+          SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
+          SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
+          SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
+          SP->getVariables(), SP->getThrownTypes());
+    } else
+      NewMD =
+          MapMetadata(MD.second, VMap,
+                      ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                      TypeMapper, Materializer);
+    NewFunc->addMetadata(MD.first, *NewMD);
+  }
 
   // Loop over all of the basic blocks in the function, cloning them as
   // appropriate.  Note that we save BE this way in order to handle cloning of
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 4e9d67252d6c..5444b752de82 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
     else
       GV = new GlobalVariable(
           *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
-          (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+          nullptr, I->getName(), nullptr,
          I->getThreadLocalMode(), I->getType()->getAddressSpace());
     VMap[&*I] = GV;
     // We do not copy attributes (mainly because copying between different
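Several hunks below drop a trailing nullptr argument from StructType::get calls. In this revision of LLVM, StructType::get stopped being a null-terminated vararg function and became variadic over the element types. A minimal sketch of the two spellings (assuming an LLVMContext &Ctx is available):

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

StructType *makePair(LLVMContext &Ctx) {
  Type *I8Ptr = Type::getInt8PtrTy(Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  // Old form: StructType::get(I8Ptr, I32, nullptr);
  StructType *A = StructType::get(I8Ptr, I32);        // new variadic form
  StructType *B = StructType::get(Ctx, {I8Ptr, I32}); // equivalent ArrayRef form
  // Literal struct types are uniqued, so both spellings yield the same type.
  (void)B;
  return A;
}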
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c2386554da5..78d7474e5b95 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
   // Create a cleanup block.
   LLVMContext &C = F.getContext();
   BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
-  Type *ExnTy =
-      StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+  Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
   if (!F.hasPersonalityFn()) {
     Constant *PersFn = getDefaultPersonalityFn(F.getParent());
     F.setPersonalityFn(PersFn);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6d56e08af99f..9cb4762b683c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
   return false;
 }
 
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
-                    LLVMContext &Ctx,
-                    DenseMap<const DILocation *, DILocation *> &IANodes) {
-  SmallVector<DILocation *, 3> InlinedAtLocations;
-  DILocation *Last = InlinedAtNode;
-  DILocation *CurInlinedAt = DL;
-
-  // Gather all the inlined-at nodes
-  while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
-    // Skip any we've already built nodes for
-    if (DILocation *Found = IANodes[IA]) {
-      Last = Found;
-      break;
-    }
-
-    InlinedAtLocations.push_back(IA);
-    CurInlinedAt = IA;
-  }
-
-  // Starting from the top, rebuild the nodes to point to the new inlined-at
-  // location (then rebuilding the rest of the chain behind it) and update the
-  // map of already-constructed inlined-at nodes.
-  for (const DILocation *MD : reverse(InlinedAtLocations)) {
-    Last = IANodes[MD] = DILocation::getDistinct(
-        Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
-  }
-
-  // And finally create the normal location for this instruction, referring to
-  // the new inlined-at chain.
-  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
 /// Return the result of AI->isStaticAlloca() if AI were moved to the entry
 /// block. Allocas used in inalloca calls and allocas of dynamic array size
 /// cannot be static.
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   // Cache the inlined-at nodes as they're built so they are reused, without
   // this every instruction's inlined-at chain would become distinct from each
   // other.
-  DenseMap<const DILocation *, DILocation *> IANodes;
+  DenseMap<const MDNode *, MDNode *> IANodes;
 
   for (; FI != Fn->end(); ++FI) {
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
          BI != BE; ++BI) {
       if (DebugLoc DL = BI->getDebugLoc()) {
-        BI->setDebugLoc(
-            updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+        auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+                                            IANodes);
+        auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+        BI->setDebugLoc(IDL);
         continue;
       }
 
@@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
 /// Update the branch metadata for cloned call instructions.
 static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
                               const Optional<uint64_t> &CalleeEntryCount,
-                              const Instruction *TheCall) {
+                              const Instruction *TheCall,
+                              ProfileSummaryInfo *PSI) {
   if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
     return;
   Optional<uint64_t> CallSiteCount =
-      ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+      PSI ? PSI->getProfileCount(TheCall, nullptr) : None;
   uint64_t CallCount =
       std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
                CalleeEntryCount.getValue());
@@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
 /// The callsite's block count is subtracted from the callee's function entry
 /// count.
 static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
-                              Instruction *CallInst, Function *Callee) {
+                              Instruction *CallInst, Function *Callee,
+                              ProfileSummaryInfo *PSI) {
   // If the callee has a original count of N, and the estimated count of
   // callsite is M, the new callee count is set to N - M. M is estimated from
   // the caller's entry count, its entry block frequency and the block frequency
   // of the callsite.
   Optional<uint64_t> CalleeCount = Callee->getEntryCount();
-  if (!CalleeCount.hasValue())
+  if (!CalleeCount.hasValue() || !PSI)
     return;
-  Optional<uint64_t> CallCount =
-      ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+  Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
   if (!CallCount.hasValue())
     return;
   // Since CallSiteCount is an estimate, it could exceed the original callee
@@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
       updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
                       CalledFunc->front());
 
-    updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+    updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+                      IFI.PSI);
     // Update the profile count of callee.
-    updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+    updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
 
   // Inject byval arguments initialization.
   for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d1db05..53b432fcafd4 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -26,16 +26,15 @@ namespace {
     InstNamer() : FunctionPass(ID) {
       initializeInstNamerPass(*PassRegistry::getPassRegistry());
     }
-    
+
     void getAnalysisUsage(AnalysisUsage &Info) const override {
       Info.setPreservesAll();
     }
 
     bool runOnFunction(Function &F) override {
-      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
-           AI != AE; ++AI)
-        if (!AI->hasName() && !AI->getType()->isVoidTy())
-          AI->setName("arg");
+      for (auto &Arg : F.args())
+        if (!Arg.hasName())
+          Arg.setName("arg");
 
       for (BasicBlock &BB : F) {
         if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
       return true;
     }
   };
-  
+
   char InstNamer::ID = 0;
 }
 
-INITIALIZE_PASS(InstNamer, "instnamer", 
+INITIALIZE_PASS(InstNamer, "instnamer",
                 "Assign names to anonymous instructions", false, false)
 char &llvm::InstructionNamerID = InstNamer::ID;
 //===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index ce6b703f3528..1ca509472b5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
   KnownBits Known(BitWidth);
   computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
-  unsigned TrailZ = Known.Zero.countTrailingOnes();
+  unsigned TrailZ = Known.countMinTrailingZeros();
 
   // Avoid trouble with ridiculously large TrailZ values, such as
   // those computed from a null pointer.
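The KnownBits changes in this commit replace open-coded queries on the Zero/One APInts with named helpers. A sketch of the correspondences relied on here, paraphrased from llvm/Support/KnownBits.h (treat the exact wording as an assumption):

#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Known.countMinTrailingZeros() == Known.Zero.countTrailingOnes():
//   bits proven zero at the bottom of the value.
// Known.countMinLeadingZeros()  == Known.Zero.countLeadingOnes():
//   bits proven zero at the top.
// Known.countMaxLeadingZeros()  == Known.One.countLeadingZeros():
//   an upper bound; a leading zero is ruled out only where a One is proven.
unsigned minAlignmentLog2(const KnownBits &Known) {
  // Exactly what getOrEnforceKnownAlignment computes above.
  return Known.countMinTrailingZeros();
}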
@@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
 void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            StoreInst *SI, DIBuilder &Builder) {
   auto *DIVar = DDI->getVariable();
-  auto *DIExpr = DDI->getExpression();
   assert(DIVar && "Missing variable");
+  auto *DIExpr = DDI->getExpression();
+  Value *DV = SI->getOperand(0);
 
   // If an argument is zero extended then use argument directly. The ZExt
   // may be zapped by an optimization pass in future.
@@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
     ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
   if (ExtendedArg) {
-    // We're now only describing a subset of the variable. The fragment we're
-    // describing will always be smaller than the variable size, because
-    // VariableSize == Size of Alloca described by DDI. Since SI stores
-    // to the alloca described by DDI, if it's first operand is an extend,
-    // we're guaranteed that before extension, the value was narrower than
-    // the size of the alloca, hence the size of the described variable.
-    SmallVector<uint64_t, 3> Ops;
-    unsigned FragmentOffset = 0;
-    // If this already is a bit fragment, we drop the bit fragment from the
-    // expression and record the offset.
-    auto Fragment = DIExpr->getFragmentInfo();
-    if (Fragment) {
-      Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
-      FragmentOffset = Fragment->OffsetInBits;
-    } else {
-      Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+    // If this DDI was already describing only a fragment of a variable, ensure
+    // that fragment is appropriately narrowed here.
+    // But if a fragment wasn't used, describe the value as the original
+    // argument (rather than the zext or sext) so that it remains described even
+    // if the sext/zext is optimized away. This widens the variable description,
+    // leaving it up to the consumer to know how the smaller value may be
+    // represented in a larger register.
+    if (auto Fragment = DIExpr->getFragmentInfo()) {
+      unsigned FragmentOffset = Fragment->OffsetInBits;
+      SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+                                   DIExpr->elements_end() - 3);
+      Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+      Ops.push_back(FragmentOffset);
+      const DataLayout &DL = DDI->getModule()->getDataLayout();
+      Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+      DIExpr = Builder.createExpression(Ops);
     }
-    Ops.push_back(dwarf::DW_OP_LLVM_fragment);
-    Ops.push_back(FragmentOffset);
-    const DataLayout &DL = DDI->getModule()->getDataLayout();
-    Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
-    auto NewDIExpr = Builder.createExpression(Ops);
-    if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
-      Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
-                                      DDI->getDebugLoc(), SI);
-  } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
-    Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
-                                    DDI->getDebugLoc(), SI);
+    DV = ExtendedArg;
+  }
+  if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+    Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+                                    SI);
 }
 
 /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
   combineMetadata(K, J, KnownIDs);
 }
 
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
-                                        DominatorTree &DT,
-                                        const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+                                         const RootType &Root,
+                                         const DominatesFn &Dominates) {
   assert(From->getType() == To->getType());
-  
+
   unsigned Count = 0;
   for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
-       UI != UE; ) {
+       UI != UE;) {
     Use &U = *UI++;
-    if (DT.dominates(Root, U)) {
-      U.set(To);
-      DEBUG(dbgs() << "Replace dominated use of '"
-            << From->getName() << "' as "
-            << *To << " in " << *U << "\n");
-      ++Count;
-    }
+    if (!Dominates(Root, U))
+      continue;
+    U.set(To);
+    DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+                 << *To << " in " << *U << "\n");
+    ++Count;
   }
   return Count;
 }
 
 unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
                                         DominatorTree &DT,
-                                        const BasicBlock *BB) {
-  assert(From->getType() == To->getType());
+                                        const BasicBlockEdge &Root) {
+  auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+    return DT.dominates(Root, U);
+  };
+  return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
 
-  unsigned Count = 0;
-  for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
-       UI != UE;) {
-    Use &U = *UI++;
-    auto *I = cast<Instruction>(U.getUser());
-    if (DT.properlyDominates(BB, I->getParent())) {
-      U.set(To);
-      DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
-            << *To << " in " << *U << "\n");
-      ++Count;
-    }
-  }
-  return Count;
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+                                        DominatorTree &DT,
+                                        const BasicBlock *BB) {
+  auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+    auto *I = cast<Instruction>(U.getUser())->getParent();
+    return DT.properlyDominates(BB, I);
+  };
+  return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
 }
 
 bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011d..81f033e7d51a 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
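The context line opening the next hunk is the tail of getLoopEstimatedTripCount, which turns the latch's !prof branch_weights into a trip-count estimate with rounded integer division. A worked example, assuming the latch's true edge exits the loop: with weights TrueVal = 10 (exits) and FalseVal = 990 (backedges taken), the estimate is

  (990 + 10 / 2) / 10 = 99,

i.e. the backedge-to-exit weight ratio rounded to the nearest whole number.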
@@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   else
     return (FalseVal + (TrueVal / 2)) / TrueVal;
 }
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)) {
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
+  }
+  return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
+  unsigned VF = Src->getType()->getVectorNumElements();
+  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+  // and vector ops, reducing the set of values being computed by half each
+  // round.
+  assert(isPowerOf2_32(VF) &&
+         "Reduction emission only supported for pow2 vectors!");
+  Value *TmpVec = Src;
+  SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+  for (unsigned i = VF; i != 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+    // Fill the rest of the mask with undef.
+    std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+              UndefValue::get(Builder.getInt32Ty()));
+
+    Value *Shuf = Builder.CreateShuffleVector(
+        TmpVec, UndefValue::get(TmpVec->getType()),
+        ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      // Floating point operations had to be 'fast' to enable the reduction.
+      TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+                                                   TmpVec, Shuf, "bin.rdx"));
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+                                                    Shuf);
+    }
+    if (!RedOps.empty())
+      propagateIRFlags(TmpVec, RedOps);
+  }
+  // The result is in the first element of the vector.
+  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+    IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+    Value *Src, TargetTransformInfo::ReductionFlags Flags,
+    ArrayRef<Value *> RedOps) {
+  assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+  Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+  std::function<Value*()> BuildFunc;
+  using RD = RecurrenceDescriptor;
+  RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+  // TODO: Support creating ordered reductions.
+  FastMathFlags FMFUnsafe;
+  FMFUnsafe.setUnsafeAlgebra();
+
+  switch (Opcode) {
+  case Instruction::Add:
+    BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+    break;
+  case Instruction::Mul:
+    BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+    break;
+  case Instruction::And:
+    BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+    break;
+  case Instruction::Or:
+    BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+    break;
+  case Instruction::Xor:
+    BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+    break;
+  case Instruction::FAdd:
+    BuildFunc = [&]() {
+      auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+      return Rdx;
+    };
+    break;
+  case Instruction::FMul:
+    BuildFunc = [&]() {
+      auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+      return Rdx;
+    };
+    break;
+  case Instruction::ICmp:
+    if (Flags.IsMaxOp) {
+      MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+      BuildFunc = [&]() {
+        return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+      };
+    } else {
+      MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+      BuildFunc = [&]() {
+        return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+      };
+    }
+    break;
+  case Instruction::FCmp:
+    if (Flags.IsMaxOp) {
+      MinMaxKind = RD::MRK_FloatMax;
+      BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+    } else {
+      MinMaxKind = RD::MRK_FloatMin;
+      BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+    }
+    break;
+  default:
+    llvm_unreachable("Unhandled opcode");
+    break;
+  }
+  if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+    return BuildFunc();
+  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+                                   const TargetTransformInfo *TTI,
+                                   RecurrenceDescriptor &Desc, Value *Src,
+                                   bool NoNaN) {
+  // TODO: Support in-order reductions based on the recurrence descriptor.
+  RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+  TargetTransformInfo::ReductionFlags Flags;
+  Flags.NoNaN = NoNaN;
+  auto getSimpleRdx = [&](unsigned Opc) {
+    return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+  };
+  switch (RecKind) {
+  case RecurrenceDescriptor::RK_FloatAdd:
+    return getSimpleRdx(Instruction::FAdd);
+  case RecurrenceDescriptor::RK_FloatMult:
+    return getSimpleRdx(Instruction::FMul);
+  case RecurrenceDescriptor::RK_IntegerAdd:
+    return getSimpleRdx(Instruction::Add);
+  case RecurrenceDescriptor::RK_IntegerMult:
+    return getSimpleRdx(Instruction::Mul);
+  case RecurrenceDescriptor::RK_IntegerAnd:
+    return getSimpleRdx(Instruction::And);
+  case RecurrenceDescriptor::RK_IntegerOr:
+    return getSimpleRdx(Instruction::Or);
+  case RecurrenceDescriptor::RK_IntegerXor:
+    return getSimpleRdx(Instruction::Xor);
+  case RecurrenceDescriptor::RK_IntegerMinMax: {
+    switch (Desc.getMinMaxRecurrenceKind()) {
+    case RecurrenceDescriptor::MRK_SIntMax:
+      Flags.IsSigned = true;
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMax:
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_SIntMin:
+      Flags.IsSigned = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMin:
+      break;
+    default:
+      llvm_unreachable("Unhandled MRK");
+    }
+    return getSimpleRdx(Instruction::ICmp);
+  }
+  case RecurrenceDescriptor::RK_FloatMinMax: {
+    Flags.IsMaxOp =
+        Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+    return getSimpleRdx(Instruction::FCmp);
+  }
+  default:
+    llvm_unreachable("Unhandled RecKind");
+  }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<Instruction>(I)) {
+    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+      // '1'.
+      VecOp->copyIRFlags(I0);
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+          VecOp->andIRFlags(Scalar);
+      }
+    }
+  }
+}
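To make the log2 shuffle reduction in getShuffleReduction concrete, here is the shape of the IR it emits for a <4 x i32> add reduction, shown as an illustrative sketch (value names follow the "rdx.shuf"/"bin.rdx" strings in the code above; this is not output from the patch):

// Round 1: mask <2, 3, undef, undef> moves the upper half down.
//   %rdx.shuf  = shufflevector <4 x i32> %src, <4 x i32> undef,
//                              <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
//   %bin.rdx   = add <4 x i32> %src, %rdx.shuf   ; lanes: a0+a2, a1+a3, u, u
// Round 2: mask <1, undef, undef, undef>.
//   %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
//                              <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
//   %bin.rdx1  = add <4 x i32> %bin.rdx, %rdx.shuf1 ; lane 0 holds the full sum
//   %sum       = extractelement <4 x i32> %bin.rdx1, i32 0

createSimpleTargetReduction only falls back to this sequence when TTI->useReductionIntrinsic returns false; otherwise it emits the corresponding llvm.experimental.vector.reduce intrinsic via the BuildFunc lambdas.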
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
     // Upgrade a 2-field global array type to the new 3-field format if needed.
     if (Data && OldEltTy->getNumElements() < 3)
       EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                              IRB.getInt8PtrTy(), nullptr);
+                              IRB.getInt8PtrTy());
     else
       EltTy = OldEltTy;
     if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
       for (unsigned i = 0; i != n; ++i) {
         auto Ctor = cast<Constant>(Init->getOperand(i));
         if (EltTy != OldEltTy)
-          Ctor = ConstantStruct::get(
-              EltTy, Ctor->getAggregateElement((unsigned)0),
-              Ctor->getAggregateElement(1),
-              Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+          Ctor =
+              ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+                                  Ctor->getAggregateElement(1),
+                                  Constant::getNullValue(IRB.getInt8PtrTy()));
         CurrentCtors.push_back(Ctor);
       }
     }
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
   } else {
     // Use the new three-field struct if there isn't one already.
     EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                            IRB.getInt8PtrTy(), nullptr);
+                            IRB.getInt8PtrTy());
   }
 
   // Build a 2 or 3 field global_ctor entry.  We don't take a comdat key.
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 9e71d746de34..1de579ed41b0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
     // x86_64 can't use {float, float} since that would be returned in both
     // xmm0 and xmm1, which isn't what a real struct would do.
     ResTy = T.getArch() == Triple::x86_64
-            ? static_cast<Type *>(VectorType::get(ArgTy, 2))
-            : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+                ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+                : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
   } else {
     Name = "__sincospi_stret";
-    ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+    ResTy = StructType::get(ArgTy, ArgTy);
   }
 
   Module *M = OrigCallee->getParent();
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 83bd29dbca65..60d9ede2c487 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
                                      const DataLayout &DL) {
   LLVMContext &Ctx = SrcVal->getType()->getContext();
 
+  // If two pointers are in the same address space, they have the same size,
+  // so we don't need to do any truncation, etc. This avoids introducing
+  // ptrtoint instructions for pointers that may be non-integral.
+  if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+      cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+          cast<PointerType>(LoadTy)->getAddressSpace()) {
+    return SrcVal;
+  }
+
   uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
   // Compute which bits of the stored value are being used by the load.  Convert
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 84d89f103a2f..930972924c3c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
     Constant *NewV;
     if (IsOldCtorDtor) {
       auto *S = cast<ConstantStruct>(V);
-      auto *E1 = mapValue(S->getOperand(0));
-      auto *E2 = mapValue(S->getOperand(1));
-      Value *Null = Constant::getNullValue(VoidPtrTy);
-      NewV =
-          ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+      auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+      auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+      Constant *Null = Constant::getNullValue(VoidPtrTy);
+      NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
     } else {
       NewV = cast_or_null<Constant>(mapValue(V));
     }
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d72..9cf66382b581 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   if (!Safe) {
     KnownBits Known(BitWidth);
     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
       Safe = true;
   }
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a453962..516ab7d03a88 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
         TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
         AddedSafetyChecks(false) {}
 
-  // Perform the actual loop widening (vectorization).
-  void vectorize() {
-    // Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop();
-    // Widen each instruction in the old loop to a new one in the new loop.
-    vectorizeLoop();
-  }
+  /// Create a new empty loop. Unlink the old loop and connect the new one.
+  void createVectorizedLoopSkeleton();
+
+  /// Vectorize a single instruction within the innermost loop.
+  void vectorizeInstruction(Instruction &I);
+
+  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+  void fixVectorizedLoop();
 
   // Return true if any runtime check is added.
   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
       EdgeMaskCacheTy;
   typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
 
-  /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop();
-
   /// Set up the values of the IVs correctly when exiting the vector loop.
   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                     Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
   /// Create a new induction variable inside L.
   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                    Value *Step, Instruction *DL);
-  /// Copy and widen the instructions from the old loop.
-  virtual void vectorizeLoop();
 
   /// Handle all cross-iteration phis in the header.
   void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
   /// vectorizing this phi node.
   void fixReduction(PHINode *Phi);
 
-  /// \brief The Loop exit block may have single value PHI nodes where the
-  /// incoming value is 'Undef'. While vectorizing we only handled real values
-  /// that were defined inside the loop. Here we fix the 'undef case'.
-  /// See PR14725.
+  /// \brief The Loop exit block may have single value PHI nodes with some
+  /// incoming value. While vectorizing we only handled real values
+  /// that were defined inside the loop and we should have one value for
+  /// each predecessor of its parent basic block. See PR14725.
   void fixLCSSAPHIs();
 
   /// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
   /// respective conditions.
   void predicateInstructions();
 
-  /// Collect the instructions from the original loop that would be trivially
-  /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions(
-      SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
   /// represented as.
   void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
   /// and DST.
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
-  /// A helper function to vectorize a single instruction within the innermost
-  /// loop.
-  void vectorizeInstruction(Instruction &I);
-
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
+  // Returns true if the NoNaN attribute is set on the function.
+  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
 /// passed Legality checks.
 class LoopVectorizationPlanner {
 public:
-  LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+  LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+                           LoopVectorizationLegality *Legal,
+                           LoopVectorizationCostModel &CM)
+      : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
 
   ~LoopVectorizationPlanner() {}
 
@@ -2193,7 +2186,25 @@ public:
   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
                                                        unsigned UserVF);
 
+  /// Generate the IR code for the vectorized loop.
+  void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+  /// Collect the instructions from the original loop that would be trivially
+  /// dead in the vectorized loop if generated.
+  void collectTriviallyDeadInstructions(
+      SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
 private:
+  /// The loop that we evaluate.
+  Loop *OrigLoop;
+
+  /// Loop Info analysis.
+  LoopInfo *LI;
+
+  /// The legality analysis.
+  LoopVectorizationLegality *Legal;
+
   /// The profitability analysis.
   LoopVectorizationCostModel &CM;
 };
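With vectorize() removed from InnerLoopVectorizer, the planner now drives code generation itself. A minimal sketch of the new call sequence, matching the processLoop changes later in this patch:

LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL,
                       &CM);
// executePlan runs the three phases declared above: skeleton creation,
// per-instruction widening, and final fixup.
LVP.executePlan(LB);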
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
     LVer->prepareNoAliasMetadata();
 }
 
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   /*
    In this function we generate a new loop. The new loop will contain the
    vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
   }
 }
 
-void InnerLoopVectorizer::vectorizeLoop() {
-  //===------------------------------------------------===//
-  //
-  // Notice: any optimization or new instruction that go
-  // into the code below should be also be implemented in
-  // the cost-model.
-  //
-  //===------------------------------------------------===//
-
-  // Collect instructions from the original loop that will become trivially dead
-  // in the vectorized loop. We don't need to vectorize these instructions. For
-  // example, original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-  collectTriviallyDeadInstructions(DeadInstructions);
-
-  // Scan the loop in a topological order to ensure that defs are vectorized
-  // before users.
-  LoopBlocksDFS DFS(OrigLoop);
-  DFS.perform(LI);
-
-  // Vectorize all instructions in the original loop that will not become
-  // trivially dead when vectorized.
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
-    for (Instruction &I : *BB)
-      if (!DeadInstructions.count(&I))
-        vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
   // Insert truncates and extends for any truncated instructions as hints to
   // InstCombine.
   if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
 
   // Set the insertion point after the previous value if it is an instruction.
   // Note that the previous value may have been constant-folded so it is not
-  // guaranteed to be an instruction in the vector loop.
-  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+  // guaranteed to be an instruction in the vector loop. Also, if the previous
+  // value is a phi node, we should insert after all the phi nodes to avoid
+  // breaking basic block verification.
+  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+      isa<PHINode>(PreviousParts[UF - 1]))
     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
   else
     Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   }
 
   if (VF > 1) {
-    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-    // and vector ops, reducing the set of values being computed by half each
-    // round.
-    assert(isPowerOf2_32(VF) &&
-           "Reduction emission only supported for pow2 vectors!");
-    Value *TmpVec = ReducedPartRdx;
-    SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
-    for (unsigned i = VF; i != 1; i >>= 1) {
-      // Move the upper half of the vector to the lower half.
-      for (unsigned j = 0; j != i / 2; ++j)
-        ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
-      // Fill the rest of the mask with undef.
-      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
-                UndefValue::get(Builder.getInt32Ty()));
-
-      Value *Shuf = Builder.CreateShuffleVector(
-          TmpVec, UndefValue::get(TmpVec->getType()),
-          ConstantVector::get(ShuffleMask), "rdx.shuf");
-
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        // Floating point operations had to be 'fast' to enable the reduction.
-        TmpVec = addFastMathFlag(Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
-      else
-        TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
-                                                      TmpVec, Shuf);
-    }
-
-    // The result is in the first element of the vector.
+    bool NoNaN = Legal->hasFunNoNaNAttr();
     ReducedPartRdx =
-        Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
     // If the reduction can be performed in a smaller type, we need to extend
     // the reduction to the wider type before we branch to the original loop.
     if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
     auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
     if (!LCSSAPhi)
       break;
-    if (LCSSAPhi->getNumIncomingValues() == 1)
-      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
-                            LoopMiddleBlock);
-  }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
-    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-  BasicBlock *Latch = OrigLoop->getLoopLatch();
-
-  // We create new control-flow for the vectorized loop, so the original
-  // condition will be dead after vectorization if it's only used by the
-  // branch.
-  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && Cmp->hasOneUse())
-    DeadInstructions.insert(Cmp);
-
-  // We create new "steps" for induction variable updates to which the original
-  // induction variables map. An original update instruction will be dead if
-  // all its users except the induction variable are dead.
-  for (auto &Induction : *Legal->getInductionVars()) {
-    PHINode *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-    if (all_of(IndUpdate->users(), [&](User *U) -> bool {
-          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
-        }))
-      DeadInstructions.insert(IndUpdate);
+    if (LCSSAPhi->getNumIncomingValues() == 1) {
+      assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+             "Incoming value isn't loop invariant");
+      LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+    }
   }
 }
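For intuition on the "trivially dead" set that moves to the planner in the next hunk, consider a canonical counted loop, sketched here in comments (illustrative IR, not from the patch):

//   loop:
//     %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
//     ...
//     %iv.next = add i64 %iv, 1
//     %cmp = icmp eq i64 %iv.next, %n
//     br i1 %cmp, label %exit, label %loop
//
// The vectorizer emits its own induction steps and a new latch condition, so
// %cmp (whose only use is the branch) and %iv.next (used only by %iv and %cmp)
// never need to be widened; they are collected and skipped instead.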
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   return CM.selectVectorizationFactor(MaxVF);
 }
 
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+  // Perform the actual loop transformation.
+
+  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+  ILV.createVectorizedLoopSkeleton();
+
+  //===------------------------------------------------===//
+  //
+  // Notice: any optimization or new instruction that go
+  // into the code below should also be implemented in
+  // the cost-model.
+  //
+  //===------------------------------------------------===//
+
+  // 2. Copy and widen instructions from the old loop into the new loop.
+
+  // Collect instructions from the original loop that will become trivially dead
+  // in the vectorized loop. We don't need to vectorize these instructions. For
+  // example, original induction update instructions can become dead because we
+  // separately emit induction "steps" when generating code for the new loop.
+  // Similarly, we create a new latch condition when setting up the structure
+  // of the new loop, so the old one can become dead.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
+  collectTriviallyDeadInstructions(DeadInstructions);
+
+  // Scan the loop in a topological order to ensure that defs are vectorized
+  // before users.
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
+
+  // Vectorize all instructions in the original loop that will not become
+  // trivially dead when vectorized.
+  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+    for (Instruction &I : *BB)
+      if (!DeadInstructions.count(&I))
+        ILV.vectorizeInstruction(I);
+
+  // 3. Fix the vectorized code: take care of header phi's, live-outs,
+  //    predication, updating analyses.
+  ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+  BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+  // We create new control-flow for the vectorized loop, so the original
+  // condition will be dead after vectorization if it's only used by the
+  // branch.
+  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+  if (Cmp && Cmp->hasOneUse())
+    DeadInstructions.insert(Cmp);
+
+  // We create new "steps" for induction variable updates to which the original
+  // induction variables map. An original update instruction will be dead if
+  // all its users except the induction variable are dead.
+  for (auto &Induction : *Legal->getInductionVars()) {
+    PHINode *Ind = Induction.first;
+    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+    if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+        }))
+      DeadInstructions.insert(IndUpdate);
+  }
+}
+
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
   auto *SI = dyn_cast<StoreInst>(Instr);
   bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(CM);
+  LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
 
   // Get user vectorization factor.
   unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // interleave it.
     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                &CM);
-    Unroller.vectorize();
+    LVP.executePlan(Unroller);
 
     ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // If we decided that it is *legal* to vectorize the loop, then do it.
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                            &LVL, &CM);
-    LB.vectorize();
+    LVP.executePlan(LB);
     ++LoopsVectorized;
 
     // Add metadata to disable runtime unrolling a scalar loop when there are
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c..64013d6d687d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   return Opcode;
 }
 
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
-  if (auto *VecOp = dyn_cast<Instruction>(I)) {
-    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
-      // VecOVp is initialized to the 0th scalar, so start counting from index
-      // '1'.
-      VecOp->copyIRFlags(I0);
-      for (int i = 1, e = VL.size(); i < e; ++i) {
-        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
-          VecOp->andIRFlags(Scalar);
-      }
-    }
-  }
-}
-
 /// \returns true if all of the values in \p VL have the same type or false
 /// otherwise.
 static bool allSameType(ArrayRef<Value *> VL) {
@@ -315,10 +300,10 @@ public:
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
-          const DataLayout *DL)
+          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
         SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
-        DL(DL), Builder(Se->getContext()) {
+        DL(DL), ORE(ORE), Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -331,7 +316,10 @@ public:
     else
       MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
-    MinVecRegSize = MinVectorRegSizeOption;
+    if (MinVectorRegSizeOption.getNumOccurrences())
+      MinVecRegSize = MinVectorRegSizeOption;
+    else
+      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -377,6 +365,8 @@ public:
     MinBWs.clear();
   }
 
+  unsigned getTreeSize() const { return VectorizableTree.size(); }
+
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
@@ -415,6 +405,8 @@ public:
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
 
+  OptimizationRemarkEmitter *getORE() { return ORE; }
+
 private:
   struct TreeEntry;
 
@@ -944,6 +936,8 @@ private:
   AssumptionCache *AC;
   DemandedBits *DB;
   const DataLayout *DL;
+  OptimizationRemarkEmitter *ORE;
+
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
   /// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
           CInt->getValue().isPowerOf2())
         Op2VP = TargetTransformInfo::OP_PowerOf2;
 
-      int ScalarCost = VecTy->getNumElements() *
-                       TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                   Op2VK, Op1VP, Op2VP);
+      SmallVector<const Value *, 4> Operands(VL0->operand_values());
+      int ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+                                      Op2VP, Operands);
       int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                                Op1VP, Op2VP);
+                                                Op1VP, Op2VP, Operands);
       return VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   // Determine if the sign bit of all the roots is known to be zero. If not,
   // IsKnownPositive is set to False.
   IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
-    bool KnownZero = false;
-    bool KnownOne = false;
-    ComputeSignBit(R, KnownZero, KnownOne, *DL);
-    return KnownZero;
+    KnownBits Known = computeKnownBits(R, *DL);
+    return Known.isNonNegative();
   });
 
   // Determine the maximum number of bits required to store the scalar
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
 
-    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<DemandedBitsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
   if (!Changed)
     return PreservedAnalyses::all();
 
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                 TargetTransformInfo *TTI_,
                                 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                                 LoopInfo *LI_, DominatorTree *DT_,
-                                AssumptionCache *AC_, DemandedBits *DB_) {
+                                AssumptionCache *AC_, DemandedBits *DB_,
+                                OptimizationRemarkEmitter *ORE_) {
   SE = SE_;
   TTI = TTI_;
   TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 
   // Use the bottom up slp vectorizer to construct chains that start with
   // store instructions.
-  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
 
   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      using namespace ore;
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+                                          cast<StoreInst>(Chain[i]))
+                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+                       << " and with tree size "
+                       << NV("TreeSize", R.getTreeSize()));
+
       R.vectorizeTree();
 
       // Move to the next bundle.
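Unlike DEBUG output, remarks emitted through OptimizationRemarkEmitter are user-visible diagnostics. A minimal sketch of the pattern the new call sites in this file share (the free function is hypothetical; SV_NAME and getORE() are defined in this file):

void emitVectorizedRemark(BoUpSLP &R, StoreInst *Root, int Cost) {
  // Mirrors the emission sites in this patch.
  R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", Root)
                   << "Stores SLP vectorized with cost "
                   << ore::NV("Cost", Cost) << " and with tree size "
                   << ore::NV("TreeSize", R.getTreeSize()));
}

Such remarks can then be surfaced with, for example, opt's -pass-remarks=slp-vectorizer (flag spelling noted here as an assumption) or serialized via clang's -fsave-optimization-record, with each ore::NV pair becoming a named field in the record.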
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+                                          cast<Instruction>(Ops[0]))
+                       << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+                       << " and with tree size "
+                       << ore::NV("TreeSize", R.getTreeSize()));
+
       Value *VectorizedRoot = R.vectorizeTree();
 
       // Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                    << ". (HorRdx)\n");
+      auto *I0 = cast<Instruction>(VL[0]);
+      V.getORE()->emit(
+          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+          << "Vectorized horizontal reduction with cost "
+          << ore::NV("Cost", Cost) << " and with tree size "
+          << ore::NV("TreeSize", V.getTreeSize()));
 
       // Vectorize a tree.
       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
 
       // Emit a reduction.
       Value *ReducedSubTree =
-          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
 
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
-                       unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+                       unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+                       const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
 
+    if (!IsPairwiseReduction)
+      return createSimpleTargetReduction(
+          Builder, TTI, ReductionOpcode, VectorizedValue,
+          TargetTransformInfo::ReductionFlags(), RedOps);
+
     Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
-      if (IsPairwiseReduction) {
-        Value *LeftMask =
+      Value *LeftMask =
           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
-        Value *RightMask =
+      Value *RightMask =
           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
-        Value *LeftShuf = Builder.CreateShuffleVector(
+      Value *LeftShuf = Builder.CreateShuffleVector(
           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
-        Value *RightShuf = Builder.CreateShuffleVector(
+      Value *RightShuf = Builder.CreateShuffleVector(
           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
           "rdx.shuf.r");
-        TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
-                                     "bin.rdx");
-      } else {
-        Value *UpperHalf =
-            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
-        Value *Shuf = Builder.CreateShuffleVector(
-            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
-        TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
-      }
+      TmpVec =
+          Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
       propagateIRFlags(TmpVec, RedOps);
     }
 
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
 namespace llvm {