Diffstat (limited to 'lib/Transforms')
39 files changed, 1845 insertions, 800 deletions
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74a..4480220f2cd49 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
   // consume. Note, that crossing coro.save also requires a spill, as any code
   // between coro.save and coro.suspend may resume the coroutine and all of the
   // state needs to be saved by that time.
-  auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+  auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
     BasicBlock *SuspendBlock = BarrierInst->getParent();
     auto &B = getBlockData(SuspendBlock);
     B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
   return FramePtr;
 }
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+  if (auto *II = dyn_cast<InvokeInst>(TI))
+    II->setUnwindDest(Succ);
+  else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+    CS->setUnwindDest(Succ);
+  else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+    CR->setUnwindDest(Succ);
+  else
+    llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+                           BasicBlock *NewPred,
+                           PHINode *LandingPadReplacement) {
+  unsigned BBIdx = 0;
+  for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    // We manually update the LandingPadReplacement PHINode and it is the last
+    // PHI Node. So, if we find it, we are done.
+    if (LandingPadReplacement == PN)
+      break;
+
+    // Reuse the previous value of BBIdx if it lines up.  In cases where we
+    // have multiple phi nodes with *lots* of predecessors, this is a speed
+    // win because we don't have to scan the PHI looking for TIBB.  This
+    // happens because the BB list of PHI nodes are usually in the same
+    // order.
+    if (PN->getIncomingBlock(BBIdx) != OldPred)
+      BBIdx = PN->getBasicBlockIndex(OldPred);
+
+    assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+    PN->setIncomingBlock(BBIdx, NewPred);
+  }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case do EH
+// specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+                                    LandingPadInst *OriginalPad,
+                                    PHINode *LandingPadReplacement) {
+  auto *PadInst = Succ->getFirstNonPHI();
+  if (!LandingPadReplacement && !PadInst->isEHPad())
+    return SplitEdge(BB, Succ);
+
+  auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+  setUnwindEdgeTo(BB->getTerminator(), NewBB);
+  updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+  if (LandingPadReplacement) {
+    auto *NewLP = OriginalPad->clone();
+    auto *Terminator = BranchInst::Create(Succ, NewBB);
+    NewLP->insertBefore(Terminator);
+    LandingPadReplacement->addIncoming(NewLP, NewBB);
+    return NewBB;
+  }
+  Value *ParentPad = nullptr;
+  if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+    ParentPad = FuncletPad->getParentPad();
+  else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+    ParentPad = CatchSwitch->getParentPad();
+  else
+    llvm_unreachable("handling for other EHPads not implemented yet");
+
+  auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+  CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+  return NewBB;
+}
+
 static void rewritePHIs(BasicBlock &BB) {
   // For every incoming edge we will create a block holding all
   // incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
   // loop:
   //    %n.val = phi i32[%n, %entry], [%inc, %loop]
   //
-  // It will create:
+  // It will create:
   //
   // loop.from.entry:
   //    %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
   // TODO: Simplify PHINodes in the basic block to remove duplicate
   // predecessors.
+  LandingPadInst *LandingPad = nullptr;
+  PHINode *ReplPHI = nullptr;
+  if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+    // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+    // We replace the original landing pad with a PHINode that will collect the
+    // results from all of them.
+    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+    ReplPHI->takeName(LandingPad);
+    LandingPad->replaceAllUsesWith(ReplPHI);
+    // We will erase the original landing pad at the end of this function after
+    // ehAwareSplitEdge cloned it in the transition blocks.
+  }
+
   SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
   for (BasicBlock *Pred : Preds) {
-    auto *IncomingBB = SplitEdge(Pred, &BB);
+    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
     auto *PN = cast<PHINode>(&BB.front());
     do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
       InputV->addIncoming(V, Pred);
       PN->setIncomingValue(Index, InputV);
       PN = dyn_cast<PHINode>(PN->getNextNode());
-    } while (PN);
+    } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+                             // the landing pad.
+  }
+
+  if (LandingPad) {
+    // Calls to ehAwareSplitEdge function cloned the original lading pad.
+    // No longer need it.
+    LandingPad->eraseFromParent();    }  } diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 7ed07d63c6275..231487923fada 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,        return true;      // Lookup the linkage recorded in the summaries during global analysis. -    const auto &GS = DefinedGlobals.find(GV.getGUID()); -    GlobalValue::LinkageTypes Linkage; +    auto GS = DefinedGlobals.find(GV.getGUID());      if (GS == DefinedGlobals.end()) {        // Must have been promoted (possibly conservatively). Find original        // name so that we can access the correct summary and see if it can @@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,        std::string OrigId = GlobalValue::getGlobalIdentifier(            OrigName, GlobalValue::InternalLinkage,            TheModule.getSourceFileName()); -      const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId)); +      GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));        if (GS == DefinedGlobals.end()) {          // Also check the original non-promoted non-globalized name. In some          // cases a preempted weak value is linked in as a local copy because @@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,          // In that case, since it was originally not a local value, it was          // recorded in the index using the original name.          // FIXME: This may not be needed once PR27866 is fixed. -        const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName)); +        GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));          assert(GS != DefinedGlobals.end()); -        Linkage = GS->second->linkage(); -      } else { -        Linkage = GS->second->linkage();        } -    } else -      Linkage = GS->second->linkage(); -    return !GlobalValue::isLocalLinkage(Linkage); +    } +    return !GlobalValue::isLocalLinkage(GS->second->linkage());    };    // FIXME: See if we can just internalize directly here via linkage changes diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 6c83c99ae3be5..673d3af0ab524 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,          std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);    InlinedArrayAllocasTy InlinedArrayAllocas; -  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache); +  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);    // Now that we have all of the call sites, loop over them and inline them if    // it looks profitable to do so. @@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,        // Setup the data structure used to plumb customization into the        // `InlineFunction` routine.        
InlineFunctionInfo IFI( -          /*cg=*/nullptr, &GetAssumptionCache, +          /*cg=*/nullptr, &GetAssumptionCache, PSI,            &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),            &FAM.getResult<BlockFrequencyAnalysis>(Callee)); diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 2db47b3b56222..8dff2fb3be8ac 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -16,6 +16,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/BlockFrequencyInfo.h"  #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CodeMetrics.h"  #include "llvm/Analysis/InlineCost.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,  static cl::opt<bool>      DisablePartialInlining("disable-partial-inlining", cl::init(false),                             cl::Hidden, cl::desc("Disable partial ininling")); +// This is an option used by testing: +static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", +                                      cl::init(false), cl::ZeroOrMore, +                                      cl::ReallyHidden, +                                      cl::desc("Skip Cost Analysis"));  static cl::opt<unsigned> MaxNumInlineBlocks(      "max-num-inline-blocks", cl::init(5), cl::Hidden, @@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(      "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,      cl::desc("Max number of partial inlining. The default is unlimited")); +// Used only when PGO or user annotated branch data is absent. It is +// the least value that is used to weigh the outline region. If BFI +// produces larger value, the BFI value will be used. +static cl::opt<int> +    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), +                             cl::Hidden, cl::ZeroOrMore, +                             cl::desc("Relative frequency of outline region to " +                                      "the entry block")); +  namespace {  struct FunctionOutliningInfo { @@ -84,8 +99,6 @@ struct PartialInlinerImpl {    bool run(Module &M);    Function *unswitchFunction(Function *F); -  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); -  private:    int NumPartialInlining = 0;    std::function<AssumptionCache &(Function &)> *GetAssumptionCache; @@ -93,11 +106,84 @@ private:    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;    ProfileSummaryInfo *PSI; -  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE); +  // Return the frequency of the OutlininingBB relative to F's entry point. +  // The result is no larger than 1 and is represented using BP. +  // (Note that the outlined region's 'head' block can only have incoming +  // edges from the guarding entry blocks). +  BranchProbability getOutliningCallBBRelativeFreq(Function *F, +                                                   FunctionOutliningInfo *OI, +                                                   Function *DuplicateFunction, +                                                   BlockFrequencyInfo *BFI, +                                                   BasicBlock *OutliningCallBB); + +  // Return true if the callee of CS should be partially inlined with +  // profit. 
+  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI, +                           BlockFrequencyInfo *CalleeBFI, +                           BasicBlock *OutliningCallBB, +                           int OutliningCallOverhead, +                           OptimizationRemarkEmitter &ORE); + +  // Try to inline DuplicateFunction (cloned from F with call to +  // the OutlinedFunction into its callers. Return true +  // if there is any successful inlining. +  bool tryPartialInline(Function *DuplicateFunction, +                        Function *F, /*orignal function */ +                        FunctionOutliningInfo *OI, Function *OutlinedFunction, +                        BlockFrequencyInfo *CalleeBFI); + +  // Compute the mapping from use site of DuplicationFunction to the enclosing +  // BB's profile count. +  void computeCallsiteToProfCountMap(Function *DuplicateFunction, +                                     DenseMap<User *, uint64_t> &SiteCountMap); +    bool IsLimitReached() {      return (MaxNumPartialInlining != -1 &&              NumPartialInlining >= MaxNumPartialInlining);    } + +  CallSite getCallSite(User *U) { +    CallSite CS; +    if (CallInst *CI = dyn_cast<CallInst>(U)) +      CS = CallSite(CI); +    else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) +      CS = CallSite(II); +    else +      llvm_unreachable("All uses must be calls"); +    return CS; +  } + +  CallSite getOneCallSiteTo(Function *F) { +    User *User = *F->user_begin(); +    return getCallSite(User); +  } + +  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { +    CallSite CS = getOneCallSiteTo(F); +    DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); +    BasicBlock *Block = CS.getParent(); +    return std::make_tuple(DLoc, Block); +  } + +  // Returns the costs associated with function outlining: +  // - The first value is the non-weighted runtime cost for making the call +  //   to the outlined function 'OutlinedFunction', including the addtional +  //   setup cost in the outlined function itself; +  // - The second value is the estimated size of the new call sequence in +  //   basic block 'OutliningCallBB'; +  // - The third value is the estimated size of the original code from +  //   function 'F' that is extracted into the outlined function. +  std::tuple<int, int, int> +  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo, +                        Function *OutlinedFunction, +                        BasicBlock *OutliningCallBB); +  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to +  // approximate both the size and runtime cost (Note that in the current +  // inline cost analysis, there is no clear distinction there either). 
+  int computeBBInlineCost(BasicBlock *BB); + +  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); +  };  struct PartialInlinerLegacyPass : public ModulePass { @@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {      return isa<ReturnInst>(TI);    }; -  auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) { +  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {      if (IsReturnBlock(Succ1))        return std::make_tuple(Succ1, Succ2);      if (IsReturnBlock(Succ2)) @@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {    };    // Detect a triangular shape: -  auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) { +  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {      if (IsSuccessor(Succ1, Succ2))        return std::make_tuple(Succ1, Succ2);      if (IsSuccessor(Succ2, Succ1)) @@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {    // Do sanity check of the entries: threre should not    // be any successors (not in the entry set) other than    // {ReturnBlock, NonReturnBlock} -  assert(OutliningInfo->Entries[0] == &F->front()); +  assert(OutliningInfo->Entries[0] == &F->front() && +         "Function Entry must be the first in Entries vector");    DenseSet<BasicBlock *> Entries;    for (BasicBlock *E : OutliningInfo->Entries)      Entries.insert(E); @@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {    return OutliningInfo;  } -bool PartialInlinerImpl::shouldPartialInline(CallSite CS, -                                             OptimizationRemarkEmitter &ORE) { -  // TODO : more sharing with shouldInline in Inliner.cpp +// Check if there is PGO data or user annoated branch data: +static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { +  if (F->getEntryCount()) +    return true; +  // Now check if any of the entry block has MD_prof data: +  for (auto *E : OI->Entries) { +    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); +    if (!BR || BR->isUnconditional()) +      continue; +    uint64_t T, F; +    if (BR->extractProfMetadata(T, F)) +      return true; +  } +  return false; +} + +BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( +    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, +    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) { + +  auto EntryFreq = +      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock()); +  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB); + +  auto OutlineRegionRelFreq = +      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), +                                              EntryFreq.getFrequency()); + +  if (hasProfileData(F, OI)) +    return OutlineRegionRelFreq; + +  // When profile data is not available, we need to be very +  // conservative in estimating the overall savings. We need to make sure +  // the outline region relative frequency is not below the threshold +  // specified by the option. 
+  OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); + +  return OutlineRegionRelFreq; +} + +bool PartialInlinerImpl::shouldPartialInline( +    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI, +    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, +    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {    using namespace ore; +  if (SkipCostAnalysis) +    return true; +    Instruction *Call = CS.getInstruction();    Function *Callee = CS.getCalledFunction();    Function *Caller = CS.getCaller(); @@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,    if (IC.isAlways()) {      ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) -             << NV("Callee", Callee) +             << NV("Callee", F)               << " should always be fully inlined, not partially");      return false;    }    if (IC.isNever()) {      ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) -             << NV("Callee", Callee) << " not partially inlined into " +             << NV("Callee", F) << " not partially inlined into "               << NV("Caller", Caller)               << " because it should never be inlined (cost=never)");      return false;    }    if (!IC) { -    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) -             << NV("Callee", Callee) << " not partially inlined into " +    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) +             << NV("Callee", F) << " not partially inlined into "               << NV("Caller", Caller) << " because too costly to inline (cost="               << NV("Cost", IC.getCost()) << ", threshold="               << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");      return false;    } +  const DataLayout &DL = Caller->getParent()->getDataLayout(); +  // The savings of eliminating the call: +  int NonWeightedSavings = getCallsiteCost(CS, DL); +  BlockFrequency NormWeightedSavings(NonWeightedSavings); + +  auto RelativeFreq = +      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB); +  auto NormWeightedRcost = +      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq; + +  // Weighted saving is smaller than weighted cost, return false +  if (NormWeightedSavings < NormWeightedRcost) { +    ORE.emit( +        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) +        << NV("Callee", F) << " not partially inlined into " +        << NV("Caller", Caller) << " runtime overhead (overhead=" +        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency()) +        << ", savings=" +        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" +        << " of making the outlined call is too high"); + +    return false; +  }    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) -           << NV("Callee", Callee) << " can be partially inlined into " +           << NV("Callee", F) << " can be partially inlined into "             << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())             << " (threshold="             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");    return true;  } +// TODO: Ideally  we should share Inliner's InlineCost Analysis code. +// For now use a simplified version. The returned 'InlineCost' will be used +// to esimate the size cost as well as runtime cost of the BB. 
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { +  int InlineCost = 0; +  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); +  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { +    if (isa<DbgInfoIntrinsic>(I)) +      continue; + +    if (CallInst *CI = dyn_cast<CallInst>(I)) { +      InlineCost += getCallsiteCost(CallSite(CI), DL); +      continue; +    } + +    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { +      InlineCost += getCallsiteCost(CallSite(II), DL); +      continue; +    } + +    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { +      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; +      continue; +    } +    InlineCost += InlineConstants::InstrCost; +  } +  return InlineCost; +} + +std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts( +    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction, +    BasicBlock *OutliningCallBB) { +  // First compute the cost of the outlined region 'OI' in the original +  // function 'F': +  int OutlinedRegionCost = 0; +  for (BasicBlock &BB : *F) { +    if (&BB != OI->ReturnBlock && +        // Assuming Entry set is small -- do a linear search here: +        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) == +            OI->Entries.end()) { +      OutlinedRegionCost += computeBBInlineCost(&BB); +    } +  } + +  // Now compute the cost of the call sequence to the outlined function +  // 'OutlinedFunction' in BB 'OutliningCallBB': +  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB); + +  // Now compute the cost of the extracted/outlined function itself: +  int OutlinedFunctionCost = 0; +  for (BasicBlock &BB : *OutlinedFunction) { +    OutlinedFunctionCost += computeBBInlineCost(&BB); +  } + +  assert(OutlinedFunctionCost >= OutlinedRegionCost && +         "Outlined function cost should be no less than the outlined region"); +  int OutliningRuntimeOverhead = +      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost); + +  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead, +                         OutlinedRegionCost); +} + +// Create the callsite to profile count map which is +// used to update the original function's entry count, +// after the function is partially inlined into the callsite. 
+void PartialInlinerImpl::computeCallsiteToProfCountMap( +    Function *DuplicateFunction, +    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { +  std::vector<User *> Users(DuplicateFunction->user_begin(), +                            DuplicateFunction->user_end()); +  Function *CurrentCaller = nullptr; +  BlockFrequencyInfo *CurrentCallerBFI = nullptr; + +  auto ComputeCurrBFI = [&,this](Function *Caller) { +      // For the old pass manager: +      if (!GetBFI) { +        if (CurrentCallerBFI) +          delete CurrentCallerBFI; +        DominatorTree DT(*Caller); +        LoopInfo LI(DT); +        BranchProbabilityInfo BPI(*Caller, LI); +        CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI); +      } else { +        // New pass manager: +        CurrentCallerBFI = &(*GetBFI)(*Caller); +      } +  }; + +  for (User *User : Users) { +    CallSite CS = getCallSite(User); +    Function *Caller = CS.getCaller(); +    if (CurrentCaller != Caller) { +      CurrentCaller = Caller; +      ComputeCurrBFI(Caller); +    } else { +      assert(CurrentCallerBFI && "CallerBFI is not set"); +    } +    BasicBlock *CallBB = CS.getInstruction()->getParent(); +    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); +    if (Count) +      CallSiteToProfCountMap[User] = *Count; +    else +      CallSiteToProfCountMap[User] = 0; +  } +  if (!GetBFI) { +    if (CurrentCallerBFI) +      delete CurrentCallerBFI; +  } +} +  Function *PartialInlinerImpl::unswitchFunction(Function *F) {    if (F->hasAddressTaken()) @@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {    if (PSI->isFunctionEntryCold(F))      return nullptr; -  std::unique_ptr<FunctionOutliningInfo> OutliningInfo = -      computeOutliningInfo(F); +  if (F->user_begin() == F->user_end()) +    return nullptr; + +  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); -  if (!OutliningInfo) +  if (!OI)      return nullptr;    // Clone the function, so that we can hack away on it.    
ValueToValueMapTy VMap;    Function *DuplicateFunction = CloneFunction(F, VMap); -  BasicBlock *NewReturnBlock = -      cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]); -  BasicBlock *NewNonReturnBlock = -      cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]); +  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); +  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);    DenseSet<BasicBlock *> NewEntries; -  for (BasicBlock *BB : OutliningInfo->Entries) { +  for (BasicBlock *BB : OI->Entries) {      NewEntries.insert(cast<BasicBlock>(VMap[BB]));    } @@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {    BasicBlock *PreReturn = NewReturnBlock;    // only split block when necessary:    PHINode *FirstPhi = getFirstPHI(PreReturn); -  unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size(); +  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();    if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {      NewReturnBlock = NewReturnBlock->splitBasicBlock( @@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {        Ins = NewReturnBlock->getFirstNonPHI();        RetPhi->addIncoming(&*I, PreReturn); -      for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) { +      for (BasicBlock *E : OI->ReturnBlockPreds) {          BasicBlock *NewE = cast<BasicBlock>(VMap[E]);          RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);          OldPhi->removeIncomingValue(NewE);        }        ++I;      } -    for (auto E : OutliningInfo->ReturnBlockPreds) { +    for (auto E : OI->ReturnBlockPreds) {        BasicBlock *NewE = cast<BasicBlock>(VMap[E]);        NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);      } @@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {    // Returns true if the block is to be partial inlined into the caller    // (i.e. not to be extracted to the out of line function) -  auto ToBeInlined = [=](BasicBlock *BB) { +  auto ToBeInlined = [&](BasicBlock *BB) {      return BB == NewReturnBlock || NewEntries.count(BB);    };    // Gather up the blocks that we're going to extract. @@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {    BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);    // Extract the body of the if. -  Function *ExtractedFunction = +  Function *OutlinedFunction =        CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)            .extractCodeRegion(); -  // Inline the top-level if test into all callers. +  bool AnyInline = +      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI); + +  // Ditch the duplicate, since we're done with it, and rewrite all remaining +  // users (function pointers, etc.) back to the original function. 
+  DuplicateFunction->replaceAllUsesWith(F); +  DuplicateFunction->eraseFromParent(); + +  if (AnyInline) +    return OutlinedFunction; + +  // Remove the function that is speculatively created: +  if (OutlinedFunction) +    OutlinedFunction->eraseFromParent(); + +  return nullptr; +} + +bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction, +                                          Function *F, +                                          FunctionOutliningInfo *OI, +                                          Function *OutlinedFunction, +                                          BlockFrequencyInfo *CalleeBFI) { +  if (OutlinedFunction == nullptr) +    return false; + +  int NonWeightedRcost; +  int SizeCost; +  int OutlinedRegionSizeCost; + +  auto OutliningCallBB = +      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent(); + +  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) = +      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB); + +  // The call sequence to the outlined function is larger than the original +  // outlined region size, it does not increase the chances of inlining +  // 'F' with outlining (The inliner usies the size increase to model the +  // the cost of inlining a callee). +  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) { +    OptimizationRemarkEmitter ORE(F); +    DebugLoc DLoc; +    BasicBlock *Block; +    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction); +    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", +                                        DLoc, Block) +             << ore::NV("Function", F) +             << " not partially inlined into callers (Original Size = " +             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost) +             << ", Size of call sequence to outlined function = " +             << ore::NV("NewSize", SizeCost) << ")"); +    return false; +  } + +  assert(F->user_begin() == F->user_end() && +         "F's users should all be replaced!");    std::vector<User *> Users(DuplicateFunction->user_begin(),                              DuplicateFunction->user_end()); +  DenseMap<User *, uint64_t> CallSiteToProfCountMap; +  if (F->getEntryCount()) +    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap); + +  auto CalleeEntryCount = F->getEntryCount(); +  uint64_t CalleeEntryCountV = (CalleeEntryCount ? 
*CalleeEntryCount : 0); +  bool AnyInline = false;    for (User *User : Users) { -    CallSite CS; -    if (CallInst *CI = dyn_cast<CallInst>(User)) -      CS = CallSite(CI); -    else if (InvokeInst *II = dyn_cast<InvokeInst>(User)) -      CS = CallSite(II); -    else -      llvm_unreachable("All uses must be calls"); +    CallSite CS = getCallSite(User);      if (IsLimitReached())        continue;      OptimizationRemarkEmitter ORE(CS.getCaller()); -    if (!shouldPartialInline(CS, ORE)) + +    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB, +                             NonWeightedRcost, ORE))        continue; -    DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); -    BasicBlock *Block = CS.getParent(); -    ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block) -             << ore::NV("Callee", F) << " partially inlined into " -             << ore::NV("Caller", CS.getCaller())); +    ORE.emit( +        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) +        << ore::NV("Callee", F) << " partially inlined into " +        << ore::NV("Caller", CS.getCaller())); -    InlineFunctionInfo IFI(nullptr, GetAssumptionCache); +    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);      InlineFunction(CS, IFI); + +    // Now update the entry count: +    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { +      uint64_t CallSiteCount = CallSiteToProfCountMap[User]; +      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); +    } + +    AnyInline = true;      NumPartialInlining++; -    // update stats +    // Update the stats      NumPartialInlined++;    } -  // Ditch the duplicate, since we're done with it, and rewrite all remaining -  // users (function pointers, etc.) back to the original function. -  DuplicateFunction->replaceAllUsesWith(F); -  DuplicateFunction->eraseFromParent(); - +  if (AnyInline && CalleeEntryCount) +    F->setEntryCount(CalleeEntryCountV); -  return ExtractedFunction; +  return AnyInline;  }  bool PartialInlinerImpl::run(Module &M) { diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index d3a3c24ce7b44..659cb9df00a2c 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -16,6 +16,7 @@  #include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h"  #include "llvm/Analysis/TypeMetadataUtils.h"  #include "llvm/Bitcode/BitcodeWriter.h"  #include "llvm/IR/Constants.h" @@ -178,7 +179,7 @@ void filterModule(      else        GO = new GlobalVariable(            *M, GA->getValueType(), false, GlobalValue::ExternalLinkage, -          (Constant *)nullptr, "", (GlobalVariable *)nullptr, +          nullptr, "", nullptr,            GA->getThreadLocalMode(), GA->getType()->getAddressSpace());      GO->takeName(GA);      GA->replaceAllUsesWith(GO); @@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(    // FIXME: Try to re-use BSI and PFI from the original module here. 
-  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr); +  ProfileSummaryInfo PSI(M); +  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);    SmallVector<char, 0> Buffer; diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 153a186d5ed40..0ca62b7ae40c1 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {    return createFMul(OpndVal, Coeff.getValue(Instr->getType()));  } -/// \brief Return true if we can prove that adding the two values of the -/// knownbits will not overflow. -/// Otherwise return false. -static bool checkRippleForAdd(const KnownBits &LHSKnown, -                              const KnownBits &RHSKnown) { -  // Addition of two 2's complement numbers having opposite signs will never -  // overflow. -  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) || -      (LHSKnown.isNonNegative() && RHSKnown.isNegative())) -    return true; - -  // If either of the values is known to be non-negative, adding them can only -  // overflow if the second is also non-negative, so we can assume that. -  // Two non-negative numbers will only overflow if there is a carry to the  -  // sign bit, so we can check if even when the values are as big as possible -  // there is no overflow to the sign bit. -  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) { -    APInt MaxLHS = ~LHSKnown.Zero; -    MaxLHS.clearSignBit(); -    APInt MaxRHS = ~RHSKnown.Zero; -    MaxRHS.clearSignBit(); -    APInt Result = std::move(MaxLHS) + std::move(MaxRHS); -    return Result.isSignBitClear(); -  } - -  // If either of the values is known to be negative, adding them can only -  // overflow if the second is also negative, so we can assume that. -  // Two negative number will only overflow if there is no carry to the sign -  // bit, so we can check if even when the values are as small as possible -  // there is overflow to the sign bit. -  if (LHSKnown.isNegative() || RHSKnown.isNegative()) { -    APInt MinLHS = LHSKnown.One; -    MinLHS.clearSignBit(); -    APInt MinRHS = RHSKnown.One; -    MinRHS.clearSignBit(); -    APInt Result = std::move(MinLHS) + std::move(MinRHS); -    return Result.isSignBitSet(); -  } - -  // If we reached here it means that we know nothing about the sign bits. -  // In this case we can't know if there will be an overflow, since by  -  // changing the sign bits any two values can be made to overflow. -  return false; -} - -/// Return true if we can prove that: -///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS)) -/// This basically requires proving that the add in the original type would not -/// overflow to change the sign bit or have a carry out. -bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, -                                            Instruction &CxtI) { -  // There are different heuristics we can use for this.  Here are some simple -  // ones. - -  // If LHS and RHS each have at least two sign bits, the addition will look -  // like -  // -  // XX..... + -  // YY..... -  // -  // If the carry into the most significant position is 0, X and Y can't both -  // be 1 and therefore the carry out of the addition is also 0. -  // -  // If the carry into the most significant position is 1, X and Y can't both -  // be 0 and therefore the carry out of the addition is also 1. 
-  // -  // Since the carry into the most significant position is always equal to -  // the carry out of the addition, there is no signed overflow. -  if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 && -      ComputeNumSignBits(RHS, 0, &CxtI) > 1) -    return true; - -  unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); -  KnownBits LHSKnown(BitWidth); -  computeKnownBits(LHS, LHSKnown, 0, &CxtI); - -  KnownBits RHSKnown(BitWidth); -  computeKnownBits(RHS, RHSKnown, 0, &CxtI); - -  // Check if carry bit of addition will not cause overflow. -  if (checkRippleForAdd(LHSKnown, RHSKnown)) -    return true; - -  return false; -} -  /// \brief Return true if we can prove that:  ///    (sub LHS, RHS)  === (sub nsw LHS, RHS)  /// This basically requires proving that the add in the original type would not @@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,  bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,                                                Instruction &CxtI) {    // If the LHS is negative and the RHS is non-negative, no unsigned wrap. -  bool LHSKnownNonNegative, LHSKnownNegative; -  bool RHSKnownNonNegative, RHSKnownNegative; -  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, -                 &CxtI); -  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, -                 &CxtI); -  if (LHSKnownNegative && RHSKnownNonNegative) +  KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); +  KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); +  if (LHSKnown.isNegative() && RHSKnown.isNonNegative())      return true;    return false; @@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,    return nullptr;  } +static Instruction *foldAddWithConstant(BinaryOperator &Add, +                                        InstCombiner::BuilderTy &Builder) { +  Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); +  const APInt *C; +  if (!match(Op1, m_APInt(C))) +    return nullptr; + +  if (C->isSignMask()) { +    // If wrapping is not allowed, then the addition must set the sign bit: +    // X + (signmask) --> X | signmask +    if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap()) +      return BinaryOperator::CreateOr(Op0, Op1); + +    // If wrapping is allowed, then the addition flips the sign bit of LHS: +    // X + (signmask) --> X ^ signmask +    return BinaryOperator::CreateXor(Op0, Op1); +  } + +  Value *X; +  const APInt *C2; +  Type *Ty = Add.getType(); + +  // Is this add the last step in a convoluted sext? +  // add(zext(xor i16 X, -32768), -32768) --> sext X +  if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) && +      C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C) +    return CastInst::Create(Instruction::SExt, X, Ty); + +  // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C)) +  // FIXME: This should check hasOneUse to not increase the instruction count? 
+  if (C->isNegative() && +      match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) && +      C->sge(-C2->sext(C->getBitWidth()))) { +    Constant *NewC = +        ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth())); +    return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); +  } + +  // Shifts and add used to flip and mask off the low bit: +  // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 +  const APInt *C3; +  if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), +                                            m_APInt(C3)))) && +      C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { +    Value *NotX = Builder.CreateNot(X); +    return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); +  } + +  return nullptr; +} +  Instruction *InstCombiner::visitAdd(BinaryOperator &I) {    bool Changed = SimplifyAssociativeOrCommutative(I);    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); @@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {    if (Value *V = SimplifyUsingDistributiveLaws(I))      return replaceInstUsesWith(I, V); -  const APInt *RHSC; -  if (match(RHS, m_APInt(RHSC))) { -    if (RHSC->isSignMask()) { -      // If wrapping is not allowed, then the addition must set the sign bit: -      // X + (signmask) --> X | signmask -      if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap()) -        return BinaryOperator::CreateOr(LHS, RHS); - -      // If wrapping is allowed, then the addition flips the sign bit of LHS: -      // X + (signmask) --> X ^ signmask -      return BinaryOperator::CreateXor(LHS, RHS); -    } - -    // Is this add the last step in a convoluted sext? -    Value *X; -    const APInt *C; -    if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) && -        C->isMinSignedValue() && -        C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) { -      // add(zext(xor i16 X, -32768), -32768) --> sext X -      return CastInst::Create(Instruction::SExt, X, LHS->getType()); -    } - -    if (RHSC->isNegative() && -        match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) && -        RHSC->sge(-C->sext(RHSC->getBitWidth()))) { -      // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val)) -      Constant *NewC = -          ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth())); -      return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType()); -    } -  } +  if (Instruction *X = foldAddWithConstant(I, *Builder)) +    return X; -  // FIXME: Use the match above instead of dyn_cast to allow these transforms -  // for splat vectors. +  // FIXME: This should be moved into the above helper function to allow these +  // transforms for splat vectors.    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {      // zext(bool) + C -> bool ? C + 1 : C      if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS)) @@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {          Constant *CI =              ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());          if (ConstantExpr::getZExt(CI, I.getType()) == RHSC && -            computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) == -                OverflowResult::NeverOverflows) { +            willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {            // Insert the new, smaller add.            
Value *NewAdd =                Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv"); @@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {        if (LHSConv->getOperand(0)->getType() ==                RHSConv->getOperand(0)->getType() &&            (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && -          computeOverflowForUnsignedAdd(LHSConv->getOperand(0), -                                        RHSConv->getOperand(0), -                                        &I) == OverflowResult::NeverOverflows) { +          willNotOverflowUnsignedAdd(LHSConv->getOperand(0), +                                     RHSConv->getOperand(0), I)) {          // Insert the new integer add.          Value *NewAdd = Builder->CreateNUWAdd(              LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); @@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {    }    // TODO(jingyue): Consider WillNotOverflowSignedAdd and -  // WillNotOverflowUnsignedAdd to reduce the number of invocations of +  // willNotOverflowUnsignedAdd to reduce the number of invocations of    // computeKnownBits.    if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {      Changed = true;      I.setHasNoSignedWrap(true);    } -  if (!I.hasNoUnsignedWrap() && -      computeOverflowForUnsignedAdd(LHS, RHS, &I) == -          OverflowResult::NeverOverflows) { +  if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {      Changed = true;      I.setHasNoUnsignedWrap(true);    } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b114801cc1c02..82dc88f1b3ad6 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -23,21 +23,6 @@ using namespace PatternMatch;  #define DEBUG_TYPE "instcombine" -static inline Value *dyn_castNotVal(Value *V) { -  // If this is not(not(x)) don't return that this is a not: we want the two -  // not's to be folded first. -  if (BinaryOperator::isNot(V)) { -    Value *Operand = BinaryOperator::getNotArgument(V); -    if (!IsFreeToInvert(Operand, Operand->hasOneUse())) -      return Operand; -  } - -  // Constants can be considered to be not'ed values... -  if (ConstantInt *C = dyn_cast<ConstantInt>(V)) -    return ConstantInt::get(C->getType(), ~C->getValue()); -  return nullptr; -} -  /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into  /// a four bit mask.  static unsigned getFCmpCode(FCmpInst::Predicate CC) { @@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,    }    // This simplification is only valid if the upper range is not negative. 
-  bool IsNegative, IsNotNegative; -  ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1); -  if (!IsNotNegative) +  KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1); +  if (!Known.isNonNegative())      return nullptr;    if (Inverted) @@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {  /// (~A & ~B) == (~(A | B))  /// (~A | ~B) == (~(A & B))  static Instruction *matchDeMorgansLaws(BinaryOperator &I, -                                       InstCombiner::BuilderTy *Builder) { +                                       InstCombiner::BuilderTy &Builder) {    auto Opcode = I.getOpcode();    assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&           "Trying to match De Morgan's Laws with something other than and/or"); +    // Flip the logic operation. -  if (Opcode == Instruction::And) -    Opcode = Instruction::Or; -  else -    Opcode = Instruction::And; +  Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; -  Value *Op0 = I.getOperand(0); -  Value *Op1 = I.getOperand(1); -  // TODO: Use pattern matchers instead of dyn_cast. -  if (Value *Op0NotVal = dyn_castNotVal(Op0)) -    if (Value *Op1NotVal = dyn_castNotVal(Op1)) -      if (Op0->hasOneUse() && Op1->hasOneUse()) { -        Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, -                                              I.getName() + ".demorgan"); -        return BinaryOperator::CreateNot(LogicOp); -      } +  Value *A, *B; +  if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && +      match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && +      !IsFreeToInvert(A, A->hasOneUse()) && +      !IsFreeToInvert(B, B->hasOneUse())) { +    Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); +    return BinaryOperator::CreateNot(AndOr); +  }    return nullptr;  } @@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {      if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))        return FoldedLogic; -  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) +  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))      return DeMorgan;    { @@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {    if (Value *V = SimplifyBSwap(I))      return replaceInstUsesWith(I, V); -  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { -    ConstantInt *C1 = nullptr; Value *X = nullptr; -    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) -    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && -        Op0->hasOneUse()) { -      Value *Or = Builder->CreateOr(X, RHS); -      Or->takeName(Op0); -      return BinaryOperator::CreateXor(Or, -                            Builder->getInt(C1->getValue() & ~RHS->getValue())); -    } -  } -    if (isa<Constant>(Op1))      if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))        return FoldedLogic; @@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {    if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))      return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); -  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) +  if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))      return DeMorgan;    // Canonicalize xor to the RHS. @@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {    }    // Is this a 'not' (~) fed by a binary operator? 
-  BinaryOperator *NotOp; -  if (match(&I, m_Not(m_BinOp(NotOp)))) { -    if (NotOp->getOpcode() == Instruction::And || -        NotOp->getOpcode() == Instruction::Or) { +  BinaryOperator *NotVal; +  if (match(&I, m_Not(m_BinOp(NotVal)))) { +    if (NotVal->getOpcode() == Instruction::And || +        NotVal->getOpcode() == Instruction::Or) {        // Apply DeMorgan's Law when inverts are free:        // ~(X & Y) --> (~X | ~Y)        // ~(X | Y) --> (~X & ~Y) -      if (IsFreeToInvert(NotOp->getOperand(0), -                         NotOp->getOperand(0)->hasOneUse()) && -          IsFreeToInvert(NotOp->getOperand(1), -                         NotOp->getOperand(1)->hasOneUse())) { -        Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs"); -        Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs"); -        if (NotOp->getOpcode() == Instruction::And) +      if (IsFreeToInvert(NotVal->getOperand(0), +                         NotVal->getOperand(0)->hasOneUse()) && +          IsFreeToInvert(NotVal->getOperand(1), +                         NotVal->getOperand(1)->hasOneUse())) { +        Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs"); +        Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs"); +        if (NotVal->getOpcode() == Instruction::And)            return BinaryOperator::CreateOr(NotX, NotY);          return BinaryOperator::CreateAnd(NotX, NotY);        } -    } else if (NotOp->getOpcode() == Instruction::AShr) { -      // ~(~X >>s Y) --> (X >>s Y) -      if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0))) -        return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1)); +    } + +    // ~(~X >>s Y) --> (X >>s Y) +    if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y)))) +      return BinaryOperator::CreateAShr(X, Y); + +    // If we are inverting a right-shifted constant, we may be able to eliminate +    // the 'not' by inverting the constant and using the opposite shift type. +    // Canonicalization rules ensure that only a negative constant uses 'ashr', +    // but we must check that in case that transform has not fired yet. +    const APInt *C; +    if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) { +      // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits) +      Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); +      return BinaryOperator::CreateLShr(NotC, Y); +    } + +    if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) { +      // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits) +      Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); +      return BinaryOperator::CreateAShr(NotC, Y);      }    } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6989d67f00603..face7abcc95f2 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {    // Create a mask for bits above (ctlz) or below (cttz) the first known one.    bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; -  unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros() -                                : Known.One.countLeadingZeros(); -  unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes() -                                : Known.Zero.countLeadingOnes(); +  unsigned PossibleZeros = IsTZ ? 
Known.countMaxTrailingZeros() +                                : Known.countMaxLeadingZeros(); +  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() +                                : Known.countMinLeadingZeros();    // If all bits above (ctlz) or below (cttz) the first known one are known    // zero, this value is constant. diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 312d9baae43a6..001a4bcf16f33 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {      return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);    } +  // FIXME: Maybe combine the next two transforms to handle the no cast case +  // more efficiently. Support vector types. Cleanup code by using m_OneUse. +    // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.    Value *A = nullptr; ConstantInt *Cst = nullptr;    if (Src->hasOneUse() && @@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {    // the sign bit of the original value; performing ashr instead of lshr    // generates bits of the same value as the sign bit.    if (Src->hasOneUse() && -      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && -      cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { +      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) { +    Value *SExt = cast<Instruction>(Src)->getOperand(0); +    const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();      const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); +    unsigned ShiftAmt = Cst->getZExtValue();      // This optimization can be only performed when zero bits generated by      // the original lshr aren't pulled into the value after truncation, so we -    // can only shift by values smaller than the size of destination type (in -    // bits). -    if (Cst->getValue().ult(ASize)) { -      Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); +    // can only shift by values no larger than the number of extension bits. +    // FIXME: Instead of bailing when the shift is too large, use and to clear +    // the extra bits. +    if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) { +      // If shifting by the size of the original value in bits or more, it is +      // being filled with the sign bit, so shift by ASize-1 to avoid ub. +      Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));        Shift->takeName(Src);        return CastInst::CreateIntegerCast(Shift, CI.getType(), true);      } @@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {    // If we know that the value being extended is positive, we can use a zext    // instead. -  bool KnownZero, KnownOne; -  ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI); -  if (KnownZero) { +  KnownBits Known = computeKnownBits(Src, 0, &CI); +  if (Known.isNonNegative()) {      Value *ZExt = Builder->CreateZExt(Src, DestTy);      return replaceInstUsesWith(CI, ZExt);    } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 34ce235b3fe23..60ed4057ceddf 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {  }  /// Try to fold icmp (binop), X or icmp X, (binop). 
+/// TODO: A large part of this logic is duplicated in InstSimplify's +/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code +/// duplication.  Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {    Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {    if (!BO0 && !BO1)      return nullptr; -  CmpInst::Predicate Pred = I.getPredicate(); +  const CmpInst::Predicate Pred = I.getPredicate();    bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;    if (BO0 && isa<OverflowingBinaryOperator>(BO0))      NoOp0WrapProblem = @@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {      case Instruction::Sub:      case Instruction::Xor:        if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b -        return new ICmpInst(I.getPredicate(), BO0->getOperand(0), -                            BO1->getOperand(0)); +        return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));        // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {          if (CI->getValue().isSignMask()) { -          ICmpInst::Predicate Pred = +          ICmpInst::Predicate NewPred =                I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); -          return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); +          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));          }          if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { -          ICmpInst::Predicate Pred = +          ICmpInst::Predicate NewPred =                I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); -          Pred = I.getSwappedPredicate(Pred); -          return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); +          NewPred = I.getSwappedPredicate(NewPred); +          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));          }        }        break; @@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {                                     AP.getBitWidth() - AP.countTrailingZeros()));            Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);            Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); -          return new ICmpInst(I.getPredicate(), And1, And2); +          return new ICmpInst(Pred, And1, And2);          }        }        break; +      case Instruction::UDiv:      case Instruction::LShr: -      if (I.isSigned()) +      if (I.isSigned() || !BO0->isExact() || !BO1->isExact())          break; -      LLVM_FALLTHROUGH; +      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); +      case Instruction::SDiv: +      if (!I.isEquality() || !BO0->isExact() || !BO1->isExact()) +        break; +      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); +      case Instruction::AShr:        if (!BO0->isExact() || !BO1->isExact())          break; -      return new ICmpInst(I.getPredicate(), BO0->getOperand(0), -                          BO1->getOperand(0)); +      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); +      case Instruction::Shl: {        bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();        bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap(); @@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {          break;        if (!NSW && I.isSigned()) 
         break; -      return new ICmpInst(I.getPredicate(), BO0->getOperand(0), -                          BO1->getOperand(0)); +      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));      }      }    } @@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {      auto BitwiseAnd =          m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); -    if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { +    if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {        auto *Zero = Constant::getNullValue(BO0->getType());        return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);      } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 3be6419a129a4..1424f61fe7017 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -30,6 +30,7 @@  #include "llvm/IR/PatternMatch.h"  #include "llvm/Pass.h"  #include "llvm/Support/Dwarf.h" +#include "llvm/Support/KnownBits.h"  #include "llvm/Transforms/InstCombine/InstCombineWorklist.h"  #include "llvm/Transforms/Utils/Local.h" @@ -388,10 +389,21 @@ private:                                   bool DoTransform = true);    Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); -  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI); +  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { +    return computeOverflowForSignedAdd(LHS, RHS, &CxtI) == +           OverflowResult::NeverOverflows; +  }; +  bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { +    return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) == +           OverflowResult::NeverOverflows; +  };    bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);    bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);    bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI); +  bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) { +    return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) == +           OverflowResult::NeverOverflows; +  };    Value *EmitGEPOffset(User *GEP);    Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);    Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); @@ -492,7 +504,11 @@ public:    void computeKnownBits(Value *V, KnownBits &Known,                          unsigned Depth, Instruction *CxtI) const { -    return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); +    llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); +  } +  KnownBits computeKnownBits(Value *V, unsigned Depth, +                             Instruction *CxtI) const { +    return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);    }    bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0, @@ -503,11 +519,6 @@ public:                                Instruction *CxtI = nullptr) const {      return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);    } -  void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, -                      unsigned Depth = 0, Instruction *CxtI = nullptr) const { -    return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI, -                                &DT); -  }    OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,                                                 const Instruction *CxtI) {      return 
llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); @@ -516,6 +527,11 @@ public:                                                 const Instruction *CxtI) {      return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);    } +  OverflowResult computeOverflowForSignedAdd(const Value *LHS, +                                             const Value *RHS, +                                             const Instruction *CxtI) const { +    return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); +  }    /// Maximum size of array considered when transforming.    uint64_t MaxArraySizeForCombine; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 675553017838b..a4d84ae81aa02 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,    // first non-zero index.    auto IsAllNonNegative = [&]() {      for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { -      bool KnownNonNegative, KnownNegative; -      IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative, -                        KnownNegative, 0, MemI); -      if (KnownNonNegative) +      KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI); +      if (Known.isNonNegative())          continue;        return false;      } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index face9d9237ae1..2a35259f2103f 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,      // product is exactly the minimum negative number.      // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000      // For simplicity we just check if at least one side is not negative. -    bool LHSNonNegative, LHSNegative; -    bool RHSNonNegative, RHSNegative; -    ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI); -    ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI); -    if (LHSNonNegative || RHSNonNegative) +    KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); +    KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); +    if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())        return true;    }    return false; @@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {          Constant *CI =              ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());          if (ConstantExpr::getZExt(CI, I.getType()) == Op1C && -            computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) == -                OverflowResult::NeverOverflows) { +            willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {            // Insert the new, smaller mul.            
Value *NewMul =                Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv"); @@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {        if (Op0Conv->getOperand(0)->getType() ==                Op1Conv->getOperand(0)->getType() &&            (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) && -          computeOverflowForUnsignedMul(Op0Conv->getOperand(0), -                                        Op1Conv->getOperand(0), -                                        &I) == OverflowResult::NeverOverflows) { +          willNotOverflowUnsignedMul(Op0Conv->getOperand(0), +                                     Op1Conv->getOperand(0), I)) {          // Insert the new integer mul.          Value *NewMul = Builder->CreateNUWMul(              Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv"); @@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {      I.setHasNoSignedWrap(true);    } -  if (!I.hasNoUnsignedWrap() && -      computeOverflowForUnsignedMul(Op0, Op1, &I) == -          OverflowResult::NeverOverflows) { +  if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {      Changed = true;      I.setHasNoUnsignedWrap(true);    } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 05b01774cd5ef..4028a92771a49 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,          SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))        return I; -    unsigned Leaders = Known2.Zero.countLeadingOnes(); +    unsigned Leaders = Known2.countMinLeadingZeros();      Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;      break;    } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 1792cb585f878..65b1148cb03b5 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {    // Canonicalize fcmp_one -> fcmp_oeq    FCmpInst::Predicate FPred; Value *Y; -  if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), -                             TrueDest, FalseDest)) && -      BI.getCondition()->hasOneUse()) +  if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))), +                      TrueDest, FalseDest))) { +    // TODO: Why are we only transforming these 3 predicates?      
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||          FPred == FCmpInst::FCMP_OGE) {        FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); @@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {        Worklist.Add(Cond);        return &BI;      } +  }    // Canonicalize icmp_ne -> icmp_eq    ICmpInst::Predicate IPred; -  if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)), -                      TrueDest, FalseDest)) && -      BI.getCondition()->hasOneUse()) +  if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))), +                      TrueDest, FalseDest))) {      if (IPred == ICmpInst::ICMP_NE  || IPred == ICmpInst::ICMP_ULE ||          IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||          IPred == ICmpInst::ICMP_SGE) { @@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {        Worklist.Add(Cond);        return &BI;      } +  }    return nullptr;  } @@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {    unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();    KnownBits Known(BitWidth);    computeKnownBits(Cond, Known, 0, &SI); -  unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes(); -  unsigned LeadingKnownOnes = Known.One.countLeadingOnes(); +  unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); +  unsigned LeadingKnownOnes = Known.countMinLeadingOnes();    // Compute the number of leading bits we can ignore.    // TODO: A better way to determine this would use ComputeNumSignBits(). @@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,    // Lower dbg.declare intrinsics otherwise their value may be clobbered    // by instcombiner. -  bool DbgDeclaresChanged = LowerDbgDeclare(F); +  bool MadeIRChange = LowerDbgDeclare(F);    // Iterate while there is work to do.    
int Iteration = 0; @@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,      DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "                   << F.getName() << "\n"); -    bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist); +    MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);      InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,                      AA, AC, TLI, DT, DL, LI);      IC.MaxArraySizeForCombine = MaxArraySize; -    Changed |= IC.run(); -    if (!Changed) +    if (!IC.run())        break;    } -  return DbgDeclaresChanged || Iteration > 1; +  return MadeIRChange || Iteration > 1;  }  PreservedAnalyses InstCombinePass::run(Function &F, diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b034ccc469338..7eea44d6aca03 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -613,7 +613,15 @@ public:                                    bool UseGlobalsGC = true)        : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),          Recover(Recover || ClRecover), -        UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {} +        UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), +        // Not a typo: ClWithComdat is almost completely pointless without +        // ClUseGlobalsGC (because then it only works on modules without +        // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; +        // and both suffer from gold PR19002 for which UseGlobalsGC constructor +        // argument is designed as workaround. Therefore, disable both +        // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to +        // do globals-gc. +        UseCtorComdat(UseGlobalsGC && ClWithComdat) {}    bool runOnModule(Module &M) override;    static char ID; // Pass identification, replacement for typeid    StringRef getPassName() const override { return "AddressSanitizerModule"; } @@ -656,6 +664,7 @@ private:    bool CompileKernel;    bool Recover;    bool UseGlobalsGC; +  bool UseCtorComdat;    Type *IntptrTy;    LLVMContext *C;    Triple TargetTriple; @@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,                       : GlobalVariable::PrivateLinkage;    GlobalVariable *Metadata = new GlobalVariable(        M, Initializer->getType(), false, Linkage, Initializer, -      Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName)); +      Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));    Metadata->setSection(getGlobalMetadataSection());    return Metadata;  } @@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(    // On recent Mach-O platforms, use a structure which binds the liveness of    // the global variable to the metadata struct. 
Keep the list of "Liveness" GV    // created to be added to llvm.compiler.used -  StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr); +  StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);    SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());    for (size_t i = 0; i < ExtendedGlobals.size(); i++) { @@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(      // On recent Mach-O platforms, we emit the global metadata in a way that      // allows the linker to properly strip dead globals. -    auto LivenessBinder = ConstantStruct::get( -        LivenessTy, Initializer->getAggregateElement(0u), -        ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr); +    auto LivenessBinder = +        ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u), +                            ConstantExpr::getPointerCast(Metadata, IntptrTy));      GlobalVariable *Liveness = new GlobalVariable(          M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,          Twine("__asan_binder_") + G->getName()); @@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool    // We initialize an array of such structures and pass it to a run-time call.    StructType *GlobalStructTy =        StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, -                      IntptrTy, IntptrTy, IntptrTy, nullptr); +                      IntptrTy, IntptrTy, IntptrTy);    SmallVector<GlobalVariable *, 16> NewGlobals(n);    SmallVector<Constant *, 16> Initializers(n); @@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool      assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);      Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); -    StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); -    Constant *NewInitializer = -        ConstantStruct::get(NewTy, G->getInitializer(), -                            Constant::getNullValue(RightRedZoneTy), nullptr); +    StructType *NewTy = StructType::get(Ty, RightRedZoneTy); +    Constant *NewInitializer = ConstantStruct::get( +        NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));      // Create a new global variable with enough space for a redzone.      GlobalValue::LinkageTypes Linkage = G->getLinkage(); @@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool          ConstantExpr::getPointerCast(Name, IntptrTy),          ConstantExpr::getPointerCast(ModuleName, IntptrTy),          ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, -        ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr); +        ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));      if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; @@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {    // Put the constructor and destructor in comdat if both    // (1) global instrumentation is not TU-specific    // (2) target is ELF. 
-  if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { +  if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {      AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));      appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,                          AsanCtorFunction); diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 8786781933ea1..e2e3cbdbc295b 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {      ArgTypes.push_back(ShadowPtrTy);    Type *RetType = T->getReturnType();    if (!RetType->isVoidTy()) -    RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); +    RetType = StructType::get(RetType, ShadowTy);    return FunctionType::get(RetType, ArgTypes, T->isVarArg());  } @@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {      GetArgTLS = ConstantExpr::getIntToPtr(          ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),          PointerType::getUnqual( -            FunctionType::get(PointerType::getUnqual(ArgTLSTy), -                              (Type *)nullptr))); +            FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));    }    if (GetRetvalTLSPtr) {      RetvalTLS = nullptr;      GetRetvalTLS = ConstantExpr::getIntToPtr(          ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),          PointerType::getUnqual( -            FunctionType::get(PointerType::getUnqual(ShadowTy), -                              (Type *)nullptr))); +            FunctionType::get(PointerType::getUnqual(ShadowTy), false)));    }    ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 7dea1dee756ac..e89384c559fe0 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(    //   u64 *ArrayCounter;    // };    auto *StructInfoTy = -    StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, -                    Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr); +      StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, +                      Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);    auto *StructInfoPtrTy = StructInfoTy->getPointerTo();    // This structure should be kept consistent with the CacheFragInfo struct    // in the runtime library. 
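Illustrative aside (not part of the patch): the sanitizer hunks above and below all drop the trailing nullptr sentinel previously passed to StructType::get, ConstantStruct::get, and FunctionType::get, passing the elements (or the isVarArg flag) directly instead. A minimal sketch of the updated call style follows; TypeA/TypeB are placeholder element types invented for the example.

// Illustrative sketch, not part of the patch.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

static void sentinelFreeTypeSketch(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  Type *TypeA = Type::getInt8PtrTy(Ctx); // placeholder element type
  Type *TypeB = Type::getInt64Ty(Ctx);   // placeholder element type

  // Element types are passed directly; no trailing nullptr terminator.
  StructType *ST = StructType::get(TypeA, TypeB);

  // Zero-parameter function type: the old (Type *)nullptr terminator is
  // replaced by the explicit isVarArg flag.
  FunctionType *FT = FunctionType::get(TypeB, /*isVarArg=*/false);

  (void)ST;
  (void)FT;
}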
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(    //   u32 NumStructs;    //   StructInfo *Structs;    // }; -  auto *CacheFragInfoTy = -    StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr); +  auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);    std::vector<StructType *> Vec = M.getIdentifiedStructTypes();    unsigned NumStructs = 0; @@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(      ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);      ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,                                            getArrayCounterIdx(StructTy)); -    Initializers.push_back( -        ConstantStruct::get( -            StructInfoTy, -            ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), -            ConstantInt::get(Int32Ty, -                             DL.getStructLayout(StructTy)->getSizeInBytes()), -            ConstantInt::get(Int32Ty, StructTy->getNumElements()), -            Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) : -                ConstantExpr::getPointerCast(Offset, Int32PtrTy), -            Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) : -                ConstantExpr::getPointerCast(Size, Int32PtrTy), -            TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) : -                ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), -            ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, -                                           FieldCounterIdx), -            ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, -                                           ArrayCounterIdx), -            nullptr)); +    Initializers.push_back(ConstantStruct::get( +        StructInfoTy, +        ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), +        ConstantInt::get(Int32Ty, +                         DL.getStructLayout(StructTy)->getSizeInBytes()), +        ConstantInt::get(Int32Ty, StructTy->getNumElements()), +        Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) +                          : ConstantExpr::getPointerCast(Offset, Int32PtrTy), +        Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) +                        : ConstantExpr::getPointerCast(Size, Int32PtrTy), +        TypeName == nullptr +            ? ConstantPointerNull::get(Int8PtrPtrTy) +            : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), +        ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, +                                       FieldCounterIdx), +        ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, +                                       ArrayCounterIdx)));    }    // Structs.    
Constant *StructInfo; @@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(    auto *CacheFragInfoGV = new GlobalVariable(        M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage, -      ConstantStruct::get(CacheFragInfoTy, -                          UnitName, -                          ConstantInt::get(Int32Ty, NumStructs), -                          StructInfo, -                          nullptr)); +      ConstantStruct::get(CacheFragInfoTy, UnitName, +                          ConstantInt::get(Int32Ty, NumStructs), StructInfo));    return CacheFragInfoGV;  } diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 15333a5317dd2..ff753c20a94af 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {    Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,                            bool Signed = false) {      Type *srcTy = V->getType(); +    size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); +    size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); +    if (srcSizeInBits > 1 && dstSizeInBits == 1) +      return IRB.CreateICmpNE(V, getCleanShadow(V)); +      if (dstTy->isIntegerTy() && srcTy->isIntegerTy())        return IRB.CreateIntCast(V, dstTy, Signed);      if (dstTy->isVectorTy() && srcTy->isVectorTy() &&          dstTy->getVectorNumElements() == srcTy->getVectorNumElements())        return IRB.CreateIntCast(V, dstTy, Signed); -    size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); -    size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);      Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));      Value *V2 =        IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 3f1a77b49a442..ee493a8ec7e18 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {    bool Changed = false;    if (!NUW) { -    ConstantRange NUWRange = -            LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, -                                              OBO::NoUnsignedWrap); +    ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( +        BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);      if (!NUWRange.isEmptySet()) {        bool NewNUW = NUWRange.contains(LazyRRange());        AddOp->setHasNoUnsignedWrap(NewNUW); @@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {      }    }    if (!NSW) { -    ConstantRange NSWRange = -            LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, -                                              OBO::NoSignedWrap); +    ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( +        BinaryOperator::Add, LRange, OBO::NoSignedWrap);      if (!NSWRange.isEmptySet()) {        bool NewNSW = NSWRange.contains(LazyRRange());        AddOp->setHasNoSignedWrap(NewNSW); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 48d5ae88cda91..6693a26e8890f 100644 --- 
a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -144,6 +144,10 @@ private:    bool recognizePopcount();    void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,                                 PHINode *CntPhi, Value *Var); +  bool recognizeAndInsertCTLZ(); +  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, +                                PHINode *CntPhi, Value *Var, const DebugLoc DL, +                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);    /// @}  }; @@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,  }  bool LoopIdiomRecognize::runOnNoncountableLoop() { -  return recognizePopcount(); +  return recognizePopcount() || recognizeAndInsertCTLZ();  }  /// Check if the given conditional branch is based on the comparison between @@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,    return true;  } +/// Return true if the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) +///       or nullptr if there is no such. +/// 2) \p CntPhi is set to the corresponding phi node +///       or nullptr if there is no such. +/// 3) \p Var is set to the value whose CTLZ could be used. +/// 4) \p DefX is set to the instruction calculating Loop exit condition. +/// +/// The core idiom we are trying to detect is: +/// \code +///    if (x0 == 0) +///      goto loop-exit // the precondition of the loop +///    cnt0 = init-val; +///    do { +///       x = phi (x0, x.next);   //PhiX +///       cnt = phi(cnt0, cnt.next); +/// +///       cnt.next = cnt + 1; +///        ... +///       x.next = x >> 1;   // DefX +///        ... +///    } while(x.next != 0); +/// +/// loop-exit: +/// \endcode +static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, +                            Instruction *&CntInst, PHINode *&CntPhi, +                            Instruction *&DefX) { +  BasicBlock *LoopEntry; +  Value *VarX = nullptr; + +  DefX = nullptr; +  PhiX = nullptr; +  CntInst = nullptr; +  CntPhi = nullptr; +  LoopEntry = *(CurLoop->block_begin()); + +  // step 1: Check if the loop-back branch is in desirable form. +  if (Value *T = matchCondition( +          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) +    DefX = dyn_cast<Instruction>(T); +  else +    return false; + +  // step 2: detect instructions corresponding to "x.next = x >> 1" +  if (!DefX || DefX->getOpcode() != Instruction::AShr) +    return false; +  if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1))) +    if (!Shft || !Shft->isOne()) +      return false; +  VarX = DefX->getOperand(0); + +  // step 3: Check the recurrence of variable X +  PhiX = dyn_cast<PHINode>(VarX); +  if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX)) +    return false; + +  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 +  // TODO: We can skip the step. If loop trip count is known (CTLZ), +  //       then all uses of "cnt.next" could be optimized to the trip count +  //       plus "cnt0". Currently it is not optimized. 
+  //       This step could be used to detect POPCNT instruction:
+  //       cnt.next = cnt + (x.next & 1)
+  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                            IterE = LoopEntry->end();
+       Iter != IterE; Iter++) {
+    Instruction *Inst = &*Iter;
+    if (Inst->getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+    if (!Inc || !Inc->isOne())
+      continue;
+
+    PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+    if (!Phi || Phi->getParent() != LoopEntry)
+      continue;
+
+    CntInst = Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ trip count).
+/// Returns true if a CTLZ trip count was inserted; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Instruction *CntInst, *DefX;
+  PHINode *CntPhi, *PhiX;
+  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+    return false;
+
+  bool IsCntPhiUsedOutsideLoop = false;
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntPhiUsedOutsideLoop = true;
+      break;
+    }
+  bool IsCntInstUsedOutsideLoop = false;
+  for (User *U : CntInst->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntInstUsedOutsideLoop = true;
+      break;
+    }
+  // If both CntInst and CntPhi are used outside the loop, the profitability
+  // is questionable.
+  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+    return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+  // when X is 0. If we cannot guarantee X != 0, we need to insert this
+  // check when expanding the intrinsic.
+  bool ZeroCheck = false;
+  // It is safe to assume the Preheader exists, as it was checked in
+  // the parent function RunOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop, we don't need a zero
+  // check in the CTLZ intrinsic.
+  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+    if (BranchInst *PreCondBr =
+        dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+      if (matchCondition(PreCondBr, PH) == InitX)
+        ZeroCheck = true;
+    }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we delete the loop (the loop has only 6 instructions):
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp eq %shr, 0
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+
+  IRBuilder<> Builder(PH->getTerminator());
+  SmallVector<const Value *, 2> Ops =
+      {InitX, ZeroCheck ? 
Builder.getTrue() : Builder.getFalse()}; +  ArrayRef<const Value *> Args(Ops); +  if (CurLoop->getHeader()->size() != 6 && +      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > +          TargetTransformInfo::TCC_Basic) +    return false; + +  const DebugLoc DL = DefX->getDebugLoc(); +  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck, +                           IsCntPhiUsedOutsideLoop); +  return true; +} +  /// Recognizes a population count idiom in a non-countable loop.  ///  /// If detected, transforms the relevant code to issue the popcount intrinsic @@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,    return CI;  } +static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, +                                     const DebugLoc &DL, bool ZeroCheck) { +  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; +  Type *Tys[] = {Val->getType()}; + +  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); +  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); +  CallInst *CI = IRBuilder.CreateCall(Func, Ops); +  CI->setDebugLoc(DL); + +  return CI; +} + +/// Transform the following loop: +/// loop: +///   CntPhi = PHI [Cnt0, CntInst] +///   PhiX = PHI [InitX, DefX] +///   CntInst = CntPhi + 1 +///   DefX = PhiX >> 1 +//    LOOP_BODY +///   Br: loop if (DefX != 0) +/// Use(CntPhi) or Use(CntInst) +/// +/// Into: +/// If CntPhi used outside the loop: +///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +///   Count = CountPrev + 1 +/// else +///   Count = BitWidth(InitX) - CTLZ(InitX) +/// loop: +///   CntPhi = PHI [Cnt0, CntInst] +///   PhiX = PHI [InitX, DefX] +///   PhiCount = PHI [Count, Dec] +///   CntInst = CntPhi + 1 +///   DefX = PhiX >> 1 +///   Dec = PhiCount - 1 +///   LOOP_BODY +///   Br: loop if (Dec != 0) +/// Use(CountPrev + Cnt0) // Use(CntPhi) +/// or +/// Use(Count + Cnt0) // Use(CntInst) +/// +/// If LOOP_BODY is empty the loop will be deleted. +/// If CntInst and DefX are not used in LOOP_BODY they will be removed. 
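Illustrative aside (not part of the patch): as a concrete instance of the idiom described for detectCTLZIdiom and of the countable form documented in the comment block above, the following standalone sketch pairs the shift-until-zero loop with the closed-form count the transformation computes. __builtin_clz merely stands in for the llvm.ctlz intrinsic and, like it, is undefined for a zero input, which is what the ZeroCheck flag guards against; the example uses an unsigned value, so the ashr the recognizer matches behaves like the plain shift shown here.

// Illustrative sketch, not part of the patch.
#include <cassert>
#include <cstdint>

// The loop shape the recognizer looks for: count how many times X can be
// shifted right before it becomes zero.
static unsigned countShiftsLoop(uint32_t X0, unsigned Cnt0) {
  uint32_t X = X0;
  unsigned Cnt = Cnt0;
  do {
    Cnt += 1; // cnt.next = cnt + 1
    X >>= 1;  // x.next = x >> 1
  } while (X != 0);
  return Cnt;
}

// The countable form: Count = BitWidth(X0) - CTLZ(X0), plus the counter's
// initial value. Undefined for X0 == 0, matching the ZeroCheck discussion.
static unsigned countShiftsClosedForm(uint32_t X0, unsigned Cnt0) {
  return Cnt0 + (32u - __builtin_clz(X0));
}

static void ctlzIdiomCheck() {
  assert(countShiftsLoop(16, 0) == 5);
  assert(countShiftsClosedForm(16, 0) == 5);
}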
+void LoopIdiomRecognize::transformLoopToCountable( +    BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, +    const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { +  BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator()); + +  // Step 1: Insert the CTLZ instruction at the end of the preheader block +  //   Count = BitWidth - CTLZ(InitX); +  // If there are uses of CntPhi create: +  //   CountPrev = BitWidth - CTLZ(InitX >> 1); +  IRBuilder<> Builder(PreheaderBr); +  Builder.SetCurrentDebugLocation(DL); +  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; + +  if (IsCntPhiUsedOutsideLoop) +    InitXNext = Builder.CreateAShr(InitX, +                                   ConstantInt::get(InitX->getType(), 1)); +  else +    InitXNext = InitX; +  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); +  Count = Builder.CreateSub( +      ConstantInt::get(CTLZ->getType(), +                       CTLZ->getType()->getIntegerBitWidth()), +      CTLZ); +  if (IsCntPhiUsedOutsideLoop) { +    CountPrev = Count; +    Count = Builder.CreateAdd( +        CountPrev, +        ConstantInt::get(CountPrev->getType(), 1)); +  } +  if (IsCntPhiUsedOutsideLoop) +    NewCount = Builder.CreateZExtOrTrunc(CountPrev, +        cast<IntegerType>(CntInst->getType())); +  else +    NewCount = Builder.CreateZExtOrTrunc(Count, +        cast<IntegerType>(CntInst->getType())); + +  // If the CTLZ counter's initial value is not zero, insert Add Inst. +  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); +  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); +  if (!InitConst || !InitConst->isZero()) +    NewCount = Builder.CreateAdd(NewCount, CntInitVal); + +  // Step 2: Insert new IV and loop condition: +  // loop: +  //   ... +  //   PhiCount = PHI [Count, Dec] +  //   ... +  //   Dec = PhiCount - 1 +  //   ... +  //   Br: loop if (Dec != 0) +  BasicBlock *Body = *(CurLoop->block_begin()); +  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); +  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); +  Type *Ty = Count->getType(); + +  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + +  Builder.SetInsertPoint(LbCond); +  Instruction *TcDec = cast<Instruction>( +      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), +                        "tcdec", false, true)); + +  TcPhi->addIncoming(Count, Preheader); +  TcPhi->addIncoming(TcDec, Body); + +  CmpInst::Predicate Pred = +      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; +  LbCond->setPredicate(Pred); +  LbCond->setOperand(0, TcDec); +  LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + +  // Step 3: All the references to the original counter outside +  //  the loop are replaced with the NewCount -- the value returned from +  //  __builtin_ctlz(x). +  if (IsCntPhiUsedOutsideLoop) +    CntPhi->replaceUsesOutsideBlock(NewCount, Body); +  else +    CntInst->replaceUsesOutsideBlock(NewCount, Body); + +  // step 4: Forget the "non-computable" trip-count SCEV associated with the +  //   loop. The loop would otherwise not be deleted even if it becomes empty. 
+  SE->forgetLoop(CurLoop); +} +  void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,                                                   Instruction *CntInst,                                                   PHINode *CntPhi, Value *Var) { diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 3c9850b156ac1..5e0a705782ea0 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -283,7 +283,6 @@ public:    // Forward propagation info    const Expression *getDefiningExpr() const { return DefiningExpr; } -  void setDefiningExpr(const Expression *E) { DefiningExpr = E; }    // Value member set    bool empty() const { return Members.empty(); } @@ -317,6 +316,9 @@ public:      --StoreCount;    } +  // True if this class has no memory members. +  bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } +    // Return true if two congruence classes are equivalent to each other.  This    // means    // that every field but the ID number and the dead field are equivalent. @@ -401,9 +403,12 @@ class NewGVN {    MemorySSAWalker *MSSAWalker;    const DataLayout &DL;    std::unique_ptr<PredicateInfo> PredInfo; -  BumpPtrAllocator ExpressionAllocator; -  ArrayRecycler<Value *> ArgRecycler; -  TarjanSCC SCCFinder; + +  // These are the only two things the create* functions should have +  // side-effects on due to allocating memory. +  mutable BumpPtrAllocator ExpressionAllocator; +  mutable ArrayRecycler<Value *> ArgRecycler; +  mutable TarjanSCC SCCFinder;    const SimplifyQuery SQ;    // Number of function arguments, used by ranking @@ -430,11 +435,12 @@ class NewGVN {    // In order to correctly ensure propagation, we must keep track of what    // comparisons we used, so that when the values of the comparisons change, we    // propagate the information to the places we used the comparison. -  DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers; -  // Mapping from MemoryAccess we used to the MemoryAccess we used it with.  Has +  mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> +      PredicateToUsers;    // the same reasoning as PredicateToUsers.  When we skip MemoryAccesses for    // stores, we no longer can rely solely on the def-use chains of MemorySSA. -  DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers; +  mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> +      MemoryToUsers;    // A table storing which memorydefs/phis represent a memory state provably    // equivalent to another memory state. @@ -457,7 +463,7 @@ class NewGVN {    DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;    enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle }; -  DenseMap<const PHINode *, PhiCycleState> PhiCycleState; +  mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;    // Expression to class mapping.    using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;    ExpressionClassMap ExpressionToClass; @@ -511,21 +517,24 @@ public:  private:    // Expression handling. 
-  const Expression *createExpression(Instruction *); -  const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *); -  PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge, -                                     bool &AllConstant); -  const VariableExpression *createVariableExpression(Value *); -  const ConstantExpression *createConstantExpression(Constant *); -  const Expression *createVariableOrConstant(Value *V); -  const UnknownExpression *createUnknownExpression(Instruction *); +  const Expression *createExpression(Instruction *) const; +  const Expression *createBinaryExpression(unsigned, Type *, Value *, +                                           Value *) const; +  PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, +                                     bool &AllConstant) const; +  const VariableExpression *createVariableExpression(Value *) const; +  const ConstantExpression *createConstantExpression(Constant *) const; +  const Expression *createVariableOrConstant(Value *V) const; +  const UnknownExpression *createUnknownExpression(Instruction *) const;    const StoreExpression *createStoreExpression(StoreInst *, -                                               const MemoryAccess *); +                                               const MemoryAccess *) const;    LoadExpression *createLoadExpression(Type *, Value *, LoadInst *, -                                       const MemoryAccess *); -  const CallExpression *createCallExpression(CallInst *, const MemoryAccess *); -  const AggregateValueExpression *createAggregateValueExpression(Instruction *); -  bool setBasicExpressionInfo(Instruction *, BasicExpression *); +                                       const MemoryAccess *) const; +  const CallExpression *createCallExpression(CallInst *, +                                             const MemoryAccess *) const; +  const AggregateValueExpression * +  createAggregateValueExpression(Instruction *) const; +  bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;    // Congruence class handling.    CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) { @@ -560,17 +569,18 @@ private:    // Symbolic evaluation.    
const Expression *checkSimplificationResults(Expression *, Instruction *, -                                               Value *); -  const Expression *performSymbolicEvaluation(Value *); +                                               Value *) const; +  const Expression *performSymbolicEvaluation(Value *) const;    const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *, -                                                Instruction *, MemoryAccess *); -  const Expression *performSymbolicLoadEvaluation(Instruction *); -  const Expression *performSymbolicStoreEvaluation(Instruction *); -  const Expression *performSymbolicCallEvaluation(Instruction *); -  const Expression *performSymbolicPHIEvaluation(Instruction *); -  const Expression *performSymbolicAggrValueEvaluation(Instruction *); -  const Expression *performSymbolicCmpEvaluation(Instruction *); -  const Expression *performSymbolicPredicateInfoEvaluation(Instruction *); +                                                Instruction *, +                                                MemoryAccess *) const; +  const Expression *performSymbolicLoadEvaluation(Instruction *) const; +  const Expression *performSymbolicStoreEvaluation(Instruction *) const; +  const Expression *performSymbolicCallEvaluation(Instruction *) const; +  const Expression *performSymbolicPHIEvaluation(Instruction *) const; +  const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; +  const Expression *performSymbolicCmpEvaluation(Instruction *) const; +  const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;    // Congruence finding.    bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -620,8 +630,8 @@ private:    void markPredicateUsersTouched(Instruction *);    void markValueLeaderChangeTouched(CongruenceClass *CC);    void markMemoryLeaderChangeTouched(CongruenceClass *CC); -  void addPredicateUsers(const PredicateBase *, Instruction *); -  void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U); +  void addPredicateUsers(const PredicateBase *, Instruction *) const; +  void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;    // Main loop of value numbering    void iterateTouchedInstructions(); @@ -634,7 +644,7 @@ private:    void verifyIterationSettled(Function &F);    bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;    BasicBlock *getBlockForValue(Value *V) const; -  void deleteExpression(const Expression *E); +  void deleteExpression(const Expression *E) const;    unsigned InstrToDFSNum(const Value *V) const {      assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");      return InstrDFS.lookup(V); @@ -654,7 +664,7 @@ private:                 ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())                 : InstrDFS.lookup(MA);    } -  bool isCycleFree(const PHINode *PN); +  bool isCycleFree(const PHINode *PN) const;    template <class T, class Range> T *getMinDFSOfRange(const Range &) const;    // Debug counter info.  When verifying, we have to reset the value numbering    // debug counter to the same state it started in to get the same results. @@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const {  // Delete a definitely dead expression, so it can be reused by the expression  // allocator.  Some of these are not in creation functions, so we have to accept  // const versions. 
-void NewGVN::deleteExpression(const Expression *E) { +void NewGVN::deleteExpression(const Expression *E) const {    assert(isa<BasicExpression>(E));    auto *BE = cast<BasicExpression>(E);    const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler); @@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) {  }  PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, -                                           bool &AllConstant) { +                                           bool &AllConstant) const {    BasicBlock *PHIBlock = I->getParent();    auto *PN = cast<PHINode>(I);    auto *E = @@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,    unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock)); +  // NewGVN assumes the operands of a PHI node are in a consistent order across +  // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix +  // this in LLVM at some point we don't want GVN to find wrong congruences. +  // Therefore, here we sort uses in predecessor order. +  // We're sorting the values by pointer. In theory this might be cause of +  // non-determinism, but here we don't rely on the ordering for anything +  // significant, e.g. we don't create new instructions based on it so we're +  // fine. +  SmallVector<const Use *, 4> PHIOperands; +  for (const Use &U : PN->operands()) +    PHIOperands.push_back(&U); +  std::sort(PHIOperands.begin(), PHIOperands.end(), +            [&](const Use *U1, const Use *U2) { +              return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); +            }); +    // Filter out unreachable phi operands. -  auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) { -    return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock}); +  auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { +    return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});    });    std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), -                 [&](const Use &U) -> Value * { -                   auto *BB = PN->getIncomingBlock(U); +                 [&](const Use *U) -> Value * { +                   auto *BB = PN->getIncomingBlock(*U);                     auto *DTN = DT->getNode(BB);                     if (RPOOrdering.lookup(DTN) >= PHIRPO)                       HasBackedge = true; -                   AllConstant &= isa<UndefValue>(U) || isa<Constant>(U); +                   AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);                     // Don't try to transform self-defined phis. -                   if (U == PN) +                   if (*U == PN)                       return PN; -                   return lookupOperandLeader(U); +                   return lookupOperandLeader(*U);                   });    return E;  }  // Set basic expression info (Arguments, type, opcode) for Expression  // E from Instruction I in block B. 
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { +bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {    bool AllConstant = true;    if (auto *GEP = dyn_cast<GetElementPtrInst>(I))      E->setType(GEP->getSourceElementType()); @@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {  }  const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, -                                                 Value *Arg1, Value *Arg2) { +                                                 Value *Arg1, +                                                 Value *Arg2) const {    auto *E = new (ExpressionAllocator) BasicExpression(2);    E->setType(T); @@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,  // TODO: Once finished, this should not take an Instruction, we only  // use it for printing.  const Expression *NewGVN::checkSimplificationResults(Expression *E, -                                                     Instruction *I, Value *V) { +                                                     Instruction *I, +                                                     Value *V) const {    if (!V)      return nullptr;    if (auto *C = dyn_cast<Constant>(V)) { @@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,    return nullptr;  } -const Expression *NewGVN::createExpression(Instruction *I) { +const Expression *NewGVN::createExpression(Instruction *I) const {    auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());    bool AllConstant = setBasicExpressionInfo(I, E); @@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) {  }  const AggregateValueExpression * -NewGVN::createAggregateValueExpression(Instruction *I) { +NewGVN::createAggregateValueExpression(Instruction *I) const {    if (auto *II = dyn_cast<InsertValueInst>(I)) {      auto *E = new (ExpressionAllocator)          AggregateValueExpression(I->getNumOperands(), II->getNumIndices()); @@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) {    llvm_unreachable("Unhandled type of aggregate value operation");  } -const VariableExpression *NewGVN::createVariableExpression(Value *V) { +const VariableExpression *NewGVN::createVariableExpression(Value *V) const {    auto *E = new (ExpressionAllocator) VariableExpression(V);    E->setOpcode(V->getValueID());    return E;  } -const Expression *NewGVN::createVariableOrConstant(Value *V) { +const Expression *NewGVN::createVariableOrConstant(Value *V) const {    if (auto *C = dyn_cast<Constant>(V))      return createConstantExpression(C);    return createVariableExpression(V);  } -const ConstantExpression *NewGVN::createConstantExpression(Constant *C) { +const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {    auto *E = new (ExpressionAllocator) ConstantExpression(C);    E->setOpcode(C->getValueID());    return E;  } -const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) { +const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {    auto *E = new (ExpressionAllocator) UnknownExpression(I);    E->setOpcode(I->getOpcode());    return E;  } -const CallExpression *NewGVN::createCallExpression(CallInst *CI, -                                                   const MemoryAccess *MA) { +const CallExpression * +NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {    // FIXME: Add operand 
bundles for calls.    auto *E =        new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA); @@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const {  const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {    auto *CC = getMemoryClass(MA);    assert(CC->getMemoryLeader() && -         "Every MemoryAccess should be mapped to a " -         "congruence class with a represenative memory " -         "access"); +         "Every MemoryAccess should be mapped to a congruence class with a " +         "representative memory access");    return CC->getMemoryLeader();  } @@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {  LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,                                               LoadInst *LI, -                                             const MemoryAccess *MA) { +                                             const MemoryAccess *MA) const {    auto *E =        new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));    E->allocateOperands(ArgRecycler, ExpressionAllocator); @@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,    return E;  } -const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, -                                                     const MemoryAccess *MA) { +const StoreExpression * +NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {    auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());    auto *E = new (ExpressionAllocator)        StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA); @@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,    return E;  } -const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {    // Unlike loads, we never try to eliminate stores, so we do not check if they    // are simple and avoid value numbering them.    auto *SI = cast<StoreInst>(I); @@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {  const Expression *  NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,                                      LoadInst *LI, Instruction *DepInst, -                                    MemoryAccess *DefiningAccess) { +                                    MemoryAccess *DefiningAccess) const {    assert((!LI || LI->isSimple()) && "Not a simple load");    if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {      // Can't forward from non-atomic to atomic without violating memory model. 
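As an illustrative aside, not part of the patch itself: the recurring change in this NewGVN hunk is marking the expression-construction and symbolic-evaluation helpers 'const'. Below is a minimal, self-contained sketch of the general C++ pattern that allows this -- keeping memoization state in 'mutable' members so a logically read-only helper can still allocate and cache. The class and names are invented for illustration; whether NewGVN relies on exactly this mechanism is an assumption.

// Hypothetical sketch; none of these names come from NewGVN.
#include <map>
#include <string>

class SymbolicEvaluator {
  // Bookkeeping that a logically read-only evaluation may still touch.
  // Marking it 'mutable' is what lets evaluate() below be declared 'const'.
  mutable std::map<int, std::string> ExpressionCache;

public:
  const std::string &evaluate(int ValueId) const {
    auto It = ExpressionCache.find(ValueId);
    if (It == ExpressionCache.end())
      It = ExpressionCache.emplace(ValueId, "expr#" + std::to_string(ValueId))
               .first;
    return It->second;
  }
};

// Callers can then hold the evaluator by const reference or pointer without
// giving up memoization across queries.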
@@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,    return nullptr;  } -const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {    auto *LI = cast<LoadInst>(I);    // We can eliminate in favor of non-simple loads, but we won't be able to @@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {  }  const Expression * -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { +NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {    auto *PI = PredInfo->getPredicateInfoFor(I);    if (!PI)      return nullptr; @@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {      return nullptr;    if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { -    DEBUG(dbgs() << "Copy is not of any condition operands!"); +    DEBUG(dbgs() << "Copy is not of any condition operands!\n");      return nullptr;    }    Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); @@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {  }  // Evaluate read only and pure calls, and create an expression result. -const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {    auto *CI = cast<CallInst>(I);    if (auto *II = dyn_cast<IntrinsicInst>(I)) {      // Instrinsics with the returned attribute are copies of arguments. @@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,    DEBUG(dbgs() << "Setting " << *From);    DEBUG(dbgs() << " equivalent to congruence class ");    DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); -  DEBUG(dbgs() << *NewClass->getMemoryLeader()); -  DEBUG(dbgs() << "\n"); +  DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");    auto LookupResult = MemoryAccessToClass.find(From);    bool Changed = false; @@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,          NewClass->memory_insert(MP);          // This may have killed the class if it had no non-memory members          if (OldClass->getMemoryLeader() == From) { -          if (OldClass->memory_empty()) { +          if (OldClass->definesNoMemory()) {              OldClass->setMemoryLeader(nullptr);            } else {              OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); @@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,  // Determine if a phi is cycle-free.  That means the values in the phi don't  // depend on any expressions that can change value as a result of the phi.  // For example, a non-cycle free phi would be  v = phi(0, v+1). -bool NewGVN::isCycleFree(const PHINode *PN) { +bool NewGVN::isCycleFree(const PHINode *PN) const {    // In order to compute cycle-freeness, we do SCC finding on the phi, and see    // what kind of SCC it ends up in.  If it is a singleton, it is cycle-free.    // If it is not in a singleton, it is only cycle free if the other members are @@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) {  }  // Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {    // True if one of the incoming phi edges is a backedge.    
bool HasBackedge = false;    // All constant tracks the state of whether all the *original* phi operands @@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {    return E;  } -const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { +const Expression * +NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {    if (auto *EI = dyn_cast<ExtractValueInst>(I)) {      auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());      if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) { @@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {    return createAggregateValueExpression(I);  } -const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {    auto *CI = dyn_cast<CmpInst>(I);    // See if our operands are equal to those of a previous predicate, and if so,    // if it implies true or false. @@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {  }  // Substitute and symbolize the value before value numbering. -const Expression *NewGVN::performSymbolicEvaluation(Value *V) { +const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {    const Expression *E = nullptr;    if (auto *C = dyn_cast<Constant>(V))      E = createConstantExpression(C); @@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) {    }  } -void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) { +void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {    DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");    MemoryToUsers[To].insert(U);  } @@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {  }  // Add I to the set of users of a given predicate. -void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) { +void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {    if (auto *PBranch = dyn_cast<PredicateBranch>(PB))      PredicateToUsers[PBranch->Condition].insert(I);    else if (auto *PAssume = dyn_cast<PredicateBranch>(PB)) @@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {    // TODO: If this ends up to slow, we can maintain a next memory leader like we    // do for regular leaders.    
// Make sure there will be a leader to find -  assert((CC->getStoreCount() > 0 || !CC->memory_empty()) && -         "Can't get next leader if there is none"); +  assert(!CC->definesNoMemory() && "Can't get next leader if there is none");    if (CC->getStoreCount() > 0) {      if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))        return MSSA->getMemoryAccess(NL); @@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,    setMemoryClass(InstMA, NewClass);    // Now, fixup the old class if necessary    if (OldClass->getMemoryLeader() == InstMA) { -    if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) { +    if (!OldClass->definesNoMemory()) {        OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));        DEBUG(dbgs() << "Memory class leader change for class "                     << OldClass->getID() << " to " @@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,      if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {        // If it's a store expression we are using, it means we are not equivalent        // to something earlier. -      if (isa<StoreExpression>(E)) { -        assert(lookupOperandLeader(SI->getValueOperand()) != -               NewClass->getLeader()); -        NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); +      if (auto *SE = dyn_cast<StoreExpression>(E)) { +        assert(SE->getStoredValue() != NewClass->getLeader()); +        NewClass->setStoredValue(SE->getStoredValue());          markValueLeaderChangeTouched(NewClass);          // Shift the new class leader to be the store          DEBUG(dbgs() << "Changing leader of congruence class " @@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,    // See if we destroyed the class or need to swap leaders.    if (OldClass->empty() && OldClass != TOPClass) {      if (OldClass->getDefiningExpr()) { -      DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr() +      DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()                     << " from table\n");        ExpressionToClass.erase(OldClass->getDefiningExpr());      } @@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {        } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {          StoreInst *SI = SE->getStoreInst();          NewClass->setLeader(SI); -        NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); +        NewClass->setStoredValue(SE->getStoredValue());          // The RepMemoryAccess field will be filled in properly by the          // moveValueToNewCongruenceClass call.        } else { @@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const {            return false;          if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))            return !isInstructionTriviallyDead(MemDef->getMemoryInst()); + +        // We could have phi nodes which operands are all trivially dead, +        // so we don't process them. 
+        if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) { +          for (auto &U : MemPHI->incoming_values()) { +            if (Instruction *I = dyn_cast<Instruction>(U.get())) { +              if (!isInstructionTriviallyDead(I)) +                return true; +            } +          } +          return false; +        } +          return true;        }; diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index fb1b47c48276c..4f608c97147db 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,  /// Update the dominator tree after removing one exiting predecessor of a loop  /// exit block.  static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, -                                            DominatorTree &DT) { +                               DominatorTree &DT) {    assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&           "Cannot have empty predecessors of the loop exit block if we split "           "off a block to unswitch!"); @@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,    }  } +/// Check that all the LCSSA PHI nodes in the loop exit block have trivial +/// incoming values along this edge. +static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, +                                         BasicBlock &ExitBB) { +  for (Instruction &I : ExitBB) { +    auto *PN = dyn_cast<PHINode>(&I); +    if (!PN) +      // No more PHIs to check. +      return true; + +    // If the incoming value for this edge isn't loop invariant the unswitch +    // won't be trivial. +    if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) +      return false; +  } +  llvm_unreachable("Basic blocks should never be empty!"); +} + +/// Rewrite the PHI nodes in an unswitched loop exit basic block. +/// +/// Requires that the loop exit and unswitched basic block are the same, and +/// that the exiting block was a unique predecessor of that block. Rewrites the +/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial +/// PHI nodes from the old preheader that now contains the unswitched +/// terminator. +static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, +                                                  BasicBlock &OldExitingBB, +                                                  BasicBlock &OldPH) { +  for (Instruction &I : UnswitchedBB) { +    auto *PN = dyn_cast<PHINode>(&I); +    if (!PN) +      // No more PHIs to check. +      break; + +    // When the loop exit is directly unswitched we just need to update the +    // incoming basic block. We loop to handle weird cases with repeated +    // incoming blocks, but expect to typically only have one operand here. +    for (auto i : llvm::seq<int>(0, PN->getNumOperands())) { +      assert(PN->getIncomingBlock(i) == &OldExitingBB && +             "Found incoming block different from unique predecessor!"); +      PN->setIncomingBlock(i, &OldPH); +    } +  } +} + +/// Rewrite the PHI nodes in the loop exit basic block and the split off +/// unswitched block. +/// +/// Because the exit block remains an exit from the loop, this rewrites the +/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI +/// nodes into the unswitched basic block to select between the value in the +/// old preheader and the loop exit. 
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, +                                                      BasicBlock &UnswitchedBB, +                                                      BasicBlock &OldExitingBB, +                                                      BasicBlock &OldPH) { +  assert(&ExitBB != &UnswitchedBB && +         "Must have different loop exit and unswitched blocks!"); +  Instruction *InsertPt = &*UnswitchedBB.begin(); +  for (Instruction &I : ExitBB) { +    auto *PN = dyn_cast<PHINode>(&I); +    if (!PN) +      // No more PHIs to check. +      break; + +    auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, +                                  PN->getName() + ".split", InsertPt); + +    // Walk backwards over the old PHI node's inputs to minimize the cost of +    // removing each one. We have to do this weird loop manually so that we +    // create the same number of new incoming edges in the new PHI as we expect +    // each case-based edge to be included in the unswitched switch in some +    // cases. +    // FIXME: This is really, really gross. It would be much cleaner if LLVM +    // allowed us to create a single entry for a predecessor block without +    // having separate entries for each "edge" even though these edges are +    // required to produce identical results. +    for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { +      if (PN->getIncomingBlock(i) != &OldExitingBB) +        continue; + +      Value *Incoming = PN->removeIncomingValue(i); +      NewPN->addIncoming(Incoming, &OldPH); +    } + +    // Now replace the old PHI with the new one and wire the old one in as an +    // input to the new one. +    PN->replaceAllUsesWith(NewPN); +    NewPN->addIncoming(PN, &ExitBB); +  } +} +  /// Unswitch a trivial branch if the condition is loop invariant.  ///  /// This routine should only be called when loop code leading to the branch has @@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,    assert(L.contains(ContinueBB) &&           "Cannot have both successors exit and still be in the loop!"); -  // If the loop exit block contains phi nodes, this isn't trivial. -  // FIXME: We should examine the PHI to determine whether or not we can handle -  // it trivially. -  if (isa<PHINode>(LoopExitBB->begin())) +  auto *ParentBB = BI.getParent(); +  if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))      return false;    DEBUG(dbgs() << "    unswitching trivial branch when: " << CondVal @@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,    BasicBlock *UnswitchedBB;    if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {      (void)PredBB; -    assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!"); +    assert(PredBB == BI.getParent() && +           "A branch's parent isn't a predecessor!");      UnswitchedBB = LoopExitBB;    } else {      UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);    } -  BasicBlock *ParentBB = BI.getParent(); -    // Now splice the branch to gate reaching the new preheader and re-point its    // successors.    OldPH->getInstList().splice(std::prev(OldPH->end()), @@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,    // terminator.    BranchInst::Create(ContinueBB, ParentBB); +  // Rewrite the relevant PHI nodes. 
+  if (UnswitchedBB == LoopExitBB) +    rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); +  else +    rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, +                                              *ParentBB, *OldPH); +    // Now we need to update the dominator tree.    updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);    // But if we split something off of the loop exit block then we also removed @@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,    if (!L.isLoopInvariant(LoopCond))      return false; +  auto *ParentBB = SI.getParent(); +    // FIXME: We should compute this once at the start and update it!    SmallVector<BasicBlock *, 16> ExitBlocks;    L.getExitBlocks(ExitBlocks); @@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,    SmallVector<int, 4> ExitCaseIndices;    for (auto Case : SI.cases()) {      auto *SuccBB = Case.getCaseSuccessor(); -    if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin())) +    if (ExitBlockSet.count(SuccBB) && +        areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))        ExitCaseIndices.push_back(Case.getCaseIndex());    }    BasicBlock *DefaultExitBB = nullptr;    if (ExitBlockSet.count(SI.getDefaultDest()) && -      !isa<PHINode>(SI.getDefaultDest()->begin()) && +      areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&        !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))      DefaultExitBB = SI.getDefaultDest();    else if (ExitCaseIndices.empty()) @@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,      if (CommonSuccBB) {        SI.setDefaultDest(CommonSuccBB);      } else { -      BasicBlock *ParentBB = SI.getParent();        BasicBlock *UnreachableBB = BasicBlock::Create(            ParentBB->getContext(),            Twine(ParentBB->getName()) + ".unreachable_default", @@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,    // Now add the unswitched switch.    auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); -  // Split any exit blocks with remaining in-loop predecessors. We walk in -  // reverse so that we split in the same order as the cases appeared. This is -  // purely for convenience of reading the resulting IR, but it doesn't cost -  // anything really. +  // Rewrite the IR for the unswitched basic blocks. This requires two steps. +  // First, we split any exit blocks with remaining in-loop predecessors. Then +  // we update the PHIs in one of two ways depending on if there was a split. +  // We walk in reverse so that we split in the same order as the cases +  // appeared. This is purely for convenience of reading the resulting IR, but +  // it doesn't cost anything really. +  SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;    SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;    // Handle the default exit if necessary.    // FIXME: It'd be great if we could merge this with the loop below but LLVM's    // ranges aren't quite powerful enough yet. 
-  if (DefaultExitBB && !pred_empty(DefaultExitBB)) { -    auto *SplitBB = -        SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); -    updateLoopExitIDom(DefaultExitBB, L, DT); -    DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; +  if (DefaultExitBB) { +    if (pred_empty(DefaultExitBB)) { +      UnswitchedExitBBs.insert(DefaultExitBB); +      rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); +    } else { +      auto *SplitBB = +          SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); +      rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, +                                                *ParentBB, *OldPH); +      updateLoopExitIDom(DefaultExitBB, L, DT); +      DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; +    }    }    // Note that we must use a reference in the for loop so that we update the    // container.    for (auto &CasePair : reverse(ExitCases)) {      // Grab a reference to the exit block in the pair so that we can update it. -    BasicBlock *&ExitBB = CasePair.second; +    BasicBlock *ExitBB = CasePair.second;      // If this case is the last edge into the exit block, we can simply reuse it      // as it will no longer be a loop exit. No mapping necessary. -    if (pred_empty(ExitBB)) +    if (pred_empty(ExitBB)) { +      // Only rewrite once. +      if (UnswitchedExitBBs.insert(ExitBB).second) +        rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);        continue; +    }      // Otherwise we need to split the exit block so that we retain an exit      // block from the loop and a target for the unswitched condition. @@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,      if (!SplitExitBB) {        // If this is the first time we see this, do the split and remember it.        SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); +      rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, +                                                *ParentBB, *OldPH);        updateLoopExitIDom(ExitBB, L, DT);      } -    ExitBB = SplitExitBB; +    // Update the case pair to point to the split block. +    CasePair.second = SplitExitBB;    }    // Now add the unswitched cases. 
We do this in reverse order as we built them diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp index a0fc966cee2c6..a7c308b598779 100644 --- a/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {    return false;  } +static unsigned ComputeSpeculationCost(const Instruction *I, +                                       const TargetTransformInfo &TTI) { +  switch (Operator::getOpcode(I)) { +    case Instruction::GetElementPtr: +    case Instruction::Add: +    case Instruction::Mul: +    case Instruction::And: +    case Instruction::Or: +    case Instruction::Select: +    case Instruction::Shl: +    case Instruction::Sub: +    case Instruction::LShr: +    case Instruction::AShr: +    case Instruction::Xor: +    case Instruction::ZExt: +    case Instruction::SExt: +    case Instruction::Call: +    case Instruction::BitCast: +    case Instruction::PtrToInt: +    case Instruction::IntToPtr: +    case Instruction::AddrSpaceCast: +    case Instruction::FPToUI: +    case Instruction::FPToSI: +    case Instruction::UIToFP: +    case Instruction::SIToFP: +    case Instruction::FPExt: +    case Instruction::FPTrunc: +    case Instruction::FAdd: +    case Instruction::FSub: +    case Instruction::FMul: +    case Instruction::FDiv: +    case Instruction::FRem: +    case Instruction::ICmp: +    case Instruction::FCmp: +      return TTI.getUserCost(I); + +    default: +      return UINT_MAX; // Disallow anything not whitelisted. +  } +} +  bool SpeculativeExecutionPass::considerHoistingFromTo(      BasicBlock &FromBlock, BasicBlock &ToBlock) {    SmallSet<const Instruction *, 8> NotHoisted; @@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(    unsigned TotalSpeculationCost = 0;    for (auto& I : FromBlock) { -    const unsigned Cost = TTI->getUserCost(&I); +    const unsigned Cost = ComputeSpeculationCost(&I, *TTI);      if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&          AllPrecedingUsesFromBlockHoisted(&I)) {        TotalSpeculationCost += Cost; diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 7ffdad597a9b8..83ec7f55d1afd 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,    computeKnownBits(V, Known, DL); -  if (Known.Zero.countLeadingOnes() >= HiBits) +  if (Known.countMinLeadingZeros() >= HiBits)      return VALRNG_KNOWN_SHORT; -  if (Known.One.countLeadingZeros() < HiBits) +  if (Known.countMaxLeadingZeros() < HiBits)      return VALRNG_LIKELY_LONG;    // Long integer divisions are often used in hashtable implementations. 
It's diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index d5124ac89016f..4aa26fd14fee3 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,                                    ValueToValueMapTy &VMap,                                    const Twine &NameSuffix, Function *F,                                    ClonedCodeInfo *CodeInfo) { +  DenseMap<const MDNode *, MDNode *> Cache;    BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);    if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); @@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,    for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();         II != IE; ++II) {      Instruction *NewInst = II->clone(); +    if (F && F->getSubprogram()) +      DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(), +                                  F->getSubprogram(), Cache);      if (II->hasName())        NewInst->setName(II->getName()+NameSuffix);      NewBB->getInstList().push_back(NewInst); @@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,    SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;    OldFunc->getAllMetadata(MDs); -  for (auto MD : MDs) -    NewFunc->addMetadata( -        MD.first, -        *MapMetadata(MD.second, VMap, -                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, -                     TypeMapper, Materializer)); +  for (auto MD : MDs) { +    MDNode *NewMD; +    bool MustCloneSP = +        (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() && +         OldFunc->getParent() == NewFunc->getParent()); +    if (MustCloneSP) { +      auto *SP = cast<DISubprogram>(MD.second); +      NewMD = DISubprogram::getDistinct( +          NewFunc->getContext(), SP->getScope(), SP->getName(), +          NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), +          SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), +          SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), +          SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), +          SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(), +          SP->getVariables(), SP->getThrownTypes()); +    } else +      NewMD = +          MapMetadata(MD.second, VMap, +                      ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, +                      TypeMapper, Materializer); +    NewFunc->addMetadata(MD.first, *NewMD); +  }    // Loop over all of the basic blocks in the function, cloning them as    // appropriate.  
Note that we save BE this way in order to handle cloning of diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 4e9d67252d6c5..5444b752de829 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(        else          GV = new GlobalVariable(              *New, I->getValueType(), false, GlobalValue::ExternalLinkage, -            (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, +            nullptr, I->getName(), nullptr,              I->getThreadLocalMode(), I->getType()->getAddressSpace());        VMap[&*I] = GV;        // We do not copy attributes (mainly because copying between different diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp index 8c2386554da56..78d7474e5b954 100644 --- a/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {    // Create a cleanup block.    LLVMContext &C = F.getContext();    BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F); -  Type *ExnTy = -      StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr); +  Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));    if (!F.hasPersonalityFn()) {      Constant *PersFn = getDefaultPersonalityFn(F.getParent());      F.setPersonalityFn(PersFn); diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 6d56e08af99f9..9cb4762b683c4 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {    return false;  } -/// Rebuild the entire inlined-at chain for this instruction so that the top of -/// the chain now is inlined-at the new call site. -static DebugLoc -updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, -                    LLVMContext &Ctx, -                    DenseMap<const DILocation *, DILocation *> &IANodes) { -  SmallVector<DILocation *, 3> InlinedAtLocations; -  DILocation *Last = InlinedAtNode; -  DILocation *CurInlinedAt = DL; - -  // Gather all the inlined-at nodes -  while (DILocation *IA = CurInlinedAt->getInlinedAt()) { -    // Skip any we've already built nodes for -    if (DILocation *Found = IANodes[IA]) { -      Last = Found; -      break; -    } - -    InlinedAtLocations.push_back(IA); -    CurInlinedAt = IA; -  } - -  // Starting from the top, rebuild the nodes to point to the new inlined-at -  // location (then rebuilding the rest of the chain behind it) and update the -  // map of already-constructed inlined-at nodes. -  for (const DILocation *MD : reverse(InlinedAtLocations)) { -    Last = IANodes[MD] = DILocation::getDistinct( -        Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); -  } - -  // And finally create the normal location for this instruction, referring to -  // the new inlined-at chain. -  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); -} -  /// Return the result of AI->isStaticAlloca() if AI were moved to the entry  /// block. Allocas used in inalloca calls and allocas of dynamic array size  /// cannot be static. 
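As an illustrative aside, not part of the patch itself: the InlineFunction.cpp hunks here drop the hand-written updateInlinedAtInfo and (in the following hunk) rebuild inlined-at chains through DebugLoc::appendInlinedAt with a node cache. Below is a small, self-contained sketch of the underlying idea -- rebuilding a chain so its outermost entry points at the new call site, while memoizing rebuilt nodes so instructions that share a chain also share the rebuilt copies. The struct and helper are hypothetical stand-ins, not LLVM types.

#include <deque>
#include <map>
#include <vector>

struct Loc {
  int Line;
  const Loc *InlinedAt; // next link in the inlined-at chain, or nullptr
};

// Rebuild DL's inlined-at chain so its outermost entry now points at NewTail,
// reusing nodes already rebuilt for other locations via Cache.
static const Loc *appendInlinedAt(const Loc &DL, const Loc *NewTail,
                                  std::deque<Loc> &Storage,
                                  std::map<const Loc *, const Loc *> &Cache) {
  // Walk the existing chain, stopping early at anything already rebuilt.
  std::vector<const Loc *> ChainToRebuild;
  const Loc *Last = NewTail;
  for (const Loc *IA = DL.InlinedAt; IA; IA = IA->InlinedAt) {
    auto Found = Cache.find(IA);
    if (Found != Cache.end()) {
      Last = Found->second;
      break;
    }
    ChainToRebuild.push_back(IA);
  }
  // Rebuild from the innermost gathered node outward so each new node can
  // point at the one built just before it, remembering every rebuilt node.
  for (auto It = ChainToRebuild.rbegin(); It != ChainToRebuild.rend(); ++It) {
    Storage.push_back(Loc{(*It)->Line, Last});
    Last = Cache[*It] = &Storage.back();
  }
  // Finally produce a copy of DL itself that refers to the rebuilt chain.
  Storage.push_back(Loc{DL.Line, Last});
  return &Storage.back();
}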
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,    // Cache the inlined-at nodes as they're built so they are reused, without    // this every instruction's inlined-at chain would become distinct from each    // other. -  DenseMap<const DILocation *, DILocation *> IANodes; +  DenseMap<const MDNode *, MDNode *> IANodes;    for (; FI != Fn->end(); ++FI) {      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();           BI != BE; ++BI) {        if (DebugLoc DL = BI->getDebugLoc()) { -        BI->setDebugLoc( -            updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); +        auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(), +                                            IANodes); +        auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA); +        BI->setDebugLoc(IDL);          continue;        } @@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,  /// Update the branch metadata for cloned call instructions.  static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,                                const Optional<uint64_t> &CalleeEntryCount, -                              const Instruction *TheCall) { +                              const Instruction *TheCall, +                              ProfileSummaryInfo *PSI) {    if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)      return;    Optional<uint64_t> CallSiteCount = -      ProfileSummaryInfo::getProfileCount(TheCall, nullptr); +      PSI ? PSI->getProfileCount(TheCall, nullptr) : None;    uint64_t CallCount =        std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,                 CalleeEntryCount.getValue()); @@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,  /// The callsite's block count is subtracted from the callee's function entry  /// count.  static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB, -                              Instruction *CallInst, Function *Callee) { +                              Instruction *CallInst, Function *Callee, +                              ProfileSummaryInfo *PSI) {    // If the callee has a original count of N, and the estimated count of    // callsite is M, the new callee count is set to N - M. M is estimated from    // the caller's entry count, its entry block frequency and the block frequency    // of the callsite.    Optional<uint64_t> CalleeCount = Callee->getEntryCount(); -  if (!CalleeCount.hasValue()) +  if (!CalleeCount.hasValue() || !PSI)      return; -  Optional<uint64_t> CallCount = -      ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI); +  Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);    if (!CallCount.hasValue())      return;    // Since CallSiteCount is an estimate, it could exceed the original callee @@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,        updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,                        CalledFunc->front()); -    updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall); +    updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, +                      IFI.PSI);      // Update the profile count of callee. 
-    updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc); +    updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);      // Inject byval arguments initialization.      for (std::pair<Value*, Value*> &Init : ByValInit) diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 8a1973d1db051..53b432fcafd4f 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -26,16 +26,15 @@ namespace {      InstNamer() : FunctionPass(ID) {        initializeInstNamerPass(*PassRegistry::getPassRegistry());      } -     +      void getAnalysisUsage(AnalysisUsage &Info) const override {        Info.setPreservesAll();      }      bool runOnFunction(Function &F) override { -      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); -           AI != AE; ++AI) -        if (!AI->hasName() && !AI->getType()->isVoidTy()) -          AI->setName("arg"); +      for (auto &Arg : F.args()) +        if (!Arg.hasName()) +          Arg.setName("arg");        for (BasicBlock &BB : F) {          if (!BB.hasName()) @@ -48,11 +47,11 @@ namespace {        return true;      }    }; -   +    char InstNamer::ID = 0;  } -INITIALIZE_PASS(InstNamer, "instnamer",  +INITIALIZE_PASS(InstNamer, "instnamer",                  "Assign names to anonymous instructions", false, false)  char &llvm::InstructionNamerID = InstNamer::ID;  //===----------------------------------------------------------------------===// diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index ce6b703f3528d..1ca509472b5fa 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,    KnownBits Known(BitWidth);    computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); -  unsigned TrailZ = Known.Zero.countTrailingOnes(); +  unsigned TrailZ = Known.countMinTrailingZeros();    // Avoid trouble with ridiculously large TrailZ values, such as    // those computed from a null pointer. @@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,  void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,                                             StoreInst *SI, DIBuilder &Builder) {    auto *DIVar = DDI->getVariable(); -  auto *DIExpr = DDI->getExpression();    assert(DIVar && "Missing variable"); +  auto *DIExpr = DDI->getExpression(); +  Value *DV = SI->getOperand(0);    // If an argument is zero extended then use argument directly. The ZExt    // may be zapped by an optimization pass in future. @@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,    if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))      ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));    if (ExtendedArg) { -    // We're now only describing a subset of the variable. The fragment we're -    // describing will always be smaller than the variable size, because -    // VariableSize == Size of Alloca described by DDI. Since SI stores -    // to the alloca described by DDI, if it's first operand is an extend, -    // we're guaranteed that before extension, the value was narrower than -    // the size of the alloca, hence the size of the described variable. -    SmallVector<uint64_t, 3> Ops; -    unsigned FragmentOffset = 0; -    // If this already is a bit fragment, we drop the bit fragment from the -    // expression and record the offset. 
-    auto Fragment = DIExpr->getFragmentInfo(); -    if (Fragment) { -      Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); -      FragmentOffset = Fragment->OffsetInBits; -    } else { -      Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); +    // If this DDI was already describing only a fragment of a variable, ensure +    // that fragment is appropriately narrowed here. +    // But if a fragment wasn't used, describe the value as the original +    // argument (rather than the zext or sext) so that it remains described even +    // if the sext/zext is optimized away. This widens the variable description, +    // leaving it up to the consumer to know how the smaller value may be +    // represented in a larger register. +    if (auto Fragment = DIExpr->getFragmentInfo()) { +      unsigned FragmentOffset = Fragment->OffsetInBits; +      SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(), +                                   DIExpr->elements_end() - 3); +      Ops.push_back(dwarf::DW_OP_LLVM_fragment); +      Ops.push_back(FragmentOffset); +      const DataLayout &DL = DDI->getModule()->getDataLayout(); +      Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); +      DIExpr = Builder.createExpression(Ops);      } -    Ops.push_back(dwarf::DW_OP_LLVM_fragment); -    Ops.push_back(FragmentOffset); -    const DataLayout &DL = DDI->getModule()->getDataLayout(); -    Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); -    auto NewDIExpr = Builder.createExpression(Ops); -    if (!LdStHasDebugValue(DIVar, NewDIExpr, SI)) -      Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr, -                                      DDI->getDebugLoc(), SI); -  } else if (!LdStHasDebugValue(DIVar, DIExpr, SI)) -    Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, -                                    DDI->getDebugLoc(), SI); +    DV = ExtendedArg; +  } +  if (!LdStHasDebugValue(DIVar, DIExpr, SI)) +    Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(), +                                    SI);  }  /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value @@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {    combineMetadata(K, J, KnownIDs);  } -unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, -                                        DominatorTree &DT, -                                        const BasicBlockEdge &Root) { +template <typename RootType, typename DominatesFn> +static unsigned replaceDominatedUsesWith(Value *From, Value *To, +                                         const RootType &Root, +                                         const DominatesFn &Dominates) {    assert(From->getType() == To->getType()); -   +    unsigned Count = 0;    for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); -       UI != UE; ) { +       UI != UE;) {      Use &U = *UI++; -    if (DT.dominates(Root, U)) { -      U.set(To); -      DEBUG(dbgs() << "Replace dominated use of '" -            << From->getName() << "' as " -            << *To << " in " << *U << "\n"); -      ++Count; -    } +    if (!Dominates(Root, U)) +      continue; +    U.set(To); +    DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " +                 << *To << " in " << *U << "\n"); +    ++Count;    }    return Count;  }  unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,                                          
DominatorTree &DT, -                                        const BasicBlock *BB) { -  assert(From->getType() == To->getType()); +                                        const BasicBlockEdge &Root) { +  auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) { +    return DT.dominates(Root, U); +  }; +  return ::replaceDominatedUsesWith(From, To, Root, Dominates); +} -  unsigned Count = 0; -  for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); -       UI != UE;) { -    Use &U = *UI++; -    auto *I = cast<Instruction>(U.getUser()); -    if (DT.properlyDominates(BB, I->getParent())) { -      U.set(To); -      DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " -                   << *To << " in " << *U << "\n"); -      ++Count; -    } -  } -  return Count; +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, +                                        DominatorTree &DT, +                                        const BasicBlock *BB) { +  auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) { +    auto *I = cast<Instruction>(U.getUser())->getParent(); +    return DT.properlyDominates(BB, I); +  }; +  return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);  }  bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 175d013a011d0..81f033e7d51a2 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -18,6 +18,7 @@  #include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {    else      return (FalseVal + (TrueVal / 2)) / TrueVal;  } + +/// \brief Adds a 'fast' flag to floating point operations. +static Value *addFastMathFlag(Value *V) { +  if (isa<FPMathOperator>(V)) { +    FastMathFlags Flags; +    Flags.setUnsafeAlgebra(); +    cast<Instruction>(V)->setFastMathFlags(Flags); +  } +  return V; +} + +// Helper to generate a log2 shuffle reduction. +Value * +llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, +                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, +                          ArrayRef<Value *> RedOps) { +  unsigned VF = Src->getType()->getVectorNumElements(); +  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles +  // and vector ops, reducing the set of values being computed by half each +  // round. +  assert(isPowerOf2_32(VF) && +         "Reduction emission only supported for pow2 vectors!"); +  Value *TmpVec = Src; +  SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); +  for (unsigned i = VF; i != 1; i >>= 1) { +    // Move the upper half of the vector to the lower half. +    for (unsigned j = 0; j != i / 2; ++j) +      ShuffleMask[j] = Builder.getInt32(i / 2 + j); + +    // Fill the rest of the mask with undef. 
+    std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), +              UndefValue::get(Builder.getInt32Ty())); + +    Value *Shuf = Builder.CreateShuffleVector( +        TmpVec, UndefValue::get(TmpVec->getType()), +        ConstantVector::get(ShuffleMask), "rdx.shuf"); + +    if (Op != Instruction::ICmp && Op != Instruction::FCmp) { +      // Floating point operations had to be 'fast' to enable the reduction. +      TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op, +                                                   TmpVec, Shuf, "bin.rdx")); +    } else { +      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && +             "Invalid min/max"); +      TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec, +                                                    Shuf); +    } +    if (!RedOps.empty()) +      propagateIRFlags(TmpVec, RedOps); +  } +  // The result is in the first element of the vector. +  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); +} + +/// Create a simple vector reduction specified by an opcode and some +/// flags (if generating min/max reductions). +Value *llvm::createSimpleTargetReduction( +    IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, +    Value *Src, TargetTransformInfo::ReductionFlags Flags, +    ArrayRef<Value *> RedOps) { +  assert(isa<VectorType>(Src->getType()) && "Type must be a vector"); + +  Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); +  std::function<Value*()> BuildFunc; +  using RD = RecurrenceDescriptor; +  RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; +  // TODO: Support creating ordered reductions. +  FastMathFlags FMFUnsafe; +  FMFUnsafe.setUnsafeAlgebra(); + +  switch (Opcode) { +  case Instruction::Add: +    BuildFunc = [&]() { return Builder.CreateAddReduce(Src); }; +    break; +  case Instruction::Mul: +    BuildFunc = [&]() { return Builder.CreateMulReduce(Src); }; +    break; +  case Instruction::And: +    BuildFunc = [&]() { return Builder.CreateAndReduce(Src); }; +    break; +  case Instruction::Or: +    BuildFunc = [&]() { return Builder.CreateOrReduce(Src); }; +    break; +  case Instruction::Xor: +    BuildFunc = [&]() { return Builder.CreateXorReduce(Src); }; +    break; +  case Instruction::FAdd: +    BuildFunc = [&]() { +      auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); +      cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe); +      return Rdx; +    }; +    break; +  case Instruction::FMul: +    BuildFunc = [&]() { +      auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); +      cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe); +      return Rdx; +    }; +    break; +  case Instruction::ICmp: +    if (Flags.IsMaxOp) { +      MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax; +      BuildFunc = [&]() { +        return Builder.CreateIntMaxReduce(Src, Flags.IsSigned); +      }; +    } else { +      MinMaxKind = Flags.IsSigned ? 
RD::MRK_SIntMin : RD::MRK_UIntMin; +      BuildFunc = [&]() { +        return Builder.CreateIntMinReduce(Src, Flags.IsSigned); +      }; +    } +    break; +  case Instruction::FCmp: +    if (Flags.IsMaxOp) { +      MinMaxKind = RD::MRK_FloatMax; +      BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); }; +    } else { +      MinMaxKind = RD::MRK_FloatMin; +      BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); }; +    } +    break; +  default: +    llvm_unreachable("Unhandled opcode"); +    break; +  } +  if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) +    return BuildFunc(); +  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); +} + +/// Create a vector reduction using a given recurrence descriptor. +Value *llvm::createTargetReduction(IRBuilder<> &Builder, +                                   const TargetTransformInfo *TTI, +                                   RecurrenceDescriptor &Desc, Value *Src, +                                   bool NoNaN) { +  // TODO: Support in-order reductions based on the recurrence descriptor. +  RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind(); +  TargetTransformInfo::ReductionFlags Flags; +  Flags.NoNaN = NoNaN; +  auto getSimpleRdx = [&](unsigned Opc) { +    return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags); +  }; +  switch (RecKind) { +  case RecurrenceDescriptor::RK_FloatAdd: +    return getSimpleRdx(Instruction::FAdd); +  case RecurrenceDescriptor::RK_FloatMult: +    return getSimpleRdx(Instruction::FMul); +  case RecurrenceDescriptor::RK_IntegerAdd: +    return getSimpleRdx(Instruction::Add); +  case RecurrenceDescriptor::RK_IntegerMult: +    return getSimpleRdx(Instruction::Mul); +  case RecurrenceDescriptor::RK_IntegerAnd: +    return getSimpleRdx(Instruction::And); +  case RecurrenceDescriptor::RK_IntegerOr: +    return getSimpleRdx(Instruction::Or); +  case RecurrenceDescriptor::RK_IntegerXor: +    return getSimpleRdx(Instruction::Xor); +  case RecurrenceDescriptor::RK_IntegerMinMax: { +    switch (Desc.getMinMaxRecurrenceKind()) { +    case RecurrenceDescriptor::MRK_SIntMax: +      Flags.IsSigned = true; +      Flags.IsMaxOp = true; +      break; +    case RecurrenceDescriptor::MRK_UIntMax: +      Flags.IsMaxOp = true; +      break; +    case RecurrenceDescriptor::MRK_SIntMin: +      Flags.IsSigned = true; +      break; +    case RecurrenceDescriptor::MRK_UIntMin: +      break; +    default: +      llvm_unreachable("Unhandled MRK"); +    } +    return getSimpleRdx(Instruction::ICmp); +  } +  case RecurrenceDescriptor::RK_FloatMinMax: { +    Flags.IsMaxOp = +        Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax; +    return getSimpleRdx(Instruction::FCmp); +  } +  default: +    llvm_unreachable("Unhandled RecKind"); +  } +} + +void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) { +  if (auto *VecOp = dyn_cast<Instruction>(I)) { +    if (auto *I0 = dyn_cast<Instruction>(VL[0])) { +      // VecOVp is initialized to the 0th scalar, so start counting from index +      // '1'. 
+      VecOp->copyIRFlags(I0); +      for (int i = 1, e = VL.size(); i < e; ++i) { +        if (auto *Scalar = dyn_cast<Instruction>(VL[i])) +          VecOp->andIRFlags(Scalar); +      } +    } +  } +} diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index 29d334f2968f1..2ef3d6336ae2b 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,      // Upgrade a 2-field global array type to the new 3-field format if needed.      if (Data && OldEltTy->getNumElements() < 3)        EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), -                              IRB.getInt8PtrTy(), nullptr); +                              IRB.getInt8PtrTy());      else        EltTy = OldEltTy;      if (Constant *Init = GVCtor->getInitializer()) { @@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,        for (unsigned i = 0; i != n; ++i) {          auto Ctor = cast<Constant>(Init->getOperand(i));          if (EltTy != OldEltTy) -          Ctor = ConstantStruct::get( -              EltTy, Ctor->getAggregateElement((unsigned)0), -              Ctor->getAggregateElement(1), -              Constant::getNullValue(IRB.getInt8PtrTy()), nullptr); +          Ctor = +              ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0), +                                  Ctor->getAggregateElement(1), +                                  Constant::getNullValue(IRB.getInt8PtrTy()));          CurrentCtors.push_back(Ctor);        }      } @@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,    } else {      // Use the new three-field struct if there isn't one already.      EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), -                            IRB.getInt8PtrTy(), nullptr); +                            IRB.getInt8PtrTy());    }    // Build a 2 or 3 field global_ctor entry.  We don't take a comdat key. diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 9e71d746de349..1de579ed41b09 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,      // x86_64 can't use {float, float} since that would be returned in both      // xmm0 and xmm1, which isn't what a real struct would do.      ResTy = T.getArch() == Triple::x86_64 -    ? static_cast<Type *>(VectorType::get(ArgTy, 2)) -    : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr)); +                ? 
static_cast<Type *>(VectorType::get(ArgTy, 2)) +                : static_cast<Type *>(StructType::get(ArgTy, ArgTy));    } else {      Name = "__sincospi_stret"; -    ResTy = StructType::get(ArgTy, ArgTy, nullptr); +    ResTy = StructType::get(ArgTy, ArgTy);    }    Module *M = OrigCallee->getParent(); diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp index 83bd29dbca651..60d9ede2c4871 100644 --- a/lib/Transforms/Utils/VNCoercion.cpp +++ b/lib/Transforms/Utils/VNCoercion.cpp @@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,                                       const DataLayout &DL) {    LLVMContext &Ctx = SrcVal->getType()->getContext(); +  // If two pointers are in the same address space, they have the same size, +  // so we don't need to do any truncation, etc. This avoids introducing +  // ptrtoint instructions for pointers that may be non-integral. +  if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() && +      cast<PointerType>(SrcVal->getType())->getAddressSpace() == +          cast<PointerType>(LoadTy)->getAddressSpace()) { +    return SrcVal; +  } +    uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;    uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;    // Compute which bits of the stored value are being used by the load.  Convert diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 84d89f103a2fd..930972924c3c0 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,      Constant *NewV;      if (IsOldCtorDtor) {        auto *S = cast<ConstantStruct>(V); -      auto *E1 = mapValue(S->getOperand(0)); -      auto *E2 = mapValue(S->getOperand(1)); -      Value *Null = Constant::getNullValue(VoidPtrTy); -      NewV = -          ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr); +      auto *E1 = cast<Constant>(mapValue(S->getOperand(0))); +      auto *E2 = cast<Constant>(mapValue(S->getOperand(1))); +      Constant *Null = Constant::getNullValue(VoidPtrTy); +      NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);      } else {        NewV = cast_or_null<Constant>(mapValue(V));      } diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 97dcb40a1d721..9cf66382b5817 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {    if (!Safe) {      KnownBits Known(BitWidth);      computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); -    if (Known.Zero.countTrailingZeros() < (BitWidth - 1)) +    if (Known.countMaxTrailingOnes() < (BitWidth - 1))        Safe = true;    } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a4539623..516ab7d03a88e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -391,13 +391,14 @@ public:          TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),          AddedSafetyChecks(false) {} -  // Perform the actual loop widening (vectorization). -  void vectorize() { -    // Create a new empty loop. Unlink the old loop and connect the new one. 
-    createEmptyLoop(); -    // Widen each instruction in the old loop to a new one in the new loop. -    vectorizeLoop(); -  } +  /// Create a new empty loop. Unlink the old loop and connect the new one. +  void createVectorizedLoopSkeleton(); + +  /// Vectorize a single instruction within the innermost loop. +  void vectorizeInstruction(Instruction &I); + +  /// Fix the vectorized code, taking care of header phi's, live-outs, and more. +  void fixVectorizedLoop();    // Return true if any runtime check is added.    bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -425,9 +426,6 @@ protected:        EdgeMaskCacheTy;    typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy; -  /// Create an empty loop, based on the loop ranges of the old loop. -  void createEmptyLoop(); -    /// Set up the values of the IVs correctly when exiting the vector loop.    void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,                      Value *CountRoundDown, Value *EndValue, @@ -436,8 +434,6 @@ protected:    /// Create a new induction variable inside L.    PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,                                     Value *Step, Instruction *DL); -  /// Copy and widen the instructions from the old loop. -  virtual void vectorizeLoop();    /// Handle all cross-iteration phis in the header.    void fixCrossIterationPHIs(); @@ -450,10 +446,10 @@ protected:    /// vectorizing this phi node.    void fixReduction(PHINode *Phi); -  /// \brief The Loop exit block may have single value PHI nodes where the -  /// incoming value is 'Undef'. While vectorizing we only handled real values -  /// that were defined inside the loop. Here we fix the 'undef case'. -  /// See PR14725. +  /// \brief The Loop exit block may have single value PHI nodes with some +  /// incoming value. While vectorizing we only handled real values +  /// that were defined inside the loop and we should have one value for +  /// each predecessor of its parent basic block. See PR14725.    void fixLCSSAPHIs();    /// Iteratively sink the scalarized operands of a predicated instruction into @@ -464,11 +460,6 @@ protected:    /// respective conditions.    void predicateInstructions(); -  /// Collect the instructions from the original loop that would be trivially -  /// dead in the vectorized loop if generated. -  void collectTriviallyDeadInstructions( -      SmallPtrSetImpl<Instruction *> &DeadInstructions); -    /// Shrinks vector element sizes to the smallest bitwidth they can be legally    /// represented as.    void truncateToMinimalBitwidths(); @@ -481,10 +472,6 @@ protected:    /// and DST.    VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); -  /// A helper function to vectorize a single instruction within the innermost -  /// loop. -  void vectorizeInstruction(Instruction &I); -    /// Vectorize a single PHINode in a block. This method handles the induction    /// variable canonicalization. It supports both VF = 1 for unrolled loops and    /// arbitrary length vectors. @@ -1700,6 +1687,9 @@ public:    /// access that can be widened.    bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); +  // Returns true if the NoNaN attribute is set on the function. +  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } +  private:    /// Check if a single basic block loop is vectorizable.    /// At this point we know that this is a loop with a constant trip count @@ -2185,7 +2175,10 @@ public:  /// passed Legality checks.  
class LoopVectorizationPlanner {  public: -  LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} +  LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, +                           LoopVectorizationLegality *Legal, +                           LoopVectorizationCostModel &CM) +      : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}    ~LoopVectorizationPlanner() {} @@ -2193,7 +2186,25 @@ public:    LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,                                                         unsigned UserVF); +  /// Generate the IR code for the vectorized loop. +  void executePlan(InnerLoopVectorizer &ILV); + +protected: +  /// Collect the instructions from the original loop that would be trivially +  /// dead in the vectorized loop if generated. +  void collectTriviallyDeadInstructions( +      SmallPtrSetImpl<Instruction *> &DeadInstructions); +  private: +  /// The loop that we evaluate. +  Loop *OrigLoop; + +  /// Loop Info analysis. +  LoopInfo *LI; + +  /// The legality analysis. +  LoopVectorizationLegality *Legal; +    /// The profitablity analysis.    LoopVectorizationCostModel &CM;  }; @@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {    LVer->prepareNoAliasMetadata();  } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() {    /*     In this function we generate a new loop. The new loop will contain     the vectorized instructions while the old loop will continue to run the @@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {    }  } -void InnerLoopVectorizer::vectorizeLoop() { -  //===------------------------------------------------===// -  // -  // Notice: any optimization or new instruction that go -  // into the code below should be also be implemented in -  // the cost-model. -  // -  //===------------------------------------------------===// - -  // Collect instructions from the original loop that will become trivially dead -  // in the vectorized loop. We don't need to vectorize these instructions. For -  // example, original induction update instructions can become dead because we -  // separately emit induction "steps" when generating code for the new loop. -  // Similarly, we create a new latch condition when setting up the structure -  // of the new loop, so the old one can become dead. -  SmallPtrSet<Instruction *, 4> DeadInstructions; -  collectTriviallyDeadInstructions(DeadInstructions); - -  // Scan the loop in a topological order to ensure that defs are vectorized -  // before users. -  LoopBlocksDFS DFS(OrigLoop); -  DFS.perform(LI); - -  // Vectorize all instructions in the original loop that will not become -  // trivially dead when vectorized. -  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) -    for (Instruction &I : *BB) -      if (!DeadInstructions.count(&I)) -        vectorizeInstruction(I); - +void InnerLoopVectorizer::fixVectorizedLoop() {    // Insert truncates and extends for any truncated instructions as hints to    // InstCombine.    if (VF > 1) @@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {    // Set the insertion point after the previous value if it is an instruction.    // Note that the previous value may have been constant-folded so it is not -  // guaranteed to be an instruction in the vector loop. 
-  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) +  // guaranteed to be an instruction in the vector loop. Also, if the previous +  // value is a phi node, we should insert after all the phi nodes to avoid +  // breaking basic block verification. +  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || +      isa<PHINode>(PreviousParts[UF - 1]))      Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());    else      Builder.SetInsertPoint( @@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {    }    if (VF > 1) { -    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles -    // and vector ops, reducing the set of values being computed by half each -    // round. -    assert(isPowerOf2_32(VF) && -           "Reduction emission only supported for pow2 vectors!"); -    Value *TmpVec = ReducedPartRdx; -    SmallVector<Constant *, 32> ShuffleMask(VF, nullptr); -    for (unsigned i = VF; i != 1; i >>= 1) { -      // Move the upper half of the vector to the lower half. -      for (unsigned j = 0; j != i / 2; ++j) -        ShuffleMask[j] = Builder.getInt32(i / 2 + j); - -      // Fill the rest of the mask with undef. -      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -                UndefValue::get(Builder.getInt32Ty())); - -      Value *Shuf = Builder.CreateShuffleVector( -          TmpVec, UndefValue::get(TmpVec->getType()), -          ConstantVector::get(ShuffleMask), "rdx.shuf"); - -      if (Op != Instruction::ICmp && Op != Instruction::FCmp) -        // Floating point operations had to be 'fast' to enable the reduction. -        TmpVec = addFastMathFlag(Builder.CreateBinOp( -                                     (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); -      else -        TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, -                                                      TmpVec, Shuf); -    } - -    // The result is in the first element of the vector. +    bool NoNaN = Legal->hasFunNoNaNAttr();      ReducedPartRdx = -      Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - +        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);      // If the reduction can be performed in a smaller type, we need to extend      // the reduction to the wider type before we branch to the original loop.      if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {      auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);      if (!LCSSAPhi)        break; -    if (LCSSAPhi->getNumIncomingValues() == 1) -      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), -                            LoopMiddleBlock); -  } -} - -void InnerLoopVectorizer::collectTriviallyDeadInstructions( -    SmallPtrSetImpl<Instruction *> &DeadInstructions) { -  BasicBlock *Latch = OrigLoop->getLoopLatch(); - -  // We create new control-flow for the vectorized loop, so the original -  // condition will be dead after vectorization if it's only used by the -  // branch. -  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); -  if (Cmp && Cmp->hasOneUse()) -    DeadInstructions.insert(Cmp); - -  // We create new "steps" for induction variable updates to which the original -  // induction variables map. An original update instruction will be dead if -  // all its users except the induction variable are dead. 
-  for (auto &Induction : *Legal->getInductionVars()) { -    PHINode *Ind = Induction.first; -    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); -    if (all_of(IndUpdate->users(), [&](User *U) -> bool { -          return U == Ind || DeadInstructions.count(cast<Instruction>(U)); -        })) -      DeadInstructions.insert(IndUpdate); +    if (LCSSAPhi->getNumIncomingValues() == 1) { +      assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && +             "Incoming value isn't loop invariant"); +      LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); +    }    }  } @@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {    return CM.selectVectorizationFactor(MaxVF);  } +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { +  // Perform the actual loop transformation. + +  // 1. Create a new empty loop. Unlink the old loop and connect the new one. +  ILV.createVectorizedLoopSkeleton(); + +  //===------------------------------------------------===// +  // +  // Notice: any optimization or new instruction that go +  // into the code below should also be implemented in +  // the cost-model. +  // +  //===------------------------------------------------===// + +  // 2. Copy and widen instructions from the old loop into the new loop. + +  // Collect instructions from the original loop that will become trivially dead +  // in the vectorized loop. We don't need to vectorize these instructions. For +  // example, original induction update instructions can become dead because we +  // separately emit induction "steps" when generating code for the new loop. +  // Similarly, we create a new latch condition when setting up the structure +  // of the new loop, so the old one can become dead. +  SmallPtrSet<Instruction *, 4> DeadInstructions; +  collectTriviallyDeadInstructions(DeadInstructions); + +  // Scan the loop in a topological order to ensure that defs are vectorized +  // before users. +  LoopBlocksDFS DFS(OrigLoop); +  DFS.perform(LI); + +  // Vectorize all instructions in the original loop that will not become +  // trivially dead when vectorized. +  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) +    for (Instruction &I : *BB) +      if (!DeadInstructions.count(&I)) +        ILV.vectorizeInstruction(I); + +  // 3. Fix the vectorized code: take care of header phi's, live-outs, +  //    predication, updating analyses. +  ILV.fixVectorizedLoop(); +} + +void LoopVectorizationPlanner::collectTriviallyDeadInstructions( +    SmallPtrSetImpl<Instruction *> &DeadInstructions) { +  BasicBlock *Latch = OrigLoop->getLoopLatch(); + +  // We create new control-flow for the vectorized loop, so the original +  // condition will be dead after vectorization if it's only used by the +  // branch. +  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); +  if (Cmp && Cmp->hasOneUse()) +    DeadInstructions.insert(Cmp); + +  // We create new "steps" for induction variable updates to which the original +  // induction variables map. An original update instruction will be dead if +  // all its users except the induction variable are dead. 
+  for (auto &Induction : *Legal->getInductionVars()) { +    PHINode *Ind = Induction.first; +    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); +    if (all_of(IndUpdate->users(), [&](User *U) -> bool { +          return U == Ind || DeadInstructions.count(cast<Instruction>(U)); +        })) +      DeadInstructions.insert(IndUpdate); +  } +} +  void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {    auto *SI = dyn_cast<StoreInst>(Instr);    bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); @@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {    CM.collectValuesToIgnore();    // Use the planner for vectorization. -  LoopVectorizationPlanner LVP(CM); +  LoopVectorizationPlanner LVP(L, LI, &LVL, CM);    // Get user vectorization factor.    unsigned UserVF = Hints.getWidth(); @@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {      // interleave it.      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,                                 &CM); -    Unroller.vectorize(); +    LVP.executePlan(Unroller);      ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),                                   L->getHeader()) @@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {      // If we decided that it is *legal* to vectorize the loop, then do it.      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,                             &LVL, &CM); -    LB.vectorize(); +    LVP.executePlan(LB);      ++LoopsVectorized;      // Add metadata to disable runtime unrolling a scalar loop when there are diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index f112c555205c3..64013d6d687df 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -40,7 +40,9 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/KnownBits.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Vectorize.h"  #include <algorithm>  #include <memory> @@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {    return Opcode;  } -/// Get the intersection (logical and) of all of the potential IR flags -/// of each scalar operation (VL) that will be converted into a vector (I). -/// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { -  if (auto *VecOp = dyn_cast<Instruction>(I)) { -    if (auto *I0 = dyn_cast<Instruction>(VL[0])) { -      // VecOVp is initialized to the 0th scalar, so start counting from index -      // '1'. -      VecOp->copyIRFlags(I0); -      for (int i = 1, e = VL.size(); i < e; ++i) { -        if (auto *Scalar = dyn_cast<Instruction>(VL[i])) -          VecOp->andIRFlags(Scalar); -      } -    } -  } -} -  /// \returns true if all of the values in \p VL have the same type or false  /// otherwise.  
static bool allSameType(ArrayRef<Value *> VL) { @@ -315,10 +300,10 @@ public:    BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,            TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,            DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, -          const DataLayout *DL) +          const DataLayout *DL, OptimizationRemarkEmitter *ORE)        : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),          SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), -        DL(DL), Builder(Se->getContext()) { +        DL(DL), ORE(ORE), Builder(Se->getContext()) {      CodeMetrics::collectEphemeralValues(F, AC, EphValues);      // Use the vector register size specified by the target unless overridden      // by a command-line option. @@ -331,7 +316,10 @@ public:      else        MaxVecRegSize = TTI->getRegisterBitWidth(true); -    MinVecRegSize = MinVectorRegSizeOption; +    if (MinVectorRegSizeOption.getNumOccurrences()) +      MinVecRegSize = MinVectorRegSizeOption; +    else +      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();    }    /// \brief Vectorize the tree that starts with the elements in \p VL. @@ -377,6 +365,8 @@ public:      MinBWs.clear();    } +  unsigned getTreeSize() const { return VectorizableTree.size(); } +    /// \brief Perform LICM and CSE on the newly generated gather sequences.    void optimizeGatherSequence(); @@ -415,6 +405,8 @@ public:    /// vectorizable. We do not vectorize such trees.    bool isTreeTinyAndNotFullyVectorizable(); +  OptimizationRemarkEmitter *getORE() { return ORE; } +  private:    struct TreeEntry; @@ -944,6 +936,8 @@ private:    AssumptionCache *AC;    DemandedBits *DB;    const DataLayout *DL; +  OptimizationRemarkEmitter *ORE; +    unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.    unsigned MinVecRegSize; // Set by cl::opt (default: 128).    /// Instruction builder to construct the vectorized tree. @@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {            CInt->getValue().isPowerOf2())          Op2VP = TargetTransformInfo::OP_PowerOf2; -      int ScalarCost = VecTy->getNumElements() * -                       TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, -                                                   Op2VK, Op1VP, Op2VP); +      SmallVector<const Value *, 4> Operands(VL0->operand_values()); +      int ScalarCost = +          VecTy->getNumElements() * +          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, +                                      Op2VP, Operands);        int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, -                                                Op1VP, Op2VP); +                                                Op1VP, Op2VP, Operands);        return VecCost - ScalarCost;      }      case Instruction::GetElementPtr: { @@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {      // Determine if the sign bit of all the roots is known to be zero. If not,      // IsKnownPositive is set to False.      
IsKnownPositive = all_of(TreeRoot, [&](Value *R) { -      bool KnownZero = false; -      bool KnownOne = false; -      ComputeSignBit(R, KnownZero, KnownOne, *DL); -      return KnownZero; +      KnownBits Known = computeKnownBits(R, *DL); +      return Known.isNonNegative();      });      // Determine the maximum number of bits required to store the scalar @@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {      auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();      auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);      auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); +    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); -    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); +    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);    }    void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {      AU.addRequired<LoopInfoWrapperPass>();      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<DemandedBitsWrapperPass>(); +    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();      AU.addPreserved<LoopInfoWrapperPass>();      AU.addPreserved<DominatorTreeWrapperPass>();      AU.addPreserved<AAResultsWrapperPass>(); @@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A    auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);    auto *AC = &AM.getResult<AssumptionAnalysis>(F);    auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); +  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); -  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); +  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);    if (!Changed)      return PreservedAnalyses::all(); @@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,                                  TargetTransformInfo *TTI_,                                  TargetLibraryInfo *TLI_, AliasAnalysis *AA_,                                  LoopInfo *LI_, DominatorTree *DT_, -                                AssumptionCache *AC_, DemandedBits *DB_) { +                                AssumptionCache *AC_, DemandedBits *DB_, +                                OptimizationRemarkEmitter *ORE_) {    SE = SE_;    TTI = TTI_;    TLI = TLI_; @@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,    // Use the bottom up slp vectorizer to construct chains that start with    // store instructions. -  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); +  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);    // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to    // delete instructions. @@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,      DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");      if (Cost < -SLPCostThreshold) {        DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); +      using namespace ore; +      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", +                                          cast<StoreInst>(Chain[i])) +                       << "Stores SLP vectorized with cost " << NV("Cost", Cost) +                       << " and with tree size " +                       << NV("TreeSize", R.getTreeSize())); +        R.vectorizeTree();        // Move to the next bundle. 
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,        if (Cost < -SLPCostThreshold) {          DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); +        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", +                                            cast<Instruction>(Ops[0])) +                         << "SLP vectorized with cost " << ore::NV("Cost", Cost) +                         << " and with tree size " +                         << ore::NV("TreeSize", R.getTreeSize())); +          Value *VectorizedRoot = R.vectorizeTree();          // Reconstruct the build vector by extracting the vectorized root. This @@ -4506,6 +4517,12 @@ public:        DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost                     << ". (HorRdx)\n"); +      auto *I0 = cast<Instruction>(VL[0]); +      V.getORE()->emit( +          OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) +          << "Vectorized horizontal reduction with cost " +          << ore::NV("Cost", Cost) << " and with tree size " +          << ore::NV("TreeSize", V.getTreeSize()));        // Vectorize a tree.        DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); @@ -4513,7 +4530,7 @@ public:        // Emit a reduction.        Value *ReducedSubTree = -          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); +          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);        if (VectorizedTree) {          Builder.SetCurrentDebugLocation(Loc);          VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, @@ -4583,33 +4600,31 @@ private:    /// \brief Emit a horizontal reduction of the vectorized value.    Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, -                       unsigned ReduxWidth, ArrayRef<Value *> RedOps) { +                       unsigned ReduxWidth, ArrayRef<Value *> RedOps, +                       const TargetTransformInfo *TTI) {      assert(VectorizedValue && "Need to have a vectorized tree node");      assert(isPowerOf2_32(ReduxWidth) &&             "We only handle power-of-two reductions for now"); +    if (!IsPairwiseReduction) +      return createSimpleTargetReduction( +          Builder, TTI, ReductionOpcode, VectorizedValue, +          TargetTransformInfo::ReductionFlags(), RedOps); +      Value *TmpVec = VectorizedValue;      for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { -      if (IsPairwiseReduction) { -        Value *LeftMask = +      Value *LeftMask =            createRdxShuffleMask(ReduxWidth, i, true, true, Builder); -        Value *RightMask = +      Value *RightMask =            createRdxShuffleMask(ReduxWidth, i, true, false, Builder); -        Value *LeftShuf = Builder.CreateShuffleVector( +      Value *LeftShuf = Builder.CreateShuffleVector(            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); -        Value *RightShuf = Builder.CreateShuffleVector( +      Value *RightShuf = Builder.CreateShuffleVector(            TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),            "rdx.shuf.r"); -        TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, -                                     "bin.rdx"); -      } else { -        Value *UpperHalf = -          createRdxShuffleMask(ReduxWidth, i, false, false, Builder); -        Value *Shuf = Builder.CreateShuffleVector( -          TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); 
-        TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); -      } +      TmpVec = +          Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");        propagateIRFlags(TmpVec, RedOps);      } @@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)  INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)  namespace llvm {
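A note on the LoadStoreVectorizer hunk above: KnownBits::countMaxTrailingOnes() reports how many trailing bits of the value could still be one, which is the same quantity the old Known.Zero.countTrailingZeros() spelled out by hand, so that hunk reads as a pure readability cleanup rather than a functional change. A minimal standalone sketch of the equivalence (illustration only, not part of the patch; it assumes nothing beyond llvm/Support/KnownBits.h, which the patch itself already pulls into SLPVectorizer.cpp):

  #include "llvm/Support/KnownBits.h"
  #include <cassert>

  using namespace llvm;

  // With bit 2 known to be zero and bits 0-1 unknown, at most two trailing
  // ones are possible, and both spellings agree on that bound.
  static void knownBitsTrailingOnesExample() {
    KnownBits Known(8);      // 8-bit value, nothing known yet.
    Known.Zero.setBit(2);    // Mark bit 2 as known zero.
    assert(Known.countMaxTrailingOnes() == 2);
    assert(Known.Zero.countTrailingZeros() == Known.countMaxTrailingOnes());
  }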
