Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r--   contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp | 591
1 file changed, 454 insertions(+), 137 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp index b2bc75c19709..77ce3d2fb563 100644 --- a/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -46,6 +46,7 @@  #include "llvm/IR/Constant.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h" @@ -377,6 +378,7 @@ class TypePromotionTransaction;      }      void removeAllAssertingVHReferences(Value *V); +    bool eliminateAssumptions(Function &F);      bool eliminateFallThrough(Function &F);      bool eliminateMostlyEmptyBlocks(Function &F);      BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB); @@ -404,6 +406,7 @@ class TypePromotionTransaction;      bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);      bool fixupDbgValue(Instruction *I);      bool placeDbgValues(Function &F); +    bool placePseudoProbes(Function &F);      bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,                        LoadInst *&LI, Instruction *&Inst, bool HasPromoted);      bool tryToPromoteExts(TypePromotionTransaction &TPT, @@ -506,6 +509,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {      }    } +  // Get rid of @llvm.assume builtins before attempting to eliminate empty +  // blocks, since there might be blocks that only contain @llvm.assume calls +  // (plus arguments that we can get rid of). +  EverMadeChange |= eliminateAssumptions(F); +    // Eliminate blocks that contain only PHI nodes and an    // unconditional branch.    EverMadeChange |= eliminateMostlyEmptyBlocks(F); @@ -566,10 +574,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {        MadeChange |= ConstantFoldTerminator(&BB, true);        if (!MadeChange) continue; -      for (SmallVectorImpl<BasicBlock*>::iterator -             II = Successors.begin(), IE = Successors.end(); II != IE; ++II) -        if (pred_empty(*II)) -          WorkList.insert(*II); +      for (BasicBlock *Succ : Successors) +        if (pred_empty(Succ)) +          WorkList.insert(Succ);      }      // Delete the dead blocks and any of their dead successors. @@ -580,10 +587,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {        DeleteDeadBlock(BB); -      for (SmallVectorImpl<BasicBlock*>::iterator -             II = Successors.begin(), IE = Successors.end(); II != IE; ++II) -        if (pred_empty(*II)) -          WorkList.insert(*II); +      for (BasicBlock *Succ : Successors) +        if (pred_empty(Succ)) +          WorkList.insert(Succ);      }      // Merge pairs of basic blocks with unconditional branches, connected by @@ -607,6 +613,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    // Do this last to clean up use-before-def scenarios introduced by other    // preparatory transforms.    
EverMadeChange |= placeDbgValues(F); +  EverMadeChange |= placePseudoProbes(F);  #ifndef NDEBUG    if (VerifyBFIUpdates) @@ -616,6 +623,26 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    return EverMadeChange;  } +bool CodeGenPrepare::eliminateAssumptions(Function &F) { +  bool MadeChange = false; +  for (BasicBlock &BB : F) { +    CurInstIterator = BB.begin(); +    while (CurInstIterator != BB.end()) { +      Instruction *I = &*(CurInstIterator++); +      if (auto *Assume = dyn_cast<AssumeInst>(I)) { +        MadeChange = true; +        Value *Operand = Assume->getOperand(0); +        Assume->eraseFromParent(); + +        resetIteratorIfInvalidatedWhileCalling(&BB, [&]() { +          RecursivelyDeleteTriviallyDeadInstructions(Operand, TLInfo, nullptr); +        }); +      } +    } +  } +  return MadeChange; +} +  /// An instruction is about to be deleted, so remove all references to it in our  /// GEP-tracking data strcutures.  void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) { @@ -780,8 +807,8 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,    // Skip merging if the block's successor is also a successor to any callbr    // that leads to this block.    // FIXME: Is this really needed? Is this a correctness issue? -  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { -    if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator())) +  for (BasicBlock *Pred : predecessors(BB)) { +    if (auto *CBI = dyn_cast<CallBrInst>((Pred)->getTerminator()))        for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)          if (DestBB == CBI->getSuccessor(i))            return false; @@ -822,9 +849,7 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,    // Find all other incoming blocks from which incoming values of all PHIs in    // DestBB are the same as the ones from BB. -  for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E; -       ++PI) { -    BasicBlock *DestBBPred = *PI; +  for (BasicBlock *DestBBPred : predecessors(DestBB)) {      if (DestBBPred == BB)        continue; @@ -964,8 +989,8 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {          for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)            PN.addIncoming(InVal, BBPN->getIncomingBlock(i));        } else { -        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) -          PN.addIncoming(InVal, *PI); +        for (BasicBlock *Pred : predecessors(BB)) +          PN.addIncoming(InVal, Pred);        }      }    } @@ -1280,11 +1305,83 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,    return SinkCast(CI);  } +// Match a simple increment by constant operation.  Note that if a sub is +// matched, the step is negated (as if the step had been canonicalized to +// an add, even though we leave the instruction alone.) 
+bool matchIncrement(const Instruction* IVInc, Instruction *&LHS, +                    Constant *&Step) { +  if (match(IVInc, m_Add(m_Instruction(LHS), m_Constant(Step))) || +      match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::uadd_with_overflow>( +                       m_Instruction(LHS), m_Constant(Step))))) +    return true; +  if (match(IVInc, m_Sub(m_Instruction(LHS), m_Constant(Step))) || +      match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>( +                       m_Instruction(LHS), m_Constant(Step))))) { +    Step = ConstantExpr::getNeg(Step); +    return true; +  } +  return false; +} + +/// If given \p PN is an inductive variable with value IVInc coming from the +/// backedge, and on each iteration it gets increased by Step, return pair +/// <IVInc, Step>. Otherwise, return None. +static Optional<std::pair<Instruction *, Constant *> > +getIVIncrement(const PHINode *PN, const LoopInfo *LI) { +  const Loop *L = LI->getLoopFor(PN->getParent()); +  if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch()) +    return None; +  auto *IVInc = +      dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch())); +  if (!IVInc || LI->getLoopFor(IVInc->getParent()) != L) +    return None; +  Instruction *LHS = nullptr; +  Constant *Step = nullptr; +  if (matchIncrement(IVInc, LHS, Step) && LHS == PN) +    return std::make_pair(IVInc, Step); +  return None; +} + +static bool isIVIncrement(const Value *V, const LoopInfo *LI) { +  auto *I = dyn_cast<Instruction>(V); +  if (!I) +    return false; +  Instruction *LHS = nullptr; +  Constant *Step = nullptr; +  if (!matchIncrement(I, LHS, Step)) +    return false; +  if (auto *PN = dyn_cast<PHINode>(LHS)) +    if (auto IVInc = getIVIncrement(PN, LI)) +      return IVInc->first == I; +  return false; +} +  bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,                                                   Value *Arg0, Value *Arg1,                                                   CmpInst *Cmp,                                                   Intrinsic::ID IID) { -  if (BO->getParent() != Cmp->getParent()) { +  auto IsReplacableIVIncrement = [this, &Cmp](BinaryOperator *BO) { +    if (!isIVIncrement(BO, LI)) +      return false; +    const Loop *L = LI->getLoopFor(BO->getParent()); +    assert(L && "L should not be null after isIVIncrement()"); +    // Do not risk on moving increment into a child loop. +    if (LI->getLoopFor(Cmp->getParent()) != L) +      return false; + +    // Finally, we need to ensure that the insert point will dominate all +    // existing uses of the increment. + +    auto &DT = getDT(*BO->getParent()->getParent()); +    if (DT.dominates(Cmp->getParent(), BO->getParent())) +      // If we're moving up the dom tree, all uses are trivially dominated. +      // (This is the common case for code produced by LSR.) +      return true; + +    // Otherwise, special case the single use in the phi recurrence. +    return BO->hasOneUse() && DT.dominates(Cmp->getParent(), L->getLoopLatch()); +  }; +  if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) {      // We used to use a dominator tree here to allow multi-block optimization.      // But that was problematic because:      // 1. It could cause a perf regression by hoisting the math op into the @@ -1295,6 +1392,14 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,      //    This is because we recompute the DT on every change in the main CGP      //    run-loop. 
The recomputing is probably unnecessary in many cases, so if      //    that was fixed, using a DT here would be ok. +    // +    // There is one important particular case we still want to handle: if BO is +    // the IV increment. Important properties that make it profitable: +    // - We can speculate IV increment anywhere in the loop (as long as the +    //   indvar Phi is its only user); +    // - Upon computing Cmp, we effectively compute something equivalent to the +    //   IV increment (despite it loops differently in the IR). So moving it up +    //   to the cmp point does not really increase register pressure.      return false;    } @@ -1936,6 +2041,10 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,    if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())      return false; +  // Bail if the value is never zero. +  if (llvm::isKnownNonZero(CountZeros->getOperand(0), *DL)) +    return false; +    // The intrinsic will be sunk behind a compare against zero and branch.    BasicBlock *StartBlock = CountZeros->getParent();    BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); @@ -2061,18 +2170,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {    if (II) {      switch (II->getIntrinsicID()) {      default: break; -    case Intrinsic::assume: { -      Value *Operand = II->getOperand(0); -      II->eraseFromParent(); -      // Prune the operand, it's most likely dead. -      resetIteratorIfInvalidatedWhileCalling(BB, [&]() { -        RecursivelyDeleteTriviallyDeadInstructions( -            Operand, TLInfo, nullptr, -            [&](Value *V) { removeAllAssertingVHReferences(V); }); -      }); -      return true; -    } - +    case Intrinsic::assume: +      llvm_unreachable("llvm.assume should have been removed already");      case Intrinsic::experimental_widenable_condition: {        // Give up on future widening oppurtunties so that we can fold away dead        // paths and merge blocks before going into block-local instruction @@ -2242,21 +2341,25 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT    if (PN && PN->getParent() != BB)      return false; -  // Make sure there are no instructions between the PHI and return, or that the -  // return is the first instruction in the block. -  if (PN) { -    BasicBlock::iterator BI = BB->begin(); -    // Skip over debug and the bitcast. -    do { -      ++BI; -    } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI || -             isa<PseudoProbeInst>(BI)); -    if (&*BI != RetI) -      return false; -  } else { -    if (BB->getFirstNonPHIOrDbg(true) != RetI) -      return false; -  } +  auto isLifetimeEndOrBitCastFor = [](const Instruction *Inst) { +    const BitCastInst *BC = dyn_cast<BitCastInst>(Inst); +    if (BC && BC->hasOneUse()) +      Inst = BC->user_back(); + +    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) +      return II->getIntrinsicID() == Intrinsic::lifetime_end; +    return false; +  }; + +  // Make sure there are no instructions between the first instruction +  // and return. +  const Instruction *BI = BB->getFirstNonPHI(); +  // Skip over debug and the bitcast. +  while (isa<DbgInfoIntrinsic>(BI) || BI == BCI || BI == EVI || +         isa<PseudoProbeInst>(BI) || isLifetimeEndOrBitCastFor(BI)) +    BI = BI->getNextNode(); +  if (BI != RetI) +    return false;    /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail    /// call. 
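For context on the IV-increment helpers introduced above (matchIncrement, getIVIncrement, and the IsReplacableIVIncrement check in replaceMathCmpWithIntrinsic), a minimal sketch of the kind of source loop they are aimed at; the function and its body are hypothetical, not taken from this patch or its tests:

  #include <cstdint>

  // The variable i becomes the loop-header phi (the IV), i + step in the
  // latch is the increment matchIncrement() recognizes (a sub would be
  // treated as an add of the negated step), and the unsigned-wrap compare
  // uses the same operands, so replaceMathCmpWithIntrinsic can combine the
  // pair into a llvm.uadd.with.overflow-style operation - per the comments
  // above, now even when LSR has left the increment in a different block
  // than the compare.
  uint64_t count_steps(uint32_t start, uint32_t step) {
    uint64_t trips = 0;
    uint32_t i = start;            // loop-header phi (the IV)
    do {                           // assumes step > 0 so the loop terminates
      ++trips;
      uint32_t next = i + step;    // IV increment in the latch
      if (next < i)                // wrap check on the same add
        break;                     // candidate for uadd.with.overflow
      i = next;
    } while (true);
    return trips;
  }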
@@ -2276,14 +2379,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT      }    } else {      SmallPtrSet<BasicBlock*, 4> VisitedBBs; -    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { -      if (!VisitedBBs.insert(*PI).second) +    for (BasicBlock *Pred : predecessors(BB)) { +      if (!VisitedBBs.insert(Pred).second)          continue; -      if (Instruction *I = (*PI)->rbegin()->getPrevNonDebugInstruction(true)) { +      if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) {          CallInst *CI = dyn_cast<CallInst>(I);          if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&              attributesPermitTailCall(F, CI, RetI, *TLI)) -          TailCallBBs.push_back(*PI); +          TailCallBBs.push_back(Pred);        }      }    } @@ -2775,11 +2878,16 @@ class TypePromotionTransaction {      /// Keep track of the debug users.      SmallVector<DbgValueInst *, 1> DbgValues; +    /// Keep track of the new value so that we can undo it by replacing +    /// instances of the new value with the original value. +    Value *New; +      using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;    public:      /// Replace all the use of \p Inst by \p New. -    UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) { +    UsesReplacer(Instruction *Inst, Value *New) +        : TypePromotionAction(Inst), New(New) {        LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New                          << "\n");        // Record the original uses. @@ -2798,20 +2906,14 @@ class TypePromotionTransaction {      /// Reassign the original uses of Inst to Inst.      void undo() override {        LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n"); -      for (use_iterator UseIt = OriginalUses.begin(), -                        EndIt = OriginalUses.end(); -           UseIt != EndIt; ++UseIt) { -        UseIt->Inst->setOperand(UseIt->Idx, Inst); -      } +      for (InstructionAndIdx &Use : OriginalUses) +        Use.Inst->setOperand(Use.Idx, Inst);        // RAUW has replaced all original uses with references to the new value,        // including the debug uses. Since we are undoing the replacements,        // the original debug uses must also be reinstated to maintain the        // correctness and utility of debug value instructions. -      for (auto *DVI: DbgValues) { -        LLVMContext &Ctx = Inst->getType()->getContext(); -        auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst)); -        DVI->setOperand(0, MV); -      } +      for (auto *DVI : DbgValues) +        DVI->replaceVariableLocationOp(New, Inst);      }    }; @@ -2981,9 +3083,8 @@ TypePromotionTransaction::getRestorationPoint() const {  }  bool TypePromotionTransaction::commit() { -  for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; -       ++It) -    (*It)->commit(); +  for (std::unique_ptr<TypePromotionAction> &Action : Actions) +    Action->commit();    bool Modified = !Actions.empty();    Actions.clear();    return Modified; @@ -3007,6 +3108,8 @@ class AddressingModeMatcher {    const TargetLowering &TLI;    const TargetRegisterInfo &TRI;    const DataLayout &DL; +  const LoopInfo &LI; +  const std::function<const DominatorTree &()> getDTFn;    /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and    /// the memory instruction that we're computing this address for. 
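The LI and getDTFn members added to AddressingModeMatcher above thread a lazy dominator-tree query through the matcher. A small self-contained C++ sketch of that pattern, using stub types rather than the real LLVM classes, might look like:

  #include <functional>
  #include <memory>

  struct DomTree {                        // stand-in for llvm::DominatorTree
    bool dominates(int Def, int Use) const { return Def <= Use; }
  };

  class LazyDomTree {                     // builds the tree at most once,
    std::unique_ptr<DomTree> DT;          // and only if someone asks
  public:
    const DomTree &get() {
      if (!DT)
        DT = std::make_unique<DomTree>(); // the expensive step being deferred
      return *DT;
    }
  };

  // The matcher only captures the callback; callers that never reach the
  // dominance check never pay for dominator-tree construction.
  bool isLegalCandidate(int Def, int Use, bool NeedsDomCheck,
                        const std::function<const DomTree &()> &GetDT) {
    if (!NeedsDomCheck)
      return true;                        // common, cheap path
    return GetDT().dominates(Def, Use);   // rare path: materialize and query
  }

  // Usage sketch:
  //   LazyDomTree LDT;
  //   isLegalCandidate(1, 2, /*NeedsDomCheck=*/true,
  //                    [&]() -> const DomTree & { return LDT.get(); });

In the patch itself the callback is built in optimizeMemoryInst and only invoked from the new IV-increment path in matchScaledValue, matching the comment that most address matches never need the domtree.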
@@ -3042,16 +3145,18 @@ class AddressingModeMatcher {    AddressingModeMatcher(        SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI, -      const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI, -      ExtAddrMode &AM, const SetOfInstrs &InsertedInsts, -      InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT, +      const TargetRegisterInfo &TRI, const LoopInfo &LI, +      const std::function<const DominatorTree &()> getDTFn, +      Type *AT, unsigned AS, Instruction *MI, ExtAddrMode &AM, +      const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, +      TypePromotionTransaction &TPT,        std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,        bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)        : AddrModeInsts(AMI), TLI(TLI), TRI(TRI), -        DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS), -        MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts), -        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP), -        OptSize(OptSize), PSI(PSI), BFI(BFI) { +        DL(MI->getModule()->getDataLayout()), LI(LI), getDTFn(getDTFn), +        AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM), +        InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT), +        LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {      IgnoreProfitability = false;    } @@ -3066,18 +3171,18 @@ public:    static ExtAddrMode    Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,          SmallVectorImpl<Instruction *> &AddrModeInsts, -        const TargetLowering &TLI, const TargetRegisterInfo &TRI, -        const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts, -        TypePromotionTransaction &TPT, +        const TargetLowering &TLI, const LoopInfo &LI, +        const std::function<const DominatorTree &()> getDTFn, +        const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts, +        InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,          std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,          bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {      ExtAddrMode Result; -    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS, -                                         MemoryInst, Result, InsertedInsts, -                                         PromotedInsts, TPT, LargeOffsetGEP, -                                         OptSize, PSI, BFI) -                       .matchAddr(V, 0); +    bool Success = AddressingModeMatcher( +        AddrModeInsts, TLI, TRI, LI, getDTFn, AccessTy, AS, MemoryInst, Result, +        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, +        BFI).matchAddr(V, 0);      (void)Success; assert(Success && "Couldn't select *anything*?");      return Result;    } @@ -3773,11 +3878,12 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,    // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now    // to see if ScaleReg is actually X+C.  If so, we can turn this into adding -  // X*Scale + C*Scale to addr mode. +  // X*Scale + C*Scale to addr mode. If we found available IV increment, do not +  // go any further: we can reuse it and cannot eliminate it.    ConstantInt *CI = nullptr; Value *AddLHS = nullptr; -  if (isa<Instruction>(ScaleReg) &&  // not a constant expr. +  if (isa<Instruction>(ScaleReg) && // not a constant expr.        
match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) && -      CI->getValue().isSignedIntN(64)) { +      !isIVIncrement(ScaleReg, &LI) && CI->getValue().isSignedIntN(64)) {      TestAddrMode.InBounds = false;      TestAddrMode.ScaledReg = AddLHS;      TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale; @@ -3789,9 +3895,75 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,        AddrMode = TestAddrMode;        return true;      } +    // Restore status quo. +    TestAddrMode = AddrMode; +  } + +  // If this is an add recurrence with a constant step, return the increment +  // instruction and the canonicalized step. +  auto GetConstantStep = [this](const Value * V) +      ->Optional<std::pair<Instruction *, APInt> > { +    auto *PN = dyn_cast<PHINode>(V); +    if (!PN) +      return None; +    auto IVInc = getIVIncrement(PN, &LI); +    if (!IVInc) +      return None; +    // TODO: The result of the intrinsics above is two-compliment. However when +    // IV inc is expressed as add or sub, iv.next is potentially a poison value. +    // If it has nuw or nsw flags, we need to make sure that these flags are +    // inferrable at the point of memory instruction. Otherwise we are replacing +    // well-defined two-compliment computation with poison. Currently, to avoid +    // potentially complex analysis needed to prove this, we reject such cases. +    if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(IVInc->first)) +      if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap()) +        return None; +    if (auto *ConstantStep = dyn_cast<ConstantInt>(IVInc->second)) +      return std::make_pair(IVInc->first, ConstantStep->getValue()); +    return None; +  }; + +  // Try to account for the following special case: +  // 1. ScaleReg is an inductive variable; +  // 2. We use it with non-zero offset; +  // 3. IV's increment is available at the point of memory instruction. +  // +  // In this case, we may reuse the IV increment instead of the IV Phi to +  // achieve the following advantages: +  // 1. If IV step matches the offset, we will have no need in the offset; +  // 2. Even if they don't match, we will reduce the overlap of living IV +  //    and IV increment, that will potentially lead to better register +  //    assignment. +  if (AddrMode.BaseOffs) { +    if (auto IVStep = GetConstantStep(ScaleReg)) { +      Instruction *IVInc = IVStep->first; +      // The following assert is important to ensure a lack of infinite loops. +      // This transforms is (intentionally) the inverse of the one just above. +      // If they don't agree on the definition of an increment, we'd alternate +      // back and forth indefinitely. +      assert(isIVIncrement(IVInc, &LI) && "implied by GetConstantStep"); +      APInt Step = IVStep->second; +      APInt Offset = Step * AddrMode.Scale; +      if (Offset.isSignedIntN(64)) { +        TestAddrMode.InBounds = false; +        TestAddrMode.ScaledReg = IVInc; +        TestAddrMode.BaseOffs -= Offset.getLimitedValue(); +        // If this addressing mode is legal, commit it.. +        // (Note that we defer the (expensive) domtree base legality check +        // to the very last possible point.) +        if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace) && +            getDTFn().dominates(IVInc, MemoryInst)) { +          AddrModeInsts.push_back(cast<Instruction>(IVInc)); +          AddrMode = TestAddrMode; +          return true; +        } +        // Restore status quo. 
+        TestAddrMode = AddrMode; +      } +    }    } -  // Otherwise, not (x+c)*scale, just return what we have. +  // Otherwise, just return what we have.    return true;  } @@ -4881,9 +5053,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,                                                                        0);      TypePromotionTransaction::ConstRestorationPt LastKnownGood =          TPT.getRestorationPoint(); -    AddressingModeMatcher Matcher( -        MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result, -        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); +    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn, +                                  AddressAccessTy, AS, MemoryInst, Result, +                                  InsertedInsts, PromotedInsts, TPT, +                                  LargeOffsetGEP, OptSize, PSI, BFI);      Matcher.IgnoreProfitability = true;      bool Success = Matcher.matchAddr(Address, 0);      (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -4986,9 +5159,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,      AddrModeInsts.clear();      std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,                                                                        0); +    // Defer the query (and possible computation of) the dom tree to point of +    // actual use.  It's expected that most address matches don't actually need +    // the domtree. +    auto getDTFn = [MemoryInst, this]() -> const DominatorTree & { +      Function *F = MemoryInst->getParent()->getParent(); +      return this->getDT(*F); +    };      ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( -        V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI, -        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, +        V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn, +        *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,          BFI.get());      GetElementPtrInst *GEP = LargeOffsetGEP.first; @@ -5373,14 +5553,19 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,      IRBuilder<> Builder(MemoryInst); +    Type *SourceTy = GEP->getSourceElementType();      Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());      // If the final index isn't a vector, emit a scalar GEP containing all ops      // and a vector GEP with all zeroes final index.      if (!Ops[FinalIndex]->getType()->isVectorTy()) { -      NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front()); +      NewAddr = Builder.CreateGEP(SourceTy, Ops[0], +                                  makeArrayRef(Ops).drop_front());        auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts); -      NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy)); +      auto *SecondTy = GetElementPtrInst::getIndexedType( +          SourceTy, makeArrayRef(Ops).drop_front()); +      NewAddr = +          Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));      } else {        Value *Base = Ops[0];        Value *Index = Ops[FinalIndex]; @@ -5389,11 +5574,14 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,        if (Ops.size() != 2) {          // Replace the last index with 0.          
Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy); -        Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front()); +        Base = Builder.CreateGEP(SourceTy, Base, +                                 makeArrayRef(Ops).drop_front()); +        SourceTy = GetElementPtrInst::getIndexedType( +            SourceTy, makeArrayRef(Ops).drop_front());        }        // Now create the GEP with scalar pointer and vector index. -      NewAddr = Builder.CreateGEP(Base, Index); +      NewAddr = Builder.CreateGEP(SourceTy, Base, Index);      }    } else if (!isa<Constant>(Ptr)) {      // Not a GEP, maybe its a splat and we can create a GEP to enable @@ -5409,7 +5597,16 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,      // Emit a vector GEP with a scalar pointer and all 0s vector index.      Type *ScalarIndexTy = DL->getIndexType(V->getType()->getScalarType());      auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts); -    NewAddr = Builder.CreateGEP(V, Constant::getNullValue(IndexTy)); +    Type *ScalarTy; +    if (cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() == +        Intrinsic::masked_gather) { +      ScalarTy = MemoryInst->getType()->getScalarType(); +    } else { +      assert(cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() == +             Intrinsic::masked_scatter); +      ScalarTy = MemoryInst->getOperand(0)->getType()->getScalarType(); +    } +    NewAddr = Builder.CreateGEP(ScalarTy, V, Constant::getNullValue(IndexTy));    } else {      // Constant, SelectionDAGBuilder knows to check if its a splat.      return false; @@ -6272,6 +6469,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {    EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());    unsigned BitWidth = LoadResultVT.getSizeInBits(); +  // If the BitWidth is 0, do not try to optimize the type +  if (BitWidth == 0) +    return false; +    APInt DemandBits(BitWidth, 0);    APInt WidestAndBits(BitWidth, 0); @@ -6409,7 +6610,7 @@ static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,      uint64_t Sum = TrueWeight + FalseWeight;      if (Sum != 0) {        auto Probability = BranchProbability::getBranchProbability(Max, Sum); -      if (Probability > TLI->getPredictableBranchThreshold()) +      if (Probability > TTI->getPredictableBranchThreshold())          return true;      }    } @@ -6795,7 +6996,8 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {    Value *Cond = SI->getCondition();    Type *OldType = Cond->getType();    LLVMContext &Context = Cond->getContext(); -  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); +  EVT OldVT = TLI->getValueType(*DL, OldType); +  MVT RegType = TLI->getRegisterType(Context, OldVT);    unsigned RegWidth = RegType.getSizeInBits();    if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth()) @@ -6809,14 +7011,21 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {    // where N is the number of cases in the switch.    auto *NewType = Type::getIntNTy(Context, RegWidth); -  // Zero-extend the switch condition and case constants unless the switch -  // condition is a function argument that is already being sign-extended. -  // In that case, we can avoid an unnecessary mask/extension by sign-extending -  // everything instead. +  // Extend the switch condition and case constants using the target preferred +  // extend unless the switch condition is a function argument with an extend +  // attribute. 
In that case, we can avoid an unnecessary mask/extension by +  // matching the argument extension instead.    Instruction::CastOps ExtType = Instruction::ZExt; -  if (auto *Arg = dyn_cast<Argument>(Cond)) +  // Some targets prefer SExt over ZExt. +  if (TLI->isSExtCheaperThanZExt(OldVT, RegType)) +    ExtType = Instruction::SExt; + +  if (auto *Arg = dyn_cast<Argument>(Cond)) {      if (Arg->hasSExtAttr())        ExtType = Instruction::SExt; +    if (Arg->hasZExtAttr()) +      ExtType = Instruction::ZExt; +  }    auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);    ExtInst->insertBefore(SI); @@ -6927,11 +7136,10 @@ class VectorPromoteHelper {      StoreInst *ST = cast<StoreInst>(CombineInst);      unsigned AS = ST->getPointerAddressSpace(); -    unsigned Align = ST->getAlignment();      // Check if this store is supported.      if (!TLI.allowsMisalignedMemoryAccesses(              TLI.getValueType(DL, ST->getValueOperand()->getType()), AS, -            Align)) { +            ST->getAlign())) {        // If this is not supported, there is no way we can combine        // the extract with the store.        return false; @@ -6940,9 +7148,9 @@ class VectorPromoteHelper {      // The scalar chain of computation has to pay for the transition      // scalar to vector.      // The vector chain has to account for the combining cost. -    uint64_t ScalarCost = +    InstructionCost ScalarCost =          TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); -    uint64_t VectorCost = StoreExtractCombineCost; +    InstructionCost VectorCost = StoreExtractCombineCost;      enum TargetTransformInfo::TargetCostKind CostKind =        TargetTransformInfo::TCK_RecipThroughput;      for (const auto &Inst : InstsToBePromoted) { @@ -7483,9 +7691,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,    for (GetElementPtrInst *UGEPI : UGEPIs) {      ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));      APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue(); -    unsigned ImmCost = -      TTI->getIntImmCost(NewIdx, GEPIIdx->getType(), -                         TargetTransformInfo::TCK_SizeAndLatency); +    InstructionCost ImmCost = TTI->getIntImmCost( +        NewIdx, GEPIIdx->getType(), TargetTransformInfo::TCK_SizeAndLatency);      if (ImmCost > TargetTransformInfo::TCC_Basic)        return false;    } @@ -7511,6 +7718,67 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,    return true;  } +static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI) { +  // Try and convert +  //  %c = icmp ult %x, 8 +  //  br %c, bla, blb +  //  %tc = lshr %x, 3 +  // to +  //  %tc = lshr %x, 3 +  //  %c = icmp eq %tc, 0 +  //  br %c, bla, blb +  // Creating the cmp to zero can be better for the backend, especially if the +  // lshr produces flags that can be used automatically. 
+  if (!TLI.preferZeroCompareBranch() || !Branch->isConditional()) +    return false; + +  ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition()); +  if (!Cmp || !isa<ConstantInt>(Cmp->getOperand(1)) || !Cmp->hasOneUse()) +    return false; + +  Value *X = Cmp->getOperand(0); +  APInt CmpC = cast<ConstantInt>(Cmp->getOperand(1))->getValue(); + +  for (auto *U : X->users()) { +    Instruction *UI = dyn_cast<Instruction>(U); +    // A quick dominance check +    if (!UI || +        (UI->getParent() != Branch->getParent() && +         UI->getParent() != Branch->getSuccessor(0) && +         UI->getParent() != Branch->getSuccessor(1)) || +        (UI->getParent() != Branch->getParent() && +         !UI->getParent()->getSinglePredecessor())) +      continue; + +    if (CmpC.isPowerOf2() && Cmp->getPredicate() == ICmpInst::ICMP_ULT && +        match(UI, m_Shr(m_Specific(X), m_SpecificInt(CmpC.logBase2())))) { +      IRBuilder<> Builder(Branch); +      if (UI->getParent() != Branch->getParent()) +        UI->moveBefore(Branch); +      Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI, +                                        ConstantInt::get(UI->getType(), 0)); +      LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); +      LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n"); +      Cmp->replaceAllUsesWith(NewCmp); +      return true; +    } +    if (Cmp->isEquality() && +        (match(UI, m_Add(m_Specific(X), m_SpecificInt(-CmpC))) || +         match(UI, m_Sub(m_Specific(X), m_SpecificInt(CmpC))))) { +      IRBuilder<> Builder(Branch); +      if (UI->getParent() != Branch->getParent()) +        UI->moveBefore(Branch); +      Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI, +                                        ConstantInt::get(UI->getType(), 0)); +      LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); +      LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n"); +      Cmp->replaceAllUsesWith(NewCmp); +      return true; +    } +  } +  return false; +} +  bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {    // Bail out if we inserted the instruction to prevent optimizations from    // stepping on each other's toes. @@ -7672,6 +7940,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {      return optimizeSwitchInst(cast<SwitchInst>(I));    case Instruction::ExtractElement:      return optimizeExtractElementInst(cast<ExtractElementInst>(I)); +  case Instruction::Br: +    return optimizeBranch(cast<BranchInst>(I), *TLI);    }    return false; @@ -7731,19 +8001,23 @@ bool CodeGenPrepare::fixupDbgValue(Instruction *I) {    DbgValueInst &DVI = *cast<DbgValueInst>(I);    // Does this dbg.value refer to a sunk address calculation? -  Value *Location = DVI.getVariableLocation(); -  WeakTrackingVH SunkAddrVH = SunkAddrs[Location]; -  Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; -  if (SunkAddr) { -    // Point dbg.value at locally computed address, which should give the best -    // opportunity to be accurately lowered. This update may change the type of -    // pointer being referred to; however this makes no difference to debugging -    // information, and we can't generate bitcasts that may affect codegen. 
-    DVI.setOperand(0, MetadataAsValue::get(DVI.getContext(), -                                           ValueAsMetadata::get(SunkAddr))); -    return true; -  } -  return false; +  bool AnyChange = false; +  SmallDenseSet<Value *> LocationOps(DVI.location_ops().begin(), +                                     DVI.location_ops().end()); +  for (Value *Location : LocationOps) { +    WeakTrackingVH SunkAddrVH = SunkAddrs[Location]; +    Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; +    if (SunkAddr) { +      // Point dbg.value at locally computed address, which should give the best +      // opportunity to be accurately lowered. This update may change the type +      // of pointer being referred to; however this makes no difference to +      // debugging information, and we can't generate bitcasts that may affect +      // codegen. +      DVI.replaceVariableLocationOp(Location, SunkAddr); +      AnyChange = true; +    } +  } +  return AnyChange;  }  // A llvm.dbg.value may be using a value before its definition, due to @@ -7762,30 +8036,73 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {        if (!DVI)          continue; -      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); +      SmallVector<Instruction *, 4> VIs; +      for (Value *V : DVI->getValues()) +        if (Instruction *VI = dyn_cast_or_null<Instruction>(V)) +          VIs.push_back(VI); + +      // This DVI may depend on multiple instructions, complicating any +      // potential sink. This block takes the defensive approach, opting to +      // "undef" the DVI if it has more than one instruction and any of them do +      // not dominate DVI. +      for (Instruction *VI : VIs) { +        if (VI->isTerminator()) +          continue; -      if (!VI || VI->isTerminator()) -        continue; +        // If VI is a phi in a block with an EHPad terminator, we can't insert +        // after it. +        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad()) +          continue; -      // If VI is a phi in a block with an EHPad terminator, we can't insert -      // after it. -      if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad()) -        continue; +        // If the defining instruction dominates the dbg.value, we do not need +        // to move the dbg.value. +        if (DT.dominates(VI, DVI)) +          continue; -      // If the defining instruction dominates the dbg.value, we do not need -      // to move the dbg.value. -      if (DT.dominates(VI, DVI)) -        continue; +        // If we depend on multiple instructions and any of them doesn't +        // dominate this DVI, we probably can't salvage it: moving it to +        // after any of the instructions could cause us to lose the others. 
+        if (VIs.size() > 1) { +          LLVM_DEBUG( +              dbgs() +              << "Unable to find valid location for Debug Value, undefing:\n" +              << *DVI); +          DVI->setUndef(); +          break; +        } -      LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n" -                        << *DVI << ' ' << *VI); -      DVI->removeFromParent(); -      if (isa<PHINode>(VI)) -        DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt()); -      else -        DVI->insertAfter(VI); -      MadeChange = true; -      ++NumDbgValueMoved; +        LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n" +                          << *DVI << ' ' << *VI); +        DVI->removeFromParent(); +        if (isa<PHINode>(VI)) +          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt()); +        else +          DVI->insertAfter(VI); +        MadeChange = true; +        ++NumDbgValueMoved; +      } +    } +  } +  return MadeChange; +} + +// Group scattered pseudo probes in a block to favor SelectionDAG. Scattered +// probes can be chained dependencies of other regular DAG nodes and block DAG +// combine optimizations. +bool CodeGenPrepare::placePseudoProbes(Function &F) { +  bool MadeChange = false; +  for (auto &Block : F) { +    // Move the rest probes to the beginning of the block. +    auto FirstInst = Block.getFirstInsertionPt(); +    while (FirstInst != Block.end() && FirstInst->isDebugOrPseudoInst()) +      ++FirstInst; +    BasicBlock::iterator I(FirstInst); +    I++; +    while (I != Block.end()) { +      if (auto *II = dyn_cast<PseudoProbeInst>(I++)) { +        II->moveBefore(&*FirstInst); +        MadeChange = true; +      }      }    }    return MadeChange;  | 
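A source-level view of the new optimizeBranch rewrite for targets where TLI.preferZeroCompareBranch() returns true; the C++ below is a hypothetical illustration mirroring the IR comment inside optimizeBranch, not code from the patch or its tests:

  #include <cstdint>

  // For unsigned x, x < 8 holds exactly when (x >> 3) == 0, so the pass can
  // hoist the shift above the branch and compare its result against zero,
  // letting targets reuse the flags the shift already produces.
  uint32_t words_after_guard(uint32_t x) {
    if (x < 8)          //   %c = icmp ult %x, 8 ; br %c, ...
      return 0;
    return x >> 3;      //   %tc = lshr %x, 3  ->  %c = icmp eq %tc, 0
  }

The same hunk also handles the equality form, where a compare of x against a constant C sitting next to x - C (or x + (-C)) is rewritten as a zero compare of that add/sub result.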
