Diffstat (limited to 'contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp')
 -rw-r--r--  contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp | 2192
 1 file changed, 1363 insertions(+), 829 deletions(-)
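For context before the diff body: this commit changes how CodeGenPrepare obtains its TargetMachine. The pass constructor and createCodeGenPreparePass() no longer take a TargetMachine*; instead, runOnFunction() recovers it at run time via getAnalysisIfAvailable<TargetPassConfig>(), and the INITIALIZE_TM_PASS macros become plain INITIALIZE_PASS macros. The following is a minimal sketch of that pattern under the same convention; the pass name MyIselPrep and factory createMyIselPrepPass are hypothetical illustrations, not part of this commit.

// Sketch only: mirrors the TargetPassConfig pattern adopted by CodeGenPrepare
// in this diff. MyIselPrep / createMyIselPrepPass are hypothetical names.
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

namespace {
struct MyIselPrep : public FunctionPass {
  static char ID;
  const TargetMachine *TM = nullptr;

  // No TargetMachine* constructor parameter any more.
  MyIselPrep() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    // When the pass runs inside a codegen pipeline, TargetPassConfig is
    // available and hands back the TargetMachine; otherwise TM stays null.
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      TM = &TPC->getTM<TargetMachine>();
    if (!TM)
      return false; // no target info: skip target-dependent preparation
    // ... target-dependent IR preparation would go here ...
    return false;
  }
};
} // end anonymous namespace

char MyIselPrep::ID = 0;

// The factory takes no arguments, matching createCodeGenPreparePass() after
// this change.
FunctionPass *createMyIselPrepPass() { return new MyIselPrep(); }

The same shape appears below in CodeGenPrepare::runOnFunction(), which additionally caches the subtarget, TargetLowering, and TargetRegisterInfo once the TargetMachine has been recovered.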
diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp index 934b470f13b5..cb31c21293f4 100644 --- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -13,20 +13,23 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/Passes.h"  #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/BlockFrequencyInfo.h"  #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/ProfileSummaryInfo.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" @@ -53,8 +56,11 @@  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/BuildLibCalls.h"  #include "llvm/Transforms/Utils/BypassSlowDivision.h" +#include "llvm/Transforms/Utils/Cloning.h"  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +  using namespace llvm;  using namespace llvm::PatternMatch; @@ -77,9 +83,14 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");  STATISTIC(NumRetsDup,    "Number of return instructions duplicated");  STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");  STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); -STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");  STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, +          "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); +  static cl::opt<bool> DisableBranchOpts(    "disable-cgp-branch-opts", cl::Hidden, cl::init(false),    cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -93,7 +104,7 @@ static cl::opt<bool> DisableSelectToBranch(    cl::desc("Disable select to branch conversion."));  static cl::opt<bool> AddrSinkUsingGEPs( -  "addr-sink-using-gep", cl::Hidden, cl::init(false), +  "addr-sink-using-gep", cl::Hidden, cl::init(true),    cl::desc("Address sinking in CGP using GEPs."));  static cl::opt<bool> EnableAndCmpSinking( @@ -123,7 +134,7 @@ static cl::opt<bool> DisablePreheaderProtect(      cl::desc("Disable protection against removing loop preheaders"));  static cl::opt<bool> ProfileGuidedSectionPrefix( -    "profile-guided-section-prefix", cl::Hidden, cl::init(true), +    "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,      cl::desc("Use profile info to add section prefix for hot/cold functions"));  static cl::opt<unsigned> FreqRatioToSkipMerge( @@ -135,15 +146,29 @@ static cl::opt<bool> ForceSplitStore(      "force-split-store", cl::Hidden, cl::init(false),      cl::desc("Force store splitting no 
matter what the target query says.")); +static cl::opt<bool> +EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, +    cl::desc("Enable merging of redundant sexts when one is dominating" +    " the other."), cl::init(true)); + +static cl::opt<unsigned> MemCmpNumLoadsPerBlock( +    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), +    cl::desc("The number of loads per basic block for inline expansion of " +             "memcmp that is only being compared against zero.")); +  namespace {  typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;  typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;  typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; +typedef SmallVector<Instruction *, 16> SExts; +typedef DenseMap<Value *, SExts> ValueToSExts;  class TypePromotionTransaction;    class CodeGenPrepare : public FunctionPass {      const TargetMachine *TM; +    const TargetSubtargetInfo *SubtargetInfo;      const TargetLowering *TLI; +    const TargetRegisterInfo *TRI;      const TargetTransformInfo *TTI;      const TargetLibraryInfo *TLInfo;      const LoopInfo *LI; @@ -165,6 +190,15 @@ class TypePromotionTransaction;      /// promotion for the current function.      InstrToOrigTy PromotedInsts; +    /// Keep track of instructions removed during promotion. +    SetOfInstrs RemovedInsts; + +    /// Keep track of sext chains based on their initial value. +    DenseMap<Value *, Instruction *> SeenChainsForSExt; + +    /// Keep track of SExt promoted. +    ValueToSExts ValToSExtendedUses; +      /// True if CFG is modified in any way.      bool ModifiedDT; @@ -176,10 +210,11 @@ class TypePromotionTransaction;    public:      static char ID; // Pass identification, replacement for typeid -    explicit CodeGenPrepare(const TargetMachine *TM = nullptr) -        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) { -        initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); -      } +    CodeGenPrepare() +        : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr), +          DL(nullptr) { +      initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); +    }      bool runOnFunction(Function &F) override;      StringRef getPassName() const override { return "CodeGen Prepare"; } @@ -200,13 +235,13 @@ class TypePromotionTransaction;      void eliminateMostlyEmptyBlock(BasicBlock *BB);      bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,                                         bool isPreheader); -    bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); -    bool optimizeInst(Instruction *I, bool& ModifiedDT); +    bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); +    bool optimizeInst(Instruction *I, bool &ModifiedDT);      bool optimizeMemoryInst(Instruction *I, Value *Addr,                              Type *AccessTy, unsigned AS);      bool optimizeInlineAsmInst(CallInst *CS); -    bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); -    bool moveExtToFormExtLoad(Instruction *&I); +    bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); +    bool optimizeExt(Instruction *&I);      bool optimizeExtUses(Instruction *I);      bool optimizeLoadExt(LoadInst *I);      bool optimizeSelectInst(SelectInst *SI); @@ -215,26 +250,32 @@ class TypePromotionTransaction;      bool optimizeExtractElementInst(Instruction *Inst);      bool dupRetToEnableTailCallOpts(BasicBlock *BB);      bool placeDbgValues(Function &F); -    bool sinkAndCmp(Function &F); -    bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, 
-                        Instruction *&Inst, -                        const SmallVectorImpl<Instruction *> &Exts, -                        unsigned CreatedInstCost); +    bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts, +                      LoadInst *&LI, Instruction *&Inst, bool HasPromoted); +    bool tryToPromoteExts(TypePromotionTransaction &TPT, +                          const SmallVectorImpl<Instruction *> &Exts, +                          SmallVectorImpl<Instruction *> &ProfitablyMovedExts, +                          unsigned CreatedInstsCost = 0); +    bool mergeSExts(Function &F); +    bool performAddressTypePromotion( +        Instruction *&Inst, +        bool AllowPromotionWithoutCommonHeader, +        bool HasPromoted, TypePromotionTransaction &TPT, +        SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);      bool splitBranchCondition(Function &F);      bool simplifyOffsetableRelocate(Instruction &I); +    bool splitIndirectCriticalEdges(Function &F);    };  }  char CodeGenPrepare::ID = 0; -INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare", -                         "Optimize for code generation", false, false) +INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, +                      "Optimize for code generation", false, false)  INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare", -                       "Optimize for code generation", false, false) +INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, +                    "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { -  return new CodeGenPrepare(TM); -} +FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }  bool CodeGenPrepare::runOnFunction(Function &F) {    if (skipFunction(F)) @@ -250,8 +291,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    BPI.reset();    ModifiedDT = false; -  if (TM) -    TLI = TM->getSubtargetImpl(F)->getTargetLowering(); +  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { +    TM = &TPC->getTM<TargetMachine>(); +    SubtargetInfo = TM->getSubtargetImpl(F); +    TLI = SubtargetInfo->getTargetLowering(); +    TRI = SubtargetInfo->getRegisterInfo(); +  }    TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -260,10 +305,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    if (ProfileGuidedSectionPrefix) {      ProfileSummaryInfo *PSI =          getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); -    if (PSI->isFunctionEntryHot(&F)) +    if (PSI->isFunctionHotInCallGraph(&F))        F.setSectionPrefix(".hot"); -    else if (PSI->isFunctionEntryCold(&F)) -      F.setSectionPrefix(".cold"); +    else if (PSI->isFunctionColdInCallGraph(&F)) +      F.setSectionPrefix(".unlikely");    }    /// This optimization identifies DIV instructions that can be @@ -290,18 +335,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {    // find a node corresponding to the value.    EverMadeChange |= placeDbgValues(F); -  // If there is a mask, compare against zero, and branch that can be combined -  // into a single target instruction, push the mask and compare into branch -  // users. Do this before OptimizeBlock -> OptimizeInst -> -  // OptimizeCmpExpression, which perturbs the pattern being searched for. 
-  if (!DisableBranchOpts) { -    EverMadeChange |= sinkAndCmp(F); +  if (!DisableBranchOpts)      EverMadeChange |= splitBranchCondition(F); -  } + +  // Split some critical edges where one of the sources is an indirect branch, +  // to help generate sane code for PHIs involving such edges. +  EverMadeChange |= splitIndirectCriticalEdges(F);    bool MadeChange = true;    while (MadeChange) {      MadeChange = false; +    SeenChainsForSExt.clear(); +    ValToSExtendedUses.clear(); +    RemovedInsts.clear();      for (Function::iterator I = F.begin(); I != F.end(); ) {        BasicBlock *BB = &*I++;        bool ModifiedDTOnIteration = false; @@ -311,6 +357,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {        if (ModifiedDTOnIteration)          break;      } +    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) +      MadeChange |= mergeSExts(F); + +    // Really free removed instructions during promotion. +    for (Instruction *I : RemovedInsts) +      I->deleteValue(); +      EverMadeChange |= MadeChange;    } @@ -432,6 +485,160 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {    return DestBB;  } +// Return the unique indirectbr predecessor of a block. This may return null +// even if such a predecessor exists, if it's not useful for splitting. +// If a predecessor is found, OtherPreds will contain all other (non-indirectbr) +// predecessors of BB. +static BasicBlock * +findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) { +  // If the block doesn't have any PHIs, we don't care about it, since there's +  // no point in splitting it. +  PHINode *PN = dyn_cast<PHINode>(BB->begin()); +  if (!PN) +    return nullptr; + +  // Verify we have exactly one IBR predecessor. +  // Conservatively bail out if one of the other predecessors is not a "regular" +  // terminator (that is, not a switch or a br). +  BasicBlock *IBB = nullptr; +  for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { +    BasicBlock *PredBB = PN->getIncomingBlock(Pred); +    TerminatorInst *PredTerm = PredBB->getTerminator(); +    switch (PredTerm->getOpcode()) { +    case Instruction::IndirectBr: +      if (IBB) +        return nullptr; +      IBB = PredBB; +      break; +    case Instruction::Br: +    case Instruction::Switch: +      OtherPreds.push_back(PredBB); +      continue; +    default: +      return nullptr; +    } +  } + +  return IBB; +} + +// Split critical edges where the source of the edge is an indirectbr +// instruction. This isn't always possible, but we can handle some easy cases. +// This is useful because MI is unable to split such critical edges, +// which means it will not be able to sink instructions along those edges. +// This is especially painful for indirect branches with many successors, where +// we end up having to prepare all outgoing values in the origin block. +// +// Our normal algorithm for splitting critical edges requires us to update +// the outgoing edges of the edge origin block, but for an indirectbr this +// is hard, since it would require finding and updating the block addresses +// the indirect branch uses. But if a block only has a single indirectbr +// predecessor, with the others being regular branches, we can do it in a +// different way. +// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr. +// We can split D into D0 and D1, where D0 contains only the PHIs from D, +// and D1 is the D block body. 
We can then duplicate D0 as D0A and D0B, and +// create the following structure: +// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1 +bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) { +  // Check whether the function has any indirectbrs, and collect which blocks +  // they may jump to. Since most functions don't have indirect branches, +  // this lowers the common case's overhead to O(Blocks) instead of O(Edges). +  SmallSetVector<BasicBlock *, 16> Targets; +  for (auto &BB : F) { +    auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator()); +    if (!IBI) +      continue; + +    for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ) +      Targets.insert(IBI->getSuccessor(Succ)); +  } + +  if (Targets.empty()) +    return false; + +  bool Changed = false; +  for (BasicBlock *Target : Targets) { +    SmallVector<BasicBlock *, 16> OtherPreds; +    BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds); +    // If we did not found an indirectbr, or the indirectbr is the only +    // incoming edge, this isn't the kind of edge we're looking for. +    if (!IBRPred || OtherPreds.empty()) +      continue; + +    // Don't even think about ehpads/landingpads. +    Instruction *FirstNonPHI = Target->getFirstNonPHI(); +    if (FirstNonPHI->isEHPad() || Target->isLandingPad()) +      continue; + +    BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); +    // It's possible Target was its own successor through an indirectbr. +    // In this case, the indirectbr now comes from BodyBlock. +    if (IBRPred == Target) +      IBRPred = BodyBlock; + +    // At this point Target only has PHIs, and BodyBlock has the rest of the +    // block's body. Create a copy of Target that will be used by the "direct" +    // preds. +    ValueToValueMapTy VMap; +    BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F); + +    for (BasicBlock *Pred : OtherPreds) { +      // If the target is a loop to itself, then the terminator of the split +      // block needs to be updated. +      if (Pred == Target) +        BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc); +      else +        Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc); +    } + +    // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that +    // they are clones, so the number of PHIs are the same. +    // (a) Remove the edge coming from IBRPred from the "Direct" PHI +    // (b) Leave that as the only edge in the "Indirect" PHI. +    // (c) Merge the two in the body block. +    BasicBlock::iterator Indirect = Target->begin(), +                         End = Target->getFirstNonPHI()->getIterator(); +    BasicBlock::iterator Direct = DirectSucc->begin(); +    BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt(); + +    assert(&*End == Target->getTerminator() && +           "Block was expected to only contain PHIs"); + +    while (Indirect != End) { +      PHINode *DirPHI = cast<PHINode>(Direct); +      PHINode *IndPHI = cast<PHINode>(Indirect); + +      // Now, clean up - the direct block shouldn't get the indirect value, +      // and vice versa. +      DirPHI->removeIncomingValue(IBRPred); +      Direct++; + +      // Advance the pointer here, to avoid invalidation issues when the old +      // PHI is erased. 
+      Indirect++; + +      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI); +      NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred), +                             IBRPred); + +      // Create a PHI in the body block, to merge the direct and indirect +      // predecessors. +      PHINode *MergePHI = +          PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert); +      MergePHI->addIncoming(NewIndPHI, Target); +      MergePHI->addIncoming(DirPHI, DirectSucc); + +      IndPHI->replaceAllUsesWith(MergePHI); +      IndPHI->eraseFromParent(); +    } + +    Changed = true; +  } + +  return Changed; +} +  /// Eliminate blocks that contain only PHI nodes, debug info directives, and an  /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split  /// edges in ways that are non-optimal for isel. Start by eliminating these @@ -1090,6 +1297,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {    return false;  } +/// Duplicate and sink the given 'and' instruction into user blocks where it is +/// used in a compare to allow isel to generate better code for targets where +/// this operation can be combined. +/// +/// Return true if any changes are made. +static bool sinkAndCmp0Expression(Instruction *AndI, +                                  const TargetLowering &TLI, +                                  SetOfInstrs &InsertedInsts) { +  // Double-check that we're not trying to optimize an instruction that was +  // already optimized by some other part of this pass. +  assert(!InsertedInsts.count(AndI) && +         "Attempting to optimize already optimized and instruction"); +  (void) InsertedInsts; + +  // Nothing to do for single use in same basic block. +  if (AndI->hasOneUse() && +      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent()) +    return false; + +  // Try to avoid cases where sinking/duplicating is likely to increase register +  // pressure. +  if (!isa<ConstantInt>(AndI->getOperand(0)) && +      !isa<ConstantInt>(AndI->getOperand(1)) && +      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse()) +    return false; + +  for (auto *U : AndI->users()) { +    Instruction *User = cast<Instruction>(U); + +    // Only sink for and mask feeding icmp with 0. +    if (!isa<ICmpInst>(User)) +      return false; + +    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1)); +    if (!CmpC || !CmpC->isZero()) +      return false; +  } + +  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI)) +    return false; + +  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n"); +  DEBUG(AndI->getParent()->dump()); + +  // Push the 'and' into the same block as the icmp 0.  There should only be +  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any +  // others, so we don't need to keep track of which BBs we insert into. +  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end(); +       UI != E; ) { +    Use &TheUse = UI.getUse(); +    Instruction *User = cast<Instruction>(*UI); + +    // Preincrement use iterator so we don't invalidate it. +    ++UI; + +    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n"); + +    // Keep the 'and' in the same place if the use is already in the same block. +    Instruction *InsertPt = +        User->getParent() == AndI->getParent() ? 
AndI : User; +    Instruction *InsertedAnd = +        BinaryOperator::Create(Instruction::And, AndI->getOperand(0), +                               AndI->getOperand(1), "", InsertPt); +    // Propagate the debug info. +    InsertedAnd->setDebugLoc(AndI->getDebugLoc()); + +    // Replace a use of the 'and' with a use of the new 'and'. +    TheUse = InsertedAnd; +    ++NumAndUses; +    DEBUG(User->getParent()->dump()); +  } + +  // We removed all uses, nuke the and. +  AndI->eraseFromParent(); +  return true; +} +  /// Check if the candidates could be combined with a shift instruction, which  /// includes:  /// 1. Truncate instruction @@ -1278,519 +1562,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,    return MadeChange;  } -// Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, -//                               <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -//  %1 = bitcast i8* %addr to i32* -//  %2 = extractelement <16 x i1> %mask, i32 0 -//  %3 = icmp eq i1 %2, true -//  br i1 %3, label %cond.load, label %else -// -//cond.load:                                        ; preds = %0 -//  %4 = getelementptr i32* %1, i32 0 -//  %5 = load i32* %4 -//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0 -//  br label %else -// -//else:                                             ; preds = %0, %cond.load -//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] -//  %7 = extractelement <16 x i1> %mask, i32 1 -//  %8 = icmp eq i1 %7, true -//  br i1 %8, label %cond.load1, label %else2 -// -//cond.load1:                                       ; preds = %else -//  %9 = getelementptr i32* %1, i32 1 -//  %10 = load i32* %9 -//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 -//  br label %else2 -// -//else2:                                            ; preds = %else, %cond.load1 -//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -//  %12 = extractelement <16 x i1> %mask, i32 2 -//  %13 = icmp eq i1 %12, true -//  br i1 %13, label %cond.load4, label %else5 -// -static void scalarizeMaskedLoad(CallInst *CI) { -  Value *Ptr  = CI->getArgOperand(0); -  Value *Alignment = CI->getArgOperand(1); -  Value *Mask = CI->getArgOperand(2); -  Value *Src0 = CI->getArgOperand(3); - -  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); -  VectorType *VecType = dyn_cast<VectorType>(CI->getType()); -  assert(VecType && "Unexpected return type of masked load intrinsic"); - -  Type *EltTy = CI->getType()->getVectorElementType(); - -  IRBuilder<> Builder(CI->getContext()); -  Instruction *InsertPt = CI; -  BasicBlock *IfBlock = CI->getParent(); -  BasicBlock *CondBlock = nullptr; -  BasicBlock *PrevIfBlock = CI->getParent(); - -  Builder.SetInsertPoint(InsertPt); -  Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - -  // Short-cut if the mask is all-true. -  bool IsAllOnesMask = isa<Constant>(Mask) && -    cast<Constant>(Mask)->isAllOnesValue(); - -  if (IsAllOnesMask) { -    Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); -    CI->replaceAllUsesWith(NewI); -    CI->eraseFromParent(); -    return; -  } - -  // Adjust alignment for the scalar instruction. 
-  AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); -  // Bitcast %addr fron i8* to EltTy* -  Type *NewPtrType = -    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); -  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); -  unsigned VectorWidth = VecType->getNumElements(); - -  Value *UndefVal = UndefValue::get(VecType); - -  // The result vector -  Value *VResult = UndefVal; - -  if (isa<ConstantVector>(Mask)) { -    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { -      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) -          continue; -      Value *Gep = -          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); -      LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); -      VResult = Builder.CreateInsertElement(VResult, Load, -                                            Builder.getInt32(Idx)); -    } -    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); -    CI->replaceAllUsesWith(NewI); -    CI->eraseFromParent(); -    return; -  } - -  PHINode *Phi = nullptr; -  Value *PrevPhi = UndefVal; - -  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - -    // Fill the "else" block, created in the previous iteration -    // -    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx -    //  %to_load = icmp eq i1 %mask_1, true -    //  br i1 %to_load, label %cond.load, label %else -    // -    if (Idx > 0) { -      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); -      Phi->addIncoming(VResult, CondBlock); -      Phi->addIncoming(PrevPhi, PrevIfBlock); -      PrevPhi = Phi; -      VResult = Phi; -    } - -    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); -    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, -                                    ConstantInt::get(Predicate->getType(), 1)); - -    // Create "cond" block -    // -    //  %EltAddr = getelementptr i32* %1, i32 0 -    //  %Elt = load i32* %EltAddr -    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx -    // -    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); -    Builder.SetInsertPoint(InsertPt); - -    Value *Gep = -        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); -    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); -    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); - -    // Create "else" block, fill it in the next iteration -    BasicBlock *NewIfBlock = -        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); -    Builder.SetInsertPoint(InsertPt); -    Instruction *OldBr = IfBlock->getTerminator(); -    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); -    OldBr->eraseFromParent(); -    PrevIfBlock = IfBlock; -    IfBlock = NewIfBlock; -  } - -  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); -  Phi->addIncoming(VResult, CondBlock); -  Phi->addIncoming(PrevPhi, PrevIfBlock); -  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); -  CI->replaceAllUsesWith(NewI); -  CI->eraseFromParent(); -} - -// Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, -//                               <16 x i1> %mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set -// -//   %1 = bitcast i8* %addr to i32* -//   %2 = extractelement <16 x i1> %mask, 
i32 0 -//   %3 = icmp eq i1 %2, true -//   br i1 %3, label %cond.store, label %else -// -// cond.store:                                       ; preds = %0 -//   %4 = extractelement <16 x i32> %val, i32 0 -//   %5 = getelementptr i32* %1, i32 0 -//   store i32 %4, i32* %5 -//   br label %else -// -// else:                                             ; preds = %0, %cond.store -//   %6 = extractelement <16 x i1> %mask, i32 1 -//   %7 = icmp eq i1 %6, true -//   br i1 %7, label %cond.store1, label %else2 -// -// cond.store1:                                      ; preds = %else -//   %8 = extractelement <16 x i32> %val, i32 1 -//   %9 = getelementptr i32* %1, i32 1 -//   store i32 %8, i32* %9 -//   br label %else2 -//   . . . -static void scalarizeMaskedStore(CallInst *CI) { -  Value *Src = CI->getArgOperand(0); -  Value *Ptr  = CI->getArgOperand(1); -  Value *Alignment = CI->getArgOperand(2); -  Value *Mask = CI->getArgOperand(3); - -  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); -  VectorType *VecType = dyn_cast<VectorType>(Src->getType()); -  assert(VecType && "Unexpected data type in masked store intrinsic"); - -  Type *EltTy = VecType->getElementType(); - -  IRBuilder<> Builder(CI->getContext()); -  Instruction *InsertPt = CI; -  BasicBlock *IfBlock = CI->getParent(); -  Builder.SetInsertPoint(InsertPt); -  Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - -  // Short-cut if the mask is all-true. -  bool IsAllOnesMask = isa<Constant>(Mask) && -    cast<Constant>(Mask)->isAllOnesValue(); - -  if (IsAllOnesMask) { -    Builder.CreateAlignedStore(Src, Ptr, AlignVal); -    CI->eraseFromParent(); -    return; -  } - -  // Adjust alignment for the scalar instruction. -  AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); -  // Bitcast %addr fron i8* to EltTy* -  Type *NewPtrType = -    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); -  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); -  unsigned VectorWidth = VecType->getNumElements(); - -  if (isa<ConstantVector>(Mask)) { -    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { -      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) -          continue; -      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); -      Value *Gep = -          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); -      Builder.CreateAlignedStore(OneElt, Gep, AlignVal); -    } -    CI->eraseFromParent(); -    return; -  } - -  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - -    // Fill the "else" block, created in the previous iteration -    // -    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx -    //  %to_store = icmp eq i1 %mask_1, true -    //  br i1 %to_store, label %cond.store, label %else -    // -    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); -    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, -                                    ConstantInt::get(Predicate->getType(), 1)); - -    // Create "cond" block -    // -    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx -    //  %EltAddr = getelementptr i32* %1, i32 0 -    //  %store i32 %OneElt, i32* %EltAddr -    // -    BasicBlock *CondBlock = -        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); -    Builder.SetInsertPoint(InsertPt); - -    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); -    Value *Gep = -        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, 
Builder.getInt32(Idx)); -    Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - -    // Create "else" block, fill it in the next iteration -    BasicBlock *NewIfBlock = -        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); -    Builder.SetInsertPoint(InsertPt); -    Instruction *OldBr = IfBlock->getTerminator(); -    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); -    OldBr->eraseFromParent(); -    IfBlock = NewIfBlock; -  } -  CI->eraseFromParent(); -} - -// Translate a masked gather intrinsic like -// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, -//                               <16 x i1> %Mask, <16 x i32> %Src) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> %Mask, i32 0 -// % ToLoad0 = icmp eq i1 % Mask0, true -// br i1 % ToLoad0, label %cond.load, label %else -// -// cond.load: -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// % Load0 = load i32, i32* % Ptr0, align 4 -// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 -// br label %else -// -// else: -// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] -// % Mask1 = extractelement <16 x i1> %Mask, i32 1 -// % ToLoad1 = icmp eq i1 % Mask1, true -// br i1 % ToLoad1, label %cond.load1, label %else2 -// -// cond.load1: -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// % Load1 = load i32, i32* % Ptr1, align 4 -// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 -// br label %else2 -// . . . -// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src -// ret <16 x i32> %Result -static void scalarizeMaskedGather(CallInst *CI) { -  Value *Ptrs = CI->getArgOperand(0); -  Value *Alignment = CI->getArgOperand(1); -  Value *Mask = CI->getArgOperand(2); -  Value *Src0 = CI->getArgOperand(3); - -  VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - -  assert(VecType && "Unexpected return type of masked load intrinsic"); - -  IRBuilder<> Builder(CI->getContext()); -  Instruction *InsertPt = CI; -  BasicBlock *IfBlock = CI->getParent(); -  BasicBlock *CondBlock = nullptr; -  BasicBlock *PrevIfBlock = CI->getParent(); -  Builder.SetInsertPoint(InsertPt); -  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); - -  Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - -  Value *UndefVal = UndefValue::get(VecType); - -  // The result vector -  Value *VResult = UndefVal; -  unsigned VectorWidth = VecType->getNumElements(); - -  // Shorten the way if the mask is a vector of constants. 
-  bool IsConstMask = isa<ConstantVector>(Mask); - -  if (IsConstMask) { -    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { -      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) -        continue; -      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), -                                                "Ptr" + Twine(Idx)); -      LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, -                                                 "Load" + Twine(Idx)); -      VResult = Builder.CreateInsertElement(VResult, Load, -                                            Builder.getInt32(Idx), -                                            "Res" + Twine(Idx)); -    } -    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); -    CI->replaceAllUsesWith(NewI); -    CI->eraseFromParent(); -    return; -  } - -  PHINode *Phi = nullptr; -  Value *PrevPhi = UndefVal; - -  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - -    // Fill the "else" block, created in the previous iteration -    // -    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1 -    //  %ToLoad1 = icmp eq i1 %Mask1, true -    //  br i1 %ToLoad1, label %cond.load, label %else -    // -    if (Idx > 0) { -      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); -      Phi->addIncoming(VResult, CondBlock); -      Phi->addIncoming(PrevPhi, PrevIfBlock); -      PrevPhi = Phi; -      VResult = Phi; -    } - -    Value *Predicate = Builder.CreateExtractElement(Mask, -                                                    Builder.getInt32(Idx), -                                                    "Mask" + Twine(Idx)); -    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, -                                    ConstantInt::get(Predicate->getType(), 1), -                                    "ToLoad" + Twine(Idx)); - -    // Create "cond" block -    // -    //  %EltAddr = getelementptr i32* %1, i32 0 -    //  %Elt = load i32* %EltAddr -    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx -    // -    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); -    Builder.SetInsertPoint(InsertPt); - -    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), -                                              "Ptr" + Twine(Idx)); -    LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, -                                               "Load" + Twine(Idx)); -    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), -                                          "Res" + Twine(Idx)); - -    // Create "else" block, fill it in the next iteration -    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); -    Builder.SetInsertPoint(InsertPt); -    Instruction *OldBr = IfBlock->getTerminator(); -    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); -    OldBr->eraseFromParent(); -    PrevIfBlock = IfBlock; -    IfBlock = NewIfBlock; -  } - -  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); -  Phi->addIncoming(VResult, CondBlock); -  Phi->addIncoming(PrevPhi, PrevIfBlock); -  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); -  CI->replaceAllUsesWith(NewI); -  CI->eraseFromParent(); -} - -// Translate a masked scatter intrinsic, like -// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, -//                                  <16 x i1> %Mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set. 
-// -// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> % Mask, i32 0 -// % ToStore0 = icmp eq i1 % Mask0, true -// br i1 %ToStore0, label %cond.store, label %else -// -// cond.store: -// % Elt0 = extractelement <16 x i32> %Src, i32 0 -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// store i32 %Elt0, i32* % Ptr0, align 4 -// br label %else -// -// else: -// % Mask1 = extractelement <16 x i1> % Mask, i32 1 -// % ToStore1 = icmp eq i1 % Mask1, true -// br i1 % ToStore1, label %cond.store1, label %else2 -// -// cond.store1: -// % Elt1 = extractelement <16 x i32> %Src, i32 1 -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// store i32 % Elt1, i32* % Ptr1, align 4 -// br label %else2 -//   . . . -static void scalarizeMaskedScatter(CallInst *CI) { -  Value *Src = CI->getArgOperand(0); -  Value *Ptrs = CI->getArgOperand(1); -  Value *Alignment = CI->getArgOperand(2); -  Value *Mask = CI->getArgOperand(3); - -  assert(isa<VectorType>(Src->getType()) && -         "Unexpected data type in masked scatter intrinsic"); -  assert(isa<VectorType>(Ptrs->getType()) && -         isa<PointerType>(Ptrs->getType()->getVectorElementType()) && -         "Vector of pointers is expected in masked scatter intrinsic"); - -  IRBuilder<> Builder(CI->getContext()); -  Instruction *InsertPt = CI; -  BasicBlock *IfBlock = CI->getParent(); -  Builder.SetInsertPoint(InsertPt); -  Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - -  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); -  unsigned VectorWidth = Src->getType()->getVectorNumElements(); - -  // Shorten the way if the mask is a vector of constants. -  bool IsConstMask = isa<ConstantVector>(Mask); - -  if (IsConstMask) { -    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { -      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) -        continue; -      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), -                                                   "Elt" + Twine(Idx)); -      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), -                                                "Ptr" + Twine(Idx)); -      Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); -    } -    CI->eraseFromParent(); -    return; -  } -  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { -    // Fill the "else" block, created in the previous iteration -    // -    //  % Mask1 = extractelement <16 x i1> % Mask, i32 Idx -    //  % ToStore = icmp eq i1 % Mask1, true -    //  br i1 % ToStore, label %cond.store, label %else -    // -    Value *Predicate = Builder.CreateExtractElement(Mask, -                                                    Builder.getInt32(Idx), -                                                    "Mask" + Twine(Idx)); -    Value *Cmp = -       Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, -                          ConstantInt::get(Predicate->getType(), 1), -                          "ToStore" + Twine(Idx)); - -    // Create "cond" block -    // -    //  % Elt1 = extractelement <16 x i32> %Src, i32 1 -    //  % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -    //  %store i32 % Elt1, i32* % Ptr1 -    // -    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); -    Builder.SetInsertPoint(InsertPt); - -    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), -                                                 "Elt" + Twine(Idx)); -    Value *Ptr = Builder.CreateExtractElement(Ptrs, 
Builder.getInt32(Idx), -                                              "Ptr" + Twine(Idx)); -    Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - -    // Create "else" block, fill it in the next iteration -    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); -    Builder.SetInsertPoint(InsertPt); -    Instruction *OldBr = IfBlock->getTerminator(); -    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); -    OldBr->eraseFromParent(); -    IfBlock = NewIfBlock; -  } -  CI->eraseFromParent(); -} -  /// If counting leading or trailing zeros is an expensive operation and a zero  /// input is defined, add a check for zero to avoid calling the intrinsic.  /// @@ -1870,7 +1641,635 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,    return true;  } -bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. +class MemCmpExpansion { +  struct ResultBlock { +    BasicBlock *BB; +    PHINode *PhiSrc1; +    PHINode *PhiSrc2; +    ResultBlock(); +  }; + +  CallInst *CI; +  ResultBlock ResBlock; +  unsigned MaxLoadSize; +  unsigned NumBlocks; +  unsigned NumBlocksNonOneByte; +  unsigned NumLoadsPerBlock; +  std::vector<BasicBlock *> LoadCmpBlocks; +  BasicBlock *EndBlock; +  PHINode *PhiRes; +  bool IsUsedForZeroCmp; +  const DataLayout &DL; + +  unsigned calculateNumBlocks(unsigned Size); +  void createLoadCmpBlocks(); +  void createResultBlock(); +  void setupResultBlockPHINodes(); +  void setupEndBlockPHINodes(); +  void emitLoadCompareBlock(unsigned Index, unsigned LoadSize, +                            unsigned GEPIndex); +  Value *getCompareLoadPairs(unsigned Index, unsigned Size, +                             unsigned &NumBytesProcessed, IRBuilder<> &Builder); +  void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size, +                                         unsigned &NumBytesProcessed); +  void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex); +  void emitMemCmpResultBlock(); +  Value *getMemCmpExpansionZeroCase(unsigned Size); +  Value *getMemCmpEqZeroOneBlock(unsigned Size); +  unsigned getLoadSize(unsigned Size); +  unsigned getNumLoads(unsigned Size); + +public: +  MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, +                  unsigned NumLoadsPerBlock, const DataLayout &DL); +  Value *getMemCmpExpansion(uint64_t Size); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() +    : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, +                                 unsigned MaxLoadSize, unsigned LoadsPerBlock, +                                 const DataLayout &TheDataLayout) +    : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock), +      DL(TheDataLayout) { + +  // A memcmp with zero-comparison with only one block of load and compare does +  // not need to set up any extra blocks. 
This case could be handled in the DAG, +  // but since we have all of the machinery to flexibly expand any memcpy here, +  // we choose to handle this case too to avoid fragmented lowering. +  IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); +  NumBlocks = calculateNumBlocks(Size); +  if (!IsUsedForZeroCmp || NumBlocks != 1) { +    BasicBlock *StartBlock = CI->getParent(); +    EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); +    setupEndBlockPHINodes(); +    createResultBlock(); + +    // If return value of memcmp is not used in a zero equality, we need to +    // calculate which source was larger. The calculation requires the +    // two loaded source values of each load compare block. +    // These will be saved in the phi nodes created by setupResultBlockPHINodes. +    if (!IsUsedForZeroCmp) +      setupResultBlockPHINodes(); + +    // Create the number of required load compare basic blocks. +    createLoadCmpBlocks(); + +    // Update the terminator added by splitBasicBlock to branch to the first +    // LoadCmpBlock. +    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); +  } + +  IRBuilder<> Builder(CI->getContext()); +  Builder.SetCurrentDebugLocation(CI->getDebugLoc()); +} + +void MemCmpExpansion::createLoadCmpBlocks() { +  for (unsigned i = 0; i < NumBlocks; i++) { +    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", +                                        EndBlock->getParent(), EndBlock); +    LoadCmpBlocks.push_back(BB); +  } +} + +void MemCmpExpansion::createResultBlock() { +  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", +                                   EndBlock->getParent(), EndBlock); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, +                                               unsigned GEPIndex) { +  IRBuilder<> Builder(CI->getContext()); + +  Value *Source1 = CI->getArgOperand(0); +  Value *Source2 = CI->getArgOperand(1); + +  Builder.SetInsertPoint(LoadCmpBlocks[Index]); +  Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); +  // Cast source to LoadSizeType*. +  if (Source1->getType() != LoadSizeType) +    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); +  if (Source2->getType() != LoadSizeType) +    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + +  // Get the base address using the GEPIndex. +  if (GEPIndex != 0) { +    Source1 = Builder.CreateGEP(LoadSizeType, Source1, +                                ConstantInt::get(LoadSizeType, GEPIndex)); +    Source2 = Builder.CreateGEP(LoadSizeType, Source2, +                                ConstantInt::get(LoadSizeType, GEPIndex)); +  } + +  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); +  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + +  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); +  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); +  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + +  PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + +  if (Index < (LoadCmpBlocks.size() - 1)) { +    // Early exit branch if difference found to EndBlock. 
Otherwise, continue to +    // next LoadCmpBlock, +    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, +                                    ConstantInt::get(Diff->getType(), 0)); +    BranchInst *CmpBr = +        BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); +    Builder.Insert(CmpBr); +  } else { +    // The last block has an unconditional branch to EndBlock. +    BranchInst *CmpBr = BranchInst::Create(EndBlock); +    Builder.Insert(CmpBr); +  } +} + +unsigned MemCmpExpansion::getNumLoads(unsigned Size) { +  return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); +} + +unsigned MemCmpExpansion::getLoadSize(unsigned Size) { +  return MinAlign(PowerOf2Floor(Size), MaxLoadSize); +} + +/// Generate an equality comparison for one or more pairs of loaded values. +/// This is used in the case where the memcmp() call is compared equal or not +/// equal to zero. +Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, +                                            unsigned &NumBytesProcessed, +                                            IRBuilder<> &Builder) { +  std::vector<Value *> XorList, OrList; +  Value *Diff; + +  unsigned RemainingBytes = Size - NumBytesProcessed; +  unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); +  unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); + +  // For a single-block expansion, start inserting before the memcmp call. +  if (LoadCmpBlocks.empty()) +    Builder.SetInsertPoint(CI); +  else +    Builder.SetInsertPoint(LoadCmpBlocks[Index]); + +  Value *Cmp = nullptr; +  for (unsigned i = 0; i < NumLoads; ++i) { +    unsigned LoadSize = getLoadSize(RemainingBytes); +    unsigned GEPIndex = NumBytesProcessed / LoadSize; +    NumBytesProcessed += LoadSize; +    RemainingBytes -= LoadSize; + +    Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); +    Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); +    assert(LoadSize <= MaxLoadSize && "Unexpected load type"); + +    Value *Source1 = CI->getArgOperand(0); +    Value *Source2 = CI->getArgOperand(1); + +    // Cast source to LoadSizeType*. +    if (Source1->getType() != LoadSizeType) +      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); +    if (Source2->getType() != LoadSizeType) +      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + +    // Get the base address using the GEPIndex. +    if (GEPIndex != 0) { +      Source1 = Builder.CreateGEP(LoadSizeType, Source1, +                                  ConstantInt::get(LoadSizeType, GEPIndex)); +      Source2 = Builder.CreateGEP(LoadSizeType, Source2, +                                  ConstantInt::get(LoadSizeType, GEPIndex)); +    } + +    // Get a constant or load a value for each source address. 
+    Value *LoadSrc1 = nullptr; +    if (auto *Source1C = dyn_cast<Constant>(Source1)) +      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); +    if (!LoadSrc1) +      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + +    Value *LoadSrc2 = nullptr; +    if (auto *Source2C = dyn_cast<Constant>(Source2)) +      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); +    if (!LoadSrc2) +      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + +    if (NumLoads != 1) { +      if (LoadSizeType != MaxLoadType) { +        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); +        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); +      } +      // If we have multiple loads per block, we need to generate a composite +      // comparison using xor+or. +      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); +      Diff = Builder.CreateZExt(Diff, MaxLoadType); +      XorList.push_back(Diff); +    } else { +      // If there's only one load per block, we just compare the loaded values. +      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); +    } +  } + +  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { +    std::vector<Value *> OutList; +    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { +      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); +      OutList.push_back(Or); +    } +    if (InList.size() % 2 != 0) +      OutList.push_back(InList.back()); +    return OutList; +  }; + +  if (!Cmp) { +    // Pairwise OR the XOR results. +    OrList = pairWiseOr(XorList); + +    // Pairwise OR the OR results until one result left. +    while (OrList.size() != 1) { +      OrList = pairWiseOr(OrList); +    } +    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); +  } + +  return Cmp; +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( +    unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { +  IRBuilder<> Builder(CI->getContext()); +  Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed, Builder); + +  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) +                           ? EndBlock +                           : LoadCmpBlocks[Index + 1]; +  // Early exit branch if difference found to ResultBlock. Otherwise, +  // continue to next LoadCmpBlock or EndBlock. +  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); +  Builder.Insert(CmpBr); + +  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 +  // since early exit to ResultBlock was not taken (no difference was found in +  // any of the bytes). +  if (Index == LoadCmpBlocks.size() - 1) { +    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); +    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); +  } +} + +// This function creates the IR intructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does a subtract to see if there was +// a difference in the loaded values. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was +// larger. Otherwise, it falls through to the either the next LoadCmpBlock or +// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with +// a special case through emitLoadCompareByteBlock. The special handling can +// simply subtract the loaded values and add it to the result phi node. 
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize, +                                           unsigned GEPIndex) { +  if (LoadSize == 1) { +    MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); +    return; +  } + +  IRBuilder<> Builder(CI->getContext()); + +  Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); +  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); +  assert(LoadSize <= MaxLoadSize && "Unexpected load type"); + +  Value *Source1 = CI->getArgOperand(0); +  Value *Source2 = CI->getArgOperand(1); + +  Builder.SetInsertPoint(LoadCmpBlocks[Index]); +  // Cast source to LoadSizeType*. +  if (Source1->getType() != LoadSizeType) +    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); +  if (Source2->getType() != LoadSizeType) +    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + +  // Get the base address using the GEPIndex. +  if (GEPIndex != 0) { +    Source1 = Builder.CreateGEP(LoadSizeType, Source1, +                                ConstantInt::get(LoadSizeType, GEPIndex)); +    Source2 = Builder.CreateGEP(LoadSizeType, Source2, +                                ConstantInt::get(LoadSizeType, GEPIndex)); +  } + +  // Load LoadSizeType from the base address. +  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); +  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + +  if (DL.isLittleEndian()) { +    Function *F = LoadCmpBlocks[Index]->getParent(); + +    Function *Bswap = Intrinsic::getDeclaration(F->getParent(), +                                                Intrinsic::bswap, LoadSizeType); +    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); +    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); +  } + +  if (LoadSizeType != MaxLoadType) { +    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); +    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); +  } + +  // Add the loaded values to the phi nodes for calculating memcmp result only +  // if result is not used in a zero equality. +  if (!IsUsedForZeroCmp) { +    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); +    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); +  } + +  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + +  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, +                                  ConstantInt::get(Diff->getType(), 0)); +  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) +                           ? EndBlock +                           : LoadCmpBlocks[Index + 1]; +  // Early exit branch if difference found to ResultBlock. Otherwise, continue +  // to next LoadCmpBlock or EndBlock. +  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); +  Builder.Insert(CmpBr); + +  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 +  // since early exit to ResultBlock was not taken (no difference was found in +  // any of the bytes). +  if (Index == LoadCmpBlocks.size() - 1) { +    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); +    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); +  } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. 
+void MemCmpExpansion::emitMemCmpResultBlock() { +  IRBuilder<> Builder(CI->getContext()); + +  // Special case: if memcmp result is used in a zero equality, result does not +  // need to be calculated and can simply return 1. +  if (IsUsedForZeroCmp) { +    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); +    Builder.SetInsertPoint(ResBlock.BB, InsertPt); +    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); +    PhiRes->addIncoming(Res, ResBlock.BB); +    BranchInst *NewBr = BranchInst::Create(EndBlock); +    Builder.Insert(NewBr); +    return; +  } +  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); +  Builder.SetInsertPoint(ResBlock.BB, InsertPt); + +  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, +                                  ResBlock.PhiSrc2); + +  Value *Res = +      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), +                           ConstantInt::get(Builder.getInt32Ty(), 1)); + +  BranchInst *NewBr = BranchInst::Create(EndBlock); +  Builder.Insert(NewBr); +  PhiRes->addIncoming(Res, ResBlock.BB); +} + +unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) { +  unsigned NumBlocks = 0; +  bool HaveOneByteLoad = false; +  unsigned RemainingSize = Size; +  unsigned LoadSize = MaxLoadSize; +  while (RemainingSize) { +    if (LoadSize == 1) +      HaveOneByteLoad = true; +    NumBlocks += RemainingSize / LoadSize; +    RemainingSize = RemainingSize % LoadSize; +    LoadSize = LoadSize / 2; +  } +  NumBlocksNonOneByte = HaveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + +  if (IsUsedForZeroCmp) +    NumBlocks = NumBlocks / NumLoadsPerBlock + +                (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); + +  return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { +  IRBuilder<> Builder(CI->getContext()); +  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); +  Builder.SetInsertPoint(ResBlock.BB); +  ResBlock.PhiSrc1 = +      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); +  ResBlock.PhiSrc2 = +      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { +  IRBuilder<> Builder(CI->getContext()); + +  Builder.SetInsertPoint(&EndBlock->front()); +  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) { +  unsigned NumBytesProcessed = 0; +  // This loop populates each of the LoadCmpBlocks with the IR sequence to +  // handle multiple loads per block. +  for (unsigned i = 0; i < NumBlocks; ++i) +    emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); + +  emitMemCmpResultBlock(); +  return PhiRes; +} + +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { +  unsigned NumBytesProcessed = 0; +  IRBuilder<> Builder(CI->getContext()); +  Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); +  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) { +  if (IsUsedForZeroCmp) +    return NumBlocks == 1 ? 
getMemCmpEqZeroOneBlock(Size) : +                            getMemCmpExpansionZeroCase(Size); + +  // This loop calls emitLoadCompareBlock for comparing Size bytes of the two +  // memcmp sources. It starts with loading using the maximum load size set by +  // the target. It processes any remaining bytes using a load size which is the +  // next smallest power of 2. +  unsigned LoadSize = MaxLoadSize; +  unsigned NumBytesToBeProcessed = Size; +  unsigned Index = 0; +  while (NumBytesToBeProcessed) { +    // Calculate how many blocks we can create with the current load size. +    unsigned NumBlocks = NumBytesToBeProcessed / LoadSize; +    unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; +    NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + +    // For each NumBlocks, populate the instruction sequence for loading and +    // comparing LoadSize bytes. +    while (NumBlocks--) { +      emitLoadCompareBlock(Index, LoadSize, GEPIndex); +      Index++; +      GEPIndex++; +    } +    // Get the next LoadSize to use. +    LoadSize = LoadSize / 2; +  } + +  emitMemCmpResultBlock(); +  return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +///  %0 = bitcast i32* %buffer2 to i8* +///  %1 = bitcast i32* %buffer1 to i8* +///  %2 = bitcast i8* %1 to i64* +///  %3 = bitcast i8* %0 to i64* +///  %4 = load i64, i64* %2 +///  %5 = load i64, i64* %3 +///  %6 = call i64 @llvm.bswap.i64(i64 %4) +///  %7 = call i64 @llvm.bswap.i64(i64 %5) +///  %8 = sub i64 %6, %7 +///  %9 = icmp ne i64 %8, 0 +///  br i1 %9, label %res_block, label %loadbb1 +/// res_block:                                        ; preds = %loadbb2, +/// %loadbb1, %loadbb +///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +///  %10 = icmp ult i64 %phi.src1, %phi.src2 +///  %11 = select i1 %10, i32 -1, i32 1 +///  br label %endblock +/// loadbb1:                                          ; preds = %loadbb +///  %12 = bitcast i32* %buffer2 to i8* +///  %13 = bitcast i32* %buffer1 to i8* +///  %14 = bitcast i8* %13 to i32* +///  %15 = bitcast i8* %12 to i32* +///  %16 = getelementptr i32, i32* %14, i32 2 +///  %17 = getelementptr i32, i32* %15, i32 2 +///  %18 = load i32, i32* %16 +///  %19 = load i32, i32* %17 +///  %20 = call i32 @llvm.bswap.i32(i32 %18) +///  %21 = call i32 @llvm.bswap.i32(i32 %19) +///  %22 = zext i32 %20 to i64 +///  %23 = zext i32 %21 to i64 +///  %24 = sub i64 %22, %23 +///  %25 = icmp ne i64 %24, 0 +///  br i1 %25, label %res_block, label %loadbb2 +/// loadbb2:                                          ; preds = %loadbb1 +///  %26 = bitcast i32* %buffer2 to i8* +///  %27 = bitcast i32* %buffer1 to i8* +///  %28 = bitcast i8* %27 to i16* +///  %29 = bitcast i8* %26 to i16* +///  %30 = getelementptr i16, i16* %28, i16 6 +///  %31 = getelementptr i16, i16* %29, i16 6 +///  %32 = load i16, i16* %30 +///  %33 = load i16, i16* %31 +///  %34 = call i16 @llvm.bswap.i16(i16 %32) +///  %35 = call i16 @llvm.bswap.i16(i16 %33) +///  %36 = zext i16 %34 to i64 +///  %37 = zext i16 %35 to i64 +///  %38 = sub i64 %36, %37 +///  
%39 = icmp ne i64 %38, 0 +///  br i1 %39, label %res_block, label %loadbb3 +/// loadbb3:                                          ; preds = %loadbb2 +///  %40 = bitcast i32* %buffer2 to i8* +///  %41 = bitcast i32* %buffer1 to i8* +///  %42 = getelementptr i8, i8* %41, i8 14 +///  %43 = getelementptr i8, i8* %40, i8 14 +///  %44 = load i8, i8* %42 +///  %45 = load i8, i8* %43 +///  %46 = zext i8 %44 to i32 +///  %47 = zext i8 %45 to i32 +///  %48 = sub i32 %46, %47 +///  br label %endblock +/// endblock:                                         ; preds = %res_block, +/// %loadbb3 +///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] +///  ret i32 %phi.res +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, +                         const TargetLowering *TLI, const DataLayout *DL) { +  NumMemCmpCalls++; +  IRBuilder<> Builder(CI->getContext()); + +  // TTI call to check if target would like to expand memcmp. Also, get the +  // MaxLoadSize. +  unsigned MaxLoadSize; +  if (!TTI->expandMemCmp(CI, MaxLoadSize)) +    return false; + +  // Early exit from expansion if -Oz. +  if (CI->getFunction()->optForMinSize()) +    return false; + +  // Early exit from expansion if size is not a constant. +  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); +  if (!SizeCast) { +    NumMemCmpNotConstant++; +    return false; +  } + +  // Early exit from expansion if size greater than max bytes to load. +  uint64_t SizeVal = SizeCast->getZExtValue(); +  unsigned NumLoads = 0; +  unsigned RemainingSize = SizeVal; +  unsigned LoadSize = MaxLoadSize; +  while (RemainingSize) { +    NumLoads += RemainingSize / LoadSize; +    RemainingSize = RemainingSize % LoadSize; +    LoadSize = LoadSize / 2; +  } + +  if (NumLoads > TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) { +    NumMemCmpGreaterThanMax++; +    return false; +  } + +  NumMemCmpInlined++; + +  // MemCmpHelper object creates and sets up basic blocks required for +  // expanding memcmp with size SizeVal. +  unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock; +  MemCmpExpansion MemCmpHelper(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL); + +  Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal); + +  // Replace call with result of expansion and erase call. +  CI->replaceAllUsesWith(Res); +  CI->eraseFromParent(); + +  return true; +} + +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {    BasicBlock *BB = CI->getParent();    // Lower inline assembly if we can. @@ -1955,10 +2354,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {        ConstantInt *RetVal =            lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);        // Substituting this can cause recursive simplifications, which can -      // invalidate our iterator.  Use a WeakVH to hold onto it in case this +      // invalidate our iterator.  Use a WeakTrackingVH to hold onto it in case +      // this        // happens.        
Value *CurValue = &*CurInstIterator; -      WeakVH IterHandle(CurValue); +      WeakTrackingVH IterHandle(CurValue);        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); @@ -1970,39 +2370,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {        }        return true;      } -    case Intrinsic::masked_load: { -      // Scalarize unsupported vector masked load -      if (!TTI->isLegalMaskedLoad(CI->getType())) { -        scalarizeMaskedLoad(CI); -        ModifiedDT = true; -        return true; -      } -      return false; -    } -    case Intrinsic::masked_store: { -      if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { -        scalarizeMaskedStore(CI); -        ModifiedDT = true; -        return true; -      } -      return false; -    } -    case Intrinsic::masked_gather: { -      if (!TTI->isLegalMaskedGather(CI->getType())) { -        scalarizeMaskedGather(CI); -        ModifiedDT = true; -        return true; -      } -      return false; -    } -    case Intrinsic::masked_scatter: { -      if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { -        scalarizeMaskedScatter(CI); -        ModifiedDT = true; -        return true; -      } -      return false; -    }      case Intrinsic::aarch64_stlxr:      case Intrinsic::aarch64_stxr: {        ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); @@ -2028,16 +2395,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {      }      if (TLI) { -      // Unknown address space. -      // TODO: Target hook to pick which address space the intrinsic cares -      // about? -      unsigned AddrSpace = ~0u;        SmallVector<Value*, 2> PtrOps;        Type *AccessTy; -      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) -        while (!PtrOps.empty()) -          if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) +      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) +        while (!PtrOps.empty()) { +          Value *PtrVal = PtrOps.pop_back_val(); +          unsigned AS = PtrVal->getType()->getPointerAddressSpace(); +          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))              return true; +        }      }    } @@ -2054,6 +2420,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {      CI->eraseFromParent();      return true;    } + +  LibFunc Func; +  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && +      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { +    ModifiedDT = true; +    return true; +  }    return false;  } @@ -2168,11 +2541,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {      // Conservatively require the attributes of the call to match those of the      // return. Ignore noalias because it doesn't affect the call sequence. -    AttributeSet CalleeAttrs = CS.getAttributes(); -    if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). -          removeAttribute(Attribute::NoAlias) != -        AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). 
-          removeAttribute(Attribute::NoAlias)) +    AttributeList CalleeAttrs = CS.getAttributes(); +    if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) +            .removeAttribute(Attribute::NoAlias) != +        AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex) +            .removeAttribute(Attribute::NoAlias))        continue;      // Make sure the call instruction is followed by an unconditional branch to @@ -2561,25 +2934,30 @@ class TypePromotionTransaction {      OperandsHider Hider;      /// Keep track of the uses replaced, if any.      UsesReplacer *Replacer; +    /// Keep track of instructions removed. +    SetOfInstrs &RemovedInsts;    public:      /// \brief Remove all reference of \p Inst and optinally replace all its      /// uses with New. +    /// \p RemovedInsts Keep track of the instructions removed by this Action.      /// \pre If !Inst->use_empty(), then New != nullptr -    InstructionRemover(Instruction *Inst, Value *New = nullptr) +    InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts, +                       Value *New = nullptr)          : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), -          Replacer(nullptr) { +          Replacer(nullptr), RemovedInsts(RemovedInsts) {        if (New)          Replacer = new UsesReplacer(Inst, New);        DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); +      RemovedInsts.insert(Inst); +      /// The instructions removed here will be freed after completing +      /// optimizeBlock() for all blocks as we need to keep track of the +      /// removed instructions during promotion.        Inst->removeFromParent();      }      ~InstructionRemover() override { delete Replacer; } -    /// \brief Really remove the instruction. -    void commit() override { delete Inst; } -      /// \brief Resurrect the instruction and reassign it to the proper uses if      /// new value was provided when build this action.      void undo() override { @@ -2588,6 +2966,7 @@ class TypePromotionTransaction {        if (Replacer)          Replacer->undo();        Hider.undo(); +      RemovedInsts.erase(Inst);      }    }; @@ -2596,6 +2975,10 @@ public:    /// The restoration point is a pointer to an action instead of an iterator    /// because the iterator may be invalidated but not the pointer.    typedef const TypePromotionAction *ConstRestorationPt; + +  TypePromotionTransaction(SetOfInstrs &RemovedInsts) +      : RemovedInsts(RemovedInsts) {} +    /// Advocate every changes made in that transaction.    void commit();    /// Undo all the changes made after the given point. @@ -2627,6 +3010,7 @@ private:    /// The ordered list of actions made so far.    
SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;    typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt; +  SetOfInstrs &RemovedInsts;  };  void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, @@ -2638,7 +3022,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,  void TypePromotionTransaction::eraseInstruction(Instruction *Inst,                                                  Value *NewVal) {    Actions.push_back( -      make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal)); +      make_unique<TypePromotionTransaction::InstructionRemover>(Inst, +                                                         RemovedInsts, NewVal));  }  void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, @@ -2705,8 +3090,8 @@ void TypePromotionTransaction::rollback(  /// This encapsulates the logic for matching the target-legal addressing modes.  class AddressingModeMatcher {    SmallVectorImpl<Instruction*> &AddrModeInsts; -  const TargetMachine &TM;    const TargetLowering &TLI; +  const TargetRegisterInfo &TRI;    const DataLayout &DL;    /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and @@ -2731,14 +3116,14 @@ class AddressingModeMatcher {    bool IgnoreProfitability;    AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, -                        const TargetMachine &TM, Type *AT, unsigned AS, +                        const TargetLowering &TLI, +                        const TargetRegisterInfo &TRI, +                        Type *AT, unsigned AS,                          Instruction *MI, ExtAddrMode &AM,                          const SetOfInstrs &InsertedInsts,                          InstrToOrigTy &PromotedInsts,                          TypePromotionTransaction &TPT) -      : AddrModeInsts(AMI), TM(TM), -        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent()) -                 ->getTargetLowering()), +      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),          DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),          MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),          PromotedInsts(PromotedInsts), TPT(TPT) { @@ -2756,13 +3141,15 @@ public:    static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,                             Instruction *MemoryInst,                             SmallVectorImpl<Instruction*> &AddrModeInsts, -                           const TargetMachine &TM, +                           const TargetLowering &TLI, +                           const TargetRegisterInfo &TRI,                             const SetOfInstrs &InsertedInsts,                             InstrToOrigTy &PromotedInsts,                             TypePromotionTransaction &TPT) {      ExtAddrMode Result; -    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, +    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, +                                         AccessTy, AS,                                           MemoryInst, Result, InsertedInsts,                                           PromotedInsts, TPT).matchAddr(V, 0);      (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -3583,18 +3970,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {  /// Check to see if all uses of OpVal by the specified inline asm call are due  /// to memory operands. If so, return true, otherwise return false.  
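The TypePromotionTransaction pieces above record every IR mutation as an undoable action (instruction removal now also lands in RemovedInsts so the dead instructions can be freed after optimizeBlock has run), and callers either commit or roll back to a previously taken restoration point. Ahead of the IsOperandAMemoryOperand definition that follows, a minimal standalone sketch of that transaction pattern; the class and member names are illustrative, not LLVM's.

#include <functional>
#include <memory>
#include <utility>
#include <vector>

// Minimal undo-log transaction: mutations are recorded with an undo callback;
// rollback(Point) undoes everything recorded after Point, commit() forgets
// the undo information.
class MiniTransaction {
  struct Action {
    std::function<void()> Undo;
  };
  std::vector<std::unique_ptr<Action>> Actions;

public:
  // A restoration point is a pointer to an action, not an iterator, so it
  // stays valid while further actions are appended.
  using RestorationPt = const Action *;

  RestorationPt getRestorationPoint() const {
    return Actions.empty() ? nullptr : Actions.back().get();
  }

  void record(std::function<void()> Undo) {
    Actions.push_back(std::unique_ptr<Action>(new Action{std::move(Undo)}));
  }

  void rollback(RestorationPt Point) {
    while (!Actions.empty() && Actions.back().get() != Point) {
      Actions.back()->Undo();
      Actions.pop_back();
    }
  }

  void commit() { Actions.clear(); }
};

A caller takes a restoration point before a speculative rewrite and then either commits or rolls back, which is how the addressing-mode matching and extension-promotion code in this file drives TypePromotionTransaction.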
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, -                                    const TargetMachine &TM) { -  const Function *F = CI->getParent()->getParent(); -  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering(); -  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo(); +                                    const TargetLowering &TLI, +                                    const TargetRegisterInfo &TRI) { +  const Function *F = CI->getFunction();    TargetLowering::AsmOperandInfoVector TargetConstraints = -      TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI, +      TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,                              ImmutableCallSite(CI)); +    for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {      TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];      // Compute the constraint code and ConstraintType to use. -    TLI->ComputeConstraintToUse(OpInfo, SDValue()); +    TLI.ComputeConstraintToUse(OpInfo, SDValue());      // If this asm operand is our Value*, and if it isn't an indirect memory      // operand, we can't fold it! @@ -3613,7 +4000,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,  static bool FindAllMemoryUses(      Instruction *I,      SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses, -    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) { +    SmallPtrSetImpl<Instruction *> &ConsideredInsts, +    const TargetLowering &TLI, const TargetRegisterInfo &TRI) {    // If we already considered this instruction, we're done.    if (!ConsideredInsts.insert(I).second)      return false; @@ -3635,11 +4023,28 @@ static bool FindAllMemoryUses(      if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {        unsigned opNo = U.getOperandNo(); -      if (opNo == 0) return true; // Storing addr, not into addr. +      if (opNo != StoreInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr.        MemoryUses.push_back(std::make_pair(SI, opNo));        continue;      } +    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { +      unsigned opNo = U.getOperandNo(); +      if (opNo != AtomicRMWInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr. +      MemoryUses.push_back(std::make_pair(RMW, opNo)); +      continue; +    } + +    if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { +      unsigned opNo = U.getOperandNo(); +      if (opNo != AtomicCmpXchgInst::getPointerOperandIndex()) +        return true; // Storing addr, not into addr. +      MemoryUses.push_back(std::make_pair(CmpX, opNo)); +      continue; +    } +      if (CallInst *CI = dyn_cast<CallInst>(UserI)) {        // If this is a cold call, we can sink the addressing calculation into        // the cold path.  See optimizeCallInst @@ -3650,12 +4055,12 @@ static bool FindAllMemoryUses(        if (!IA) return true;        // If this is a memory operand, we're cool, otherwise bail out. 
-      if (!IsOperandAMemoryOperand(CI, IA, I, TM)) +      if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))          return true;        continue;      } -    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM)) +    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI))        return true;    } @@ -3743,7 +4148,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,    // the use is just a particularly nice way of sinking it.    SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;    SmallPtrSet<Instruction*, 16> ConsideredInsts; -  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM)) +  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))      return false;  // Has a non-memory, non-foldable use!    // Now that we know that all uses of this instruction are part of a chain of @@ -3775,7 +4180,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,      ExtAddrMode Result;      TypePromotionTransaction::ConstRestorationPt LastKnownGood =          TPT.getRestorationPoint(); -    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS, +    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, +                                  AddressAccessTy, AS,                                    MemoryInst, Result, InsertedInsts,                                    PromotedInsts, TPT);      Matcher.IgnoreProfitability = true; @@ -3844,7 +4250,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,    bool IsNumUsesConsensusValid = false;    SmallVector<Instruction*, 16> AddrModeInsts;    ExtAddrMode AddrMode; -  TypePromotionTransaction TPT; +  TypePromotionTransaction TPT(RemovedInsts);    TypePromotionTransaction::ConstRestorationPt LastKnownGood =        TPT.getRestorationPoint();    while (!worklist.empty()) { @@ -3869,7 +4275,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,      // addressing instructions might have.      SmallVector<Instruction*, 16> NewAddrModeInsts;      ExtAddrMode NewAddrMode = AddressingModeMatcher::Match( -      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM, +      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI,        InsertedInsts, PromotedInsts, TPT);      // This check is broken into two cases with very similar code to avoid using @@ -3935,11 +4341,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,      DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "                   << *MemoryInst << "\n");      if (SunkAddr->getType() != Addr->getType()) -      SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); +      SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());    } else if (AddrSinkUsingGEPs ||               (!AddrSinkUsingGEPs.getNumOccurrences() && TM && -              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent()) -                  ->useAA())) { +              SubtargetInfo->useAA())) {      // By default, we use the GEP-based method when AA is used later. This      // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4042,7 +4447,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,            // We need to add this separately from the scale above to help with            // SDAG consecutive load/store merging.            
if (ResultPtr->getType() != I8PtrTy) -            ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); +            ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);            ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");          } @@ -4053,12 +4458,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,          SunkAddr = ResultPtr;        } else {          if (ResultPtr->getType() != I8PtrTy) -          ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); +          ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);          SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");        }        if (SunkAddr->getType() != Addr->getType()) -        SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); +        SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());      }    } else {      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " @@ -4140,9 +4545,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,    // using it.    if (Repl->use_empty()) {      // This can cause recursive deletion, which can invalidate our iterator. -    // Use a WeakVH to hold onto it in case this happens. +    // Use a WeakTrackingVH to hold onto it in case this happens.      Value *CurValue = &*CurInstIterator; -    WeakVH IterHandle(CurValue); +    WeakTrackingVH IterHandle(CurValue);      BasicBlock *BB = CurInstIterator->getParent();      RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); @@ -4164,7 +4569,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {    bool MadeChange = false;    const TargetRegisterInfo *TRI = -      TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo(); +      TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();    TargetLowering::AsmOperandInfoVector TargetConstraints =        TLI->ParseConstraints(*DL, TRI, CS);    unsigned ArgNo = 0; @@ -4185,14 +4590,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {    return MadeChange;  } -/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// \brief Check if all the uses of \p Val are equivalent (or free) zero or  /// sign extensions. -static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { -  assert(!Inst->use_empty() && "Input must have at least one use"); -  const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); +static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) { +  assert(!Val->use_empty() && "Input must have at least one use"); +  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());    bool IsSExt = isa<SExtInst>(FirstUser);    Type *ExtTy = FirstUser->getType(); -  for (const User *U : Inst->users()) { +  for (const User *U : Val->users()) {      const Instruction *UI = cast<Instruction>(U);      if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))        return false; @@ -4202,11 +4607,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {        continue;      // If IsSExt is true, we are in this situation: -    // a = Inst +    // a = Val      // b = sext ty1 a to ty2      // c = sext ty1 a to ty3      // Assuming ty2 is shorter than ty3, this could be turned into: -    // a = Inst +    // a = Val      // b = sext ty1 a to ty2      // c = sext ty2 b to ty3      // However, the last sext is not free. 
@@ -4233,51 +4638,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {    return true;  } -/// \brief Try to form ExtLd by promoting \p Exts until they reach a -/// load instruction. -/// If an ext(load) can be formed, it is returned via \p LI for the load -/// and \p Inst for the extension. -/// Otherwise LI == nullptr and Inst == nullptr. -/// When some promotion happened, \p TPT contains the proper state to -/// revert them. -/// -/// \return true when promoting was necessary to expose the ext(load) -/// opportunity, false otherwise. +/// \brief Try to speculatively promote extensions in \p Exts and continue +/// promoting through newly promoted operands recursively as far as doing so is +/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts. +/// When some promotion happened, \p TPT contains the proper state to revert +/// them.  /// -/// Example: -/// \code -/// %ld = load i32* %addr -/// %add = add nuw i32 %ld, 4 -/// %zext = zext i32 %add to i64 -/// \endcode -/// => -/// \code -/// %ld = load i32* %addr -/// %zext = zext i32 %ld to i64 -/// %add = add nuw i64 %zext, 4 -/// \encode -/// Thanks to the promotion, we can match zext(load i32*) to i64. -bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT, -                                    LoadInst *&LI, Instruction *&Inst, -                                    const SmallVectorImpl<Instruction *> &Exts, -                                    unsigned CreatedInstsCost = 0) { -  // Iterate over all the extensions to see if one form an ext(load). +/// \return true if some promotion happened, false otherwise. +bool CodeGenPrepare::tryToPromoteExts( +    TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts, +    SmallVectorImpl<Instruction *> &ProfitablyMovedExts, +    unsigned CreatedInstsCost) { +  bool Promoted = false; + +  // Iterate over all the extensions to try to promote them.    for (auto I : Exts) { -    // Check if we directly have ext(load). -    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) { -      Inst = I; -      // No promotion happened here. -      return false; +    // Early check if we directly have ext(load). +    if (isa<LoadInst>(I->getOperand(0))) { +      ProfitablyMovedExts.push_back(I); +      continue;      } -    // Check whether or not we want to do any promotion. + +    // Check whether or not we want to do any promotion.  The reason we have +    // this check inside the for loop is to catch the case where an extension +    // is directly fed by a load because in such case the extension can be moved +    // up without any promotion on its operands.      if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) -      continue; +      return false; +      // Get the action to perform the promotion. -    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( -        I, InsertedInsts, *TLI, PromotedInsts); +    TypePromotionHelper::Action TPH = +        TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);      // Check if we can promote. -    if (!TPH) +    if (!TPH) { +      // Save the current extension as we cannot move up through its operand. +      ProfitablyMovedExts.push_back(I);        continue; +    } +      // Save the current state.      TypePromotionTransaction::ConstRestorationPt LastKnownGood =          TPT.getRestorationPoint(); @@ -4297,110 +4695,293 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,      // one extension but leave one. 
However, we optimistically keep going,      // because the new extension may be removed too.      long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost; -    TotalCreatedInstsCost -= ExtCost; +    // FIXME: It would be possible to propagate a negative value instead of +    // conservatively ceiling it to 0. +    TotalCreatedInstsCost = +        std::max((long long)0, (TotalCreatedInstsCost - ExtCost));      if (!StressExtLdPromotion &&          (TotalCreatedInstsCost > 1 ||           !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) { -      // The promotion is not profitable, rollback to the previous state. +      // This promotion is not profitable, rollback to the previous state, and +      // save the current extension in ProfitablyMovedExts as the latest +      // speculative promotion turned out to be unprofitable.        TPT.rollback(LastKnownGood); +      ProfitablyMovedExts.push_back(I); +      continue; +    } +    // Continue promoting NewExts as far as doing so is profitable. +    SmallVector<Instruction *, 2> NewlyMovedExts; +    (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); +    bool NewPromoted = false; +    for (auto ExtInst : NewlyMovedExts) { +      Instruction *MovedExt = cast<Instruction>(ExtInst); +      Value *ExtOperand = MovedExt->getOperand(0); +      // If we have reached to a load, we need this extra profitability check +      // as it could potentially be merged into an ext(load). +      if (isa<LoadInst>(ExtOperand) && +          !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || +            (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI)))) +        continue; + +      ProfitablyMovedExts.push_back(MovedExt); +      NewPromoted = true; +    } + +    // If none of speculative promotions for NewExts is profitable, rollback +    // and save the current extension (I) as the last profitable extension. +    if (!NewPromoted) { +      TPT.rollback(LastKnownGood); +      ProfitablyMovedExts.push_back(I);        continue;      }      // The promotion is profitable. -    // Check if it exposes an ext(load). -    (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost); -    if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost || -               // If we have created a new extension, i.e., now we have two -               // extensions. We must make sure one of them is merged with -               // the load, otherwise we may degrade the code quality. -               (LI->hasOneUse() || hasSameExtUse(LI, *TLI)))) -      // Promotion happened. -      return true; -    // If this does not help to expose an ext(load) then, rollback. -    TPT.rollback(LastKnownGood); +    Promoted = true;    } -  // None of the extension can form an ext(load). -  LI = nullptr; -  Inst = nullptr; -  return false; +  return Promoted;  } -/// Move a zext or sext fed by a load into the same basic block as the load, -/// unless conditions are unfavorable. This allows SelectionDAG to fold the -/// extend into the load. -/// \p I[in/out] the extension may be modified during the process if some -/// promotions apply. -/// -bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) { -  // ExtLoad formation infrastructure requires TLI to be effective. -  if (!TLI) -    return false; +/// Merging redundant sexts when one is dominating the other. 
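In tryToPromoteExts above, a speculative promotion is kept only when the net instruction cost, credited with the extension it may remove and conservatively ceiled at zero (the FIXME notes that a negative value could be propagated instead), stays at most one and the promoted instruction is still legal; otherwise the transaction rolls back and the extension is pushed to ProfitablyMovedExts as-is. A tiny standalone restatement of that decision, not from the patch, with IsLegal standing in for isPromotedInstructionLegal:

#include <algorithm>

// Decide whether to keep a speculative promotion.
//   CreatedInstsCost:    cost accumulated by the enclosing promotions.
//   NewCreatedInstsCost: cost of instructions created by this promotion.
//   ExtCost:             credit for the extension the promotion may remove.
static bool keepPromotion(long long CreatedInstsCost,
                          long long NewCreatedInstsCost, long long ExtCost,
                          bool StressExtLdPromotion, bool IsLegal) {
  long long Total = CreatedInstsCost + NewCreatedInstsCost;
  // Conservatively ceil the credited total at zero, as the code above does.
  Total = std::max((long long)0, Total - ExtCost);
  return StressExtLdPromotion || (Total <= 1 && IsLegal);
}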
+bool CodeGenPrepare::mergeSExts(Function &F) { +  DominatorTree DT(F); +  bool Changed = false; +  for (auto &Entry : ValToSExtendedUses) { +    SExts &Insts = Entry.second; +    SExts CurPts; +    for (Instruction *Inst : Insts) { +      if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) || +          Inst->getOperand(0) != Entry.first) +        continue; +      bool inserted = false; +      for (auto &Pt : CurPts) { +        if (DT.dominates(Inst, Pt)) { +          Pt->replaceAllUsesWith(Inst); +          RemovedInsts.insert(Pt); +          Pt->removeFromParent(); +          Pt = Inst; +          inserted = true; +          Changed = true; +          break; +        } +        if (!DT.dominates(Pt, Inst)) +          // Give up if we need to merge in a common dominator as the +          // expermients show it is not profitable. +          continue; +        Inst->replaceAllUsesWith(Pt); +        RemovedInsts.insert(Inst); +        Inst->removeFromParent(); +        inserted = true; +        Changed = true; +        break; +      } +      if (!inserted) +        CurPts.push_back(Inst); +    } +  } +  return Changed; +} -  // Try to promote a chain of computation if it allows to form -  // an extended load. -  TypePromotionTransaction TPT; -  TypePromotionTransaction::ConstRestorationPt LastKnownGood = -    TPT.getRestorationPoint(); -  SmallVector<Instruction *, 1> Exts; -  Exts.push_back(I); -  // Look for a load being extended. -  LoadInst *LI = nullptr; -  Instruction *OldExt = I; -  bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); -  if (!LI || !I) { -    assert(!HasPromoted && !LI && "If we did not match any load instruction " -                                  "the code must remain the same"); -    I = OldExt; -    return false; +/// Return true, if an ext(load) can be formed from an extension in +/// \p MovedExts. +bool CodeGenPrepare::canFormExtLd( +    const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI, +    Instruction *&Inst, bool HasPromoted) { +  for (auto *MovedExtInst : MovedExts) { +    if (isa<LoadInst>(MovedExtInst->getOperand(0))) { +      LI = cast<LoadInst>(MovedExtInst->getOperand(0)); +      Inst = MovedExtInst; +      break; +    }    } +  if (!LI) +    return false;    // If they're already in the same block, there's nothing to do.    // Make the cheap checks first if we did not promote.    // If we promoted, we need to check if it is indeed profitable. -  if (!HasPromoted && LI->getParent() == I->getParent()) +  if (!HasPromoted && LI->getParent() == Inst->getParent())      return false; -  EVT VT = TLI->getValueType(*DL, I->getType()); +  EVT VT = TLI->getValueType(*DL, Inst->getType());    EVT LoadVT = TLI->getValueType(*DL, LI->getType());    // If the load has other users and the truncate is not free, this probably    // isn't worthwhile. -  if (!LI->hasOneUse() && -      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && -      !TLI->isTruncateFree(I->getType(), LI->getType())) { -    I = OldExt; -    TPT.rollback(LastKnownGood); +  if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && +      !TLI->isTruncateFree(Inst->getType(), LI->getType()))      return false; -  }    // Check whether the target supports casts folded into loads.    
unsigned LType; -  if (isa<ZExtInst>(I)) +  if (isa<ZExtInst>(Inst))      LType = ISD::ZEXTLOAD;    else { -    assert(isa<SExtInst>(I) && "Unexpected ext type!"); +    assert(isa<SExtInst>(Inst) && "Unexpected ext type!");      LType = ISD::SEXTLOAD;    } -  if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) { -    I = OldExt; -    TPT.rollback(LastKnownGood); + +  return TLI->isLoadExtLegal(LType, VT, LoadVT); +} + +/// Move a zext or sext fed by a load into the same basic block as the load, +/// unless conditions are unfavorable. This allows SelectionDAG to fold the +/// extend into the load. +/// +/// E.g., +/// \code +/// %ld = load i32* %addr +/// %add = add nuw i32 %ld, 4 +/// %zext = zext i32 %add to i64 +// \endcode +/// => +/// \code +/// %ld = load i32* %addr +/// %zext = zext i32 %ld to i64 +/// %add = add nuw i64 %zext, 4 +/// \encode +/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which +/// allow us to match zext(load i32*) to i64. +/// +/// Also, try to promote the computations used to obtain a sign extended +/// value used into memory accesses. +/// E.g., +/// \code +/// a = add nsw i32 b, 3 +/// d = sext i32 a to i64 +/// e = getelementptr ..., i64 d +/// \endcode +/// => +/// \code +/// f = sext i32 b to i64 +/// a = add nsw i64 f, 3 +/// e = getelementptr ..., i64 a +/// \endcode +/// +/// \p Inst[in/out] the extension may be modified during the process if some +/// promotions apply. +bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { +  // ExtLoad formation and address type promotion infrastructure requires TLI to +  // be effective. +  if (!TLI)      return false; + +  bool AllowPromotionWithoutCommonHeader = false; +  /// See if it is an interesting sext operations for the address type +  /// promotion before trying to promote it, e.g., the ones with the right +  /// type and used in memory accesses. +  bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion( +      *Inst, AllowPromotionWithoutCommonHeader); +  TypePromotionTransaction TPT(RemovedInsts); +  TypePromotionTransaction::ConstRestorationPt LastKnownGood = +      TPT.getRestorationPoint(); +  SmallVector<Instruction *, 1> Exts; +  SmallVector<Instruction *, 2> SpeculativelyMovedExts; +  Exts.push_back(Inst); + +  bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts); + +  // Look for a load being extended. +  LoadInst *LI = nullptr; +  Instruction *ExtFedByLoad; + +  // Try to promote a chain of computation if it allows to form an extended +  // load. +  if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { +    assert(LI && ExtFedByLoad && "Expect a valid load and extension"); +    TPT.commit(); +    // Move the extend into the same block as the load +    ExtFedByLoad->removeFromParent(); +    ExtFedByLoad->insertAfter(LI); +    // CGP does not check if the zext would be speculatively executed when moved +    // to the same basic block as the load. Preserving its original location +    // would pessimize the debugging experience, as well as negatively impact +    // the quality of sample pgo. We don't want to use "line 0" as that has a +    // size cost in the line-table section and logically the zext can be seen as +    // part of the load. Therefore we conservatively reuse the same debug +    // location for the load and the zext. 
+    ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); +    ++NumExtsMoved; +    Inst = ExtFedByLoad; +    return true;    } -  // Move the extend into the same block as the load, so that SelectionDAG -  // can fold it. -  TPT.commit(); -  I->removeFromParent(); -  I->insertAfter(LI); -  // CGP does not check if the zext would be speculatively executed when moved -  // to the same basic block as the load. Preserving its original location would -  // pessimize the debugging experience, as well as negatively impact the  -  // quality of sample pgo. We don't want to use "line 0" as that has a -  // size cost in the line-table section and logically the zext can be seen as -  // part of the load. Therefore we conservatively reuse the same debug location -  // for the load and the zext. -  I->setDebugLoc(LI->getDebugLoc()); -  ++NumExtsMoved; -  return true; +  // Continue promoting SExts if known as considerable depending on targets. +  if (ATPConsiderable && +      performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader, +                                  HasPromoted, TPT, SpeculativelyMovedExts)) +    return true; + +  TPT.rollback(LastKnownGood); +  return false; +} + +// Perform address type promotion if doing so is profitable. +// If AllowPromotionWithoutCommonHeader == false, we should find other sext +// instructions that sign extended the same initial value. However, if +// AllowPromotionWithoutCommonHeader == true, we expect promoting the +// extension is just profitable. +bool CodeGenPrepare::performAddressTypePromotion( +    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, +    bool HasPromoted, TypePromotionTransaction &TPT, +    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) { +  bool Promoted = false; +  SmallPtrSet<Instruction *, 1> UnhandledExts; +  bool AllSeenFirst = true; +  for (auto I : SpeculativelyMovedExts) { +    Value *HeadOfChain = I->getOperand(0); +    DenseMap<Value *, Instruction *>::iterator AlreadySeen = +        SeenChainsForSExt.find(HeadOfChain); +    // If there is an unhandled SExt which has the same header, try to promote +    // it as well. +    if (AlreadySeen != SeenChainsForSExt.end()) { +      if (AlreadySeen->second != nullptr) +        UnhandledExts.insert(AlreadySeen->second); +      AllSeenFirst = false; +    } +  } + +  if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader && +                        SpeculativelyMovedExts.size() == 1)) { +    TPT.commit(); +    if (HasPromoted) +      Promoted = true; +    for (auto I : SpeculativelyMovedExts) { +      Value *HeadOfChain = I->getOperand(0); +      SeenChainsForSExt[HeadOfChain] = nullptr; +      ValToSExtendedUses[HeadOfChain].push_back(I); +    } +    // Update Inst as promotion happen. +    Inst = SpeculativelyMovedExts.pop_back_val(); +  } else { +    // This is the first chain visited from the header, keep the current chain +    // as unhandled. Defer to promote this until we encounter another SExt +    // chain derived from the same header. 
+    for (auto I : SpeculativelyMovedExts) { +      Value *HeadOfChain = I->getOperand(0); +      SeenChainsForSExt[HeadOfChain] = Inst; +    } +    return false; +  } + +  if (!AllSeenFirst && !UnhandledExts.empty()) +    for (auto VisitedSExt : UnhandledExts) { +      if (RemovedInsts.count(VisitedSExt)) +        continue; +      TypePromotionTransaction TPT(RemovedInsts); +      SmallVector<Instruction *, 1> Exts; +      SmallVector<Instruction *, 2> Chains; +      Exts.push_back(VisitedSExt); +      bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains); +      TPT.commit(); +      if (HasPromoted) +        Promoted = true; +      for (auto I : Chains) { +        Value *HeadOfChain = I->getOperand(0); +        // Mark this as handled. +        SeenChainsForSExt[HeadOfChain] = nullptr; +        ValToSExtendedUses[HeadOfChain].push_back(I); +      } +    } +  return Promoted;  }  bool CodeGenPrepare::optimizeExtUses(Instruction *I) { @@ -4534,13 +5115,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {        !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))      return false; -  // Skip loads we've already transformed or have no reason to transform. -  if (Load->hasOneUse()) { -    User *LoadUser = *Load->user_begin(); -    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && -        !dyn_cast<PHINode>(LoadUser)) -      return false; -  } +  // Skip loads we've already transformed. +  if (Load->hasOneUse() && +      InsertedInsts.count(cast<Instruction>(*Load->user_begin()))) +    return false;    // Look at all uses of Load, looking through phis, to determine how many bits    // of the loaded value are needed. @@ -4590,16 +5168,14 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {        if (!ShlC)          return false;        uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); -      auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); -      DemandBits |= ShlDemandBits; +      DemandBits.setLowBits(BitWidth - ShiftAmt);        break;      }      case llvm::Instruction::Trunc: {        EVT TruncVT = TLI->getValueType(*DL, I->getType());        unsigned TruncBitWidth = TruncVT.getSizeInBits(); -      auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); -      DemandBits |= TruncBits; +      DemandBits.setLowBits(TruncBitWidth);        break;      } @@ -4620,7 +5196,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {    //    // Also avoid hoisting if we didn't see any ands with the exact DemandBits    // mask, since these are the only ands that will be removed by isel. -  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || +  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||        WidestAndBits != DemandBits)      return false; @@ -4636,6 +5212,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {    IRBuilder<> Builder(Load->getNextNode());    auto *NewAnd = dyn_cast<Instruction>(        Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); +  // Mark this instruction as "inserted by CGP", so that other +  // optimizations don't touch it. +  InsertedInsts.insert(NewAnd);    // Replace all uses of load with new and (except for the use of load in the    // new and itself). 
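optimizeLoadExt, whose tail closes the hunk above, narrows a wide load by asking what its users actually demand: a shl by C demands the low BitWidth - C bits, a trunc demands the low bits of its result width, and an and demands its constant mask (the and handling and the WidestAndBits tracking are summarized here from surrounding context that the hunk does not show). The rewrite fires only when the union of demanded bits is a contiguous low mask of more than one bit that exactly matches the widest and mask, and the newly inserted and is marked as CGP-inserted so it is not revisited. A scalar sketch of that bookkeeping on 64-bit values, not from the patch; the real code works on APInt and IR users.

#include <cstdint>

// One toy "user" of a 64-bit loaded value.
struct LoadUse {
  enum Kind { And, Shl, Trunc } K;
  uint64_t Operand; // And: constant mask; Shl: shift amount; Trunc: result width.
};

// Returns the low-bit mask to 'and' the load with, or 0 to skip the rewrite.
static uint64_t demandedLoadMask(const LoadUse *Uses, unsigned NumUses) {
  const unsigned BitWidth = 64;
  uint64_t DemandBits = 0, WidestAndBits = 0;
  for (unsigned I = 0; I != NumUses; ++I) {
    switch (Uses[I].K) {
    case LoadUse::And:
      DemandBits |= Uses[I].Operand;
      if (Uses[I].Operand > WidestAndBits)
        WidestAndBits = Uses[I].Operand;
      break;
    case LoadUse::Shl: // shl C demands the low BitWidth - C bits.
      DemandBits |= ~0ULL >> (Uses[I].Operand >= BitWidth ? BitWidth - 1
                                                          : Uses[I].Operand);
      break;
    case LoadUse::Trunc: // trunc to N bits demands the low N bits.
      DemandBits |= Uses[I].Operand >= BitWidth
                        ? ~0ULL
                        : (1ULL << Uses[I].Operand) - 1;
      break;
    }
  }
  bool IsLowMask = DemandBits != 0 && (DemandBits & (DemandBits + 1)) == 0;
  bool MoreThanOneBit = (DemandBits & (DemandBits - 1)) != 0;
  if (!IsLowMask || !MoreThanOneBit || DemandBits != WidestAndBits)
    return 0; // Not profitable, or the ands would not be removed by isel.
  return DemandBits;
}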
@@ -4985,7 +5564,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {    auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);    ExtInst->insertBefore(SI);    SI->setCondition(ExtInst); -  for (SwitchInst::CaseIt Case : SI->cases()) { +  for (auto Case : SI->cases()) {      APInt NarrowConst = Case.getCaseValue()->getValue();      APInt WideConst = (ExtType == Instruction::ZExt) ?                        NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth); @@ -4995,6 +5574,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {    return true;  } +  namespace {  /// \brief Helper class to promote a scalar operation to a vector one.  /// This class is used to move downward extractelement transition. @@ -5473,7 +6053,7 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,    return true;  } -bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) { +bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {    // Bail out if we inserted the instruction to prevent optimizations from    // stepping on each other's toes.    if (InsertedInsts.count(I)) @@ -5483,7 +6063,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {      // It is possible for very late stage optimizations (such as SimplifyCFG)      // to introduce PHI nodes too late to be cleaned up.  If we detect such a      // trivial PHI, go ahead and zap it here. -    if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) { +    if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {        P->replaceAllUsesWith(V);        P->eraseFromParent();        ++NumPHIsElim; @@ -5514,7 +6094,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {                TargetLowering::TypeExpandInteger) {          return SinkCast(CI);        } else { -        bool MadeChange = moveExtToFormExtLoad(I); +        bool MadeChange = optimizeExt(I);          return MadeChange | optimizeExtUses(I);        }      } @@ -5548,8 +6128,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {      return false;    } +  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { +      unsigned AS = RMW->getPointerAddressSpace(); +      return optimizeMemoryInst(I, RMW->getPointerOperand(), +                                RMW->getType(), AS); +  } + +  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) { +      unsigned AS = CmpX->getPointerAddressSpace(); +      return optimizeMemoryInst(I, CmpX->getPointerOperand(), +                                CmpX->getCompareOperand()->getType(), AS); +  } +    BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); +  if (BinOp && (BinOp->getOpcode() == Instruction::And) && +      EnableAndCmpSinking && TLI) +    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); +    if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||                  BinOp->getOpcode() == Instruction::LShr)) {      ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); @@ -5612,7 +6208,7 @@ static bool makeBitReverse(Instruction &I, const DataLayout &DL,  // In this pass we look for GEP and cast instructions that are used  // across basic blocks and rewrite them to improve basic-block-at-a-time  // selection. 
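optimizeSwitchInst, shown at the top of the hunk above, widens a switch whose condition type is narrower than the register width: it inserts one zext or sext of the condition and rewrites every case constant with the same extension, which preserves exactly the set of matching cases. A small standalone illustration of the rewrite shape, not from the patch; sign extension is chosen arbitrarily here, whereas the pass picks the extension kind to suit the target.

#include <cassert>
#include <cstdint>

// Original shape: switch on a narrow (16-bit) condition.
static int narrowSwitch(int16_t Cond) {
  switch (Cond) {
  case -1: return 10;
  case 7:  return 20;
  default: return 0;
  }
}

// Widened shape: condition and case constants extended the same way (sext).
static int widenedSwitch(int16_t Cond) {
  int32_t Wide = Cond; // sext i16 -> i32
  switch (Wide) {
  case -1: return 10;  // case values sign-extended to the wide type
  case 7:  return 20;
  default: return 0;
  }
}

int main() {
  for (int V = INT16_MIN; V <= INT16_MAX; ++V)
    assert(narrowSwitch(static_cast<int16_t>(V)) ==
           widenedSwitch(static_cast<int16_t>(V)));
  return 0;
}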
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) { +bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {    SunkAddrs.clear();    bool MadeChange = false; @@ -5679,68 +6275,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {    return MadeChange;  } -// If there is a sequence that branches based on comparing a single bit -// against zero that can be combined into a single instruction, and the -// target supports folding these into a single instruction, sink the -// mask and compare into the branch uses. Do this before OptimizeBlock -> -// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being -// searched for. -bool CodeGenPrepare::sinkAndCmp(Function &F) { -  if (!EnableAndCmpSinking) -    return false; -  if (!TLI || !TLI->isMaskAndBranchFoldingLegal()) -    return false; -  bool MadeChange = false; -  for (BasicBlock &BB : F) { -    // Does this BB end with the following? -    //   %andVal = and %val, #single-bit-set -    //   %icmpVal = icmp %andResult, 0 -    //   br i1 %cmpVal label %dest1, label %dest2" -    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator()); -    if (!Brcc || !Brcc->isConditional()) -      continue; -    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0)); -    if (!Cmp || Cmp->getParent() != &BB) -      continue; -    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1)); -    if (!Zero || !Zero->isZero()) -      continue; -    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0)); -    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB) -      continue; -    ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1)); -    if (!Mask || !Mask->getUniqueInteger().isPowerOf2()) -      continue; -    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump()); - -    // Push the "and; icmp" for any users that are conditional branches. -    // Since there can only be one branch use per BB, we don't need to keep -    // track of which BBs we insert into. -    for (Use &TheUse : Cmp->uses()) { -      // Find brcc use. -      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse); -      if (!BrccUser || !BrccUser->isConditional()) -        continue; -      BasicBlock *UserBB = BrccUser->getParent(); -      if (UserBB == &BB) continue; -      DEBUG(dbgs() << "found Brcc use\n"); - -      // Sink the "and; icmp" to use. -      MadeChange = true; -      BinaryOperator *NewAnd = -        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "", -                                  BrccUser); -      CmpInst *NewCmp = -        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero, -                        "", BrccUser); -      TheUse = NewCmp; -      ++NumAndCmpsMoved; -      DEBUG(BrccUser->getParent()->dump()); -    } -  } -  return MadeChange; -} -  /// \brief Scale down both weights to fit into uint32_t.  static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {    uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;  | 
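The section ends inside scaleWeights, which has to shrink a pair of 64-bit branch weights so that both fit in uint32_t while roughly preserving their ratio; only its first line (taking the larger of the two weights) appears in the hunk. The sketch below, not from the patch, completes the idea with one straightforward divisor choice, which is an assumption rather than necessarily LLVM's exact code.

#include <cstdint>
#include <limits>

// Scale both weights down by a common divisor so each fits in uint32_t.
static void scaleWeightsSketch(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  // Assumed completion: divide both weights by the same factor.
  uint64_t Scale = NewMax / std::numeric_limits<uint32_t>::max() + 1;
  NewTrue /= Scale;
  NewFalse /= Scale;
}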
