| author | Dimitry Andric <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
| commit | 7d523365ff1a3cc95bc058b33102500f61e8166d | (patch) |
| tree | b466a4817f79516eb1df8eae92bccf62ecc84003 | /contrib/llvm/lib/Transforms |
| parent | e3b65fde506060bec5cd110fcf03b440bd0eea1d | (diff) |
| parent | dd58ef019b700900793a1eb48b52123db01b654e | (diff) |
Diffstat (limited to 'contrib/llvm/lib/Transforms')
154 files changed, 19960 insertions, 12172 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 4762011d63d8..0e05129b5261 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -34,8 +34,11 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/StringExtras.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/CallSite.h" @@ -63,7 +66,8 @@ namespace {    ///    struct ArgPromotion : public CallGraphSCCPass {      void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AssumptionCacheTracker>(); +      AU.addRequired<TargetLibraryInfoWrapperPass>();        CallGraphSCCPass::getAnalysisUsage(AU);      } @@ -81,7 +85,8 @@ namespace {      bool isDenselyPacked(Type *type, const DataLayout &DL);      bool canPaddingBeAccessed(Argument *Arg);      CallGraphNode *PromoteArguments(CallGraphNode *CGN); -    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; +    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, +                                 AAResults &AAR) const;      CallGraphNode *DoPromotion(Function *F,                                SmallPtrSetImpl<Argument*> &ArgsToPromote,                                SmallPtrSetImpl<Argument*> &ByValArgsToTransform); @@ -90,15 +95,15 @@ namespace {      bool doInitialization(CallGraph &CG) override;      /// The maximum number of elements to expand, or 0 for unlimited.      unsigned maxElements; -    DenseMap<const Function *, DISubprogram *> FunctionDIs;    };  }  char ArgPromotion::ID = 0;  INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",                  "Promote 'by reference' arguments to scalars", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(ArgPromotion, "argpromotion",                  "Promote 'by reference' arguments to scalars", false, false) @@ -217,9 +222,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {    // First check: see if there are any pointer arguments!  If not, quick exit.    SmallVector<Argument*, 16> PointerArgs; -  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) -    if (I->getType()->isPointerTy()) -      PointerArgs.push_back(I); +  for (Argument &I : F->args()) +    if (I.getType()->isPointerTy()) +      PointerArgs.push_back(&I);    if (PointerArgs.empty()) return nullptr;    // Second check: make sure that all callers are direct callers.  We can't @@ -237,6 +242,14 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {    const DataLayout &DL = F->getParent()->getDataLayout(); +  // We need to manually construct BasicAA directly in order to disable its use +  // of other function analyses. +  BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F)); + +  // Construct our own AA results for this function. We do this manually to +  // work around the limitations of the legacy pass manager. 
+  AAResults AAR(createLegacyPMAAResults(*this, *F, BAR)); +    // Check to see which arguments are promotable.  If an argument is promotable,    // add it to ArgsToPromote.    SmallPtrSet<Argument*, 8> ArgsToPromote; @@ -281,8 +294,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {          // If all the elements are single-value types, we can promote it.          bool AllSimple = true; -        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { -          if (!STy->getElementType(i)->isSingleValueType()) { +        for (const auto *EltTy : STy->elements()) { +          if (!EltTy->isSingleValueType()) {              AllSimple = false;              break;            } @@ -303,8 +316,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {      if (isSelfRecursive) {        if (StructType *STy = dyn_cast<StructType>(AgTy)) {          bool RecursiveType = false; -        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { -          if (STy->getElementType(i) == PtrArg->getType()) { +        for (const auto *EltTy : STy->elements()) { +          if (EltTy == PtrArg->getType()) {              RecursiveType = true;              break;            } @@ -315,7 +328,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {      }      // Otherwise, see if we can promote the pointer to its value. -    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr())) +    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR))        ArgsToPromote.insert(PtrArg);    } @@ -416,7 +429,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,  /// elements of the aggregate in order to avoid exploding the number of  /// arguments passed in.  bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, -                                           bool isByValOrInAlloca) const { +                                           bool isByValOrInAlloca, +                                           AAResults &AAR) const {    typedef std::set<IndicesVector> GEPIndicesSet;    // Quick exit for unused arguments @@ -453,12 +467,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,    // First, iterate the entry block and mark loads of (geps of) arguments as    // safe. -  BasicBlock *EntryBlock = Arg->getParent()->begin(); +  BasicBlock &EntryBlock = Arg->getParent()->front();    // Declare this here so we can reuse it    IndicesVector Indices; -  for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end(); -       I != E; ++I) -    if (LoadInst *LI = dyn_cast<LoadInst>(I)) { +  for (Instruction &I : EntryBlock) +    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {        Value *V = LI->getPointerOperand();        if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {          V = GEP->getPointerOperand(); @@ -501,12 +514,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,        if (GEP->use_empty()) {          // Dead GEP's cause trouble later.  Just remove them if we run into          // them. -        getAnalysis<AliasAnalysis>().deleteValue(GEP);          GEP->eraseFromParent();          // TODO: This runs the above loop over and over again for dead GEPs          // Couldn't we just do increment the UI iterator earlier and erase the          // use? -        return isSafeToPromoteArgument(Arg, isByValOrInAlloca); +        return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR);        }        // Ensure that all of the indices are constants. 
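The ArgumentPromotion hunks above drop the `AU.addRequired<AliasAnalysis>()` analysis-group dependency and instead build alias-analysis results per function. A minimal sketch of that legacy-pass-manager pattern, under the LLVM 3.8-era API used in this import, is shown below; `PromoteLikePass` is a hypothetical name and the pass-registration boilerplate is omitted, but the `getAnalysisUsage` requirements and the `createLegacyPMBasicAAResult` / `createLegacyPMAAResults` calls mirror the diff.

```cpp
// Sketch: a CGSCC pass building per-function AAResults by hand, as the
// ArgumentPromotion change above does (hypothetical pass, boilerplate omitted).
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;

namespace {
struct PromoteLikePass : public CallGraphSCCPass {
  static char ID;
  PromoteLikePass() : CallGraphSCCPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // createLegacyPMBasicAAResult() reads these analyses via getAnalysis<>().
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool runOnSCC(CallGraphSCC &SCC) override {
    for (CallGraphNode *CGN : SCC) {
      Function *F = CGN->getFunction();
      if (!F || F->isDeclaration())
        continue;
      // Construct BasicAA directly so it does not consult other function
      // analyses, then wrap it in an AAResults aggregate for queries.
      BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F));
      AAResults AAR(createLegacyPMAAResults(*this, *F, BAR));
      (void)AAR; // e.g. AAR.canInstructionRangeModRef(..., MRI_Mod)
    }
    return false;
  }
};
char PromoteLikePass::ID = 0;
} // namespace
```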
@@ -563,8 +575,6 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,    // blocks we know to be transparent to the load.    SmallPtrSet<BasicBlock*, 16> TranspBlocks; -  AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); -    for (unsigned i = 0, e = Loads.size(); i != e; ++i) {      // Check to see if the load is invalidated from the start of the block to      // the load itself. @@ -572,8 +582,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,      BasicBlock *BB = Load->getParent();      MemoryLocation Loc = MemoryLocation::get(Load); -    if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc, -        AliasAnalysis::Mod)) +    if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))        return false;  // Pointer is invalidated!      // Now check every path from the entry block to the load for transparency. @@ -581,7 +590,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,      // loading block.      for (BasicBlock *P : predecessors(BB)) {        for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks)) -        if (AA.canBasicBlockModify(*TranspBB, Loc)) +        if (AAR.canBasicBlockModify(*TranspBB, Loc))            return false;      }    } @@ -637,13 +646,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,    unsigned ArgIndex = 1;    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;         ++I, ++ArgIndex) { -    if (ByValArgsToTransform.count(I)) { +    if (ByValArgsToTransform.count(&*I)) {        // Simple byval argument? Just add all the struct element types.        Type *AgTy = cast<PointerType>(I->getType())->getElementType();        StructType *STy = cast<StructType>(AgTy);        Params.insert(Params.end(), STy->element_begin(), STy->element_end());        ++NumByValArgsPromoted; -    } else if (!ArgsToPromote.count(I)) { +    } else if (!ArgsToPromote.count(&*I)) {        // Unchanged argument        Params.push_back(I->getType());        AttributeSet attrs = PAL.getParamAttributes(ArgIndex); @@ -661,7 +670,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,        // In this table, we will track which indices are loaded from the argument        // (where direct loads are tracked as no indices). -      ScalarizeTable &ArgIndices = ScalarizedElements[I]; +      ScalarizeTable &ArgIndices = ScalarizedElements[&*I];        for (User *U : I->users()) {          Instruction *UI = cast<Instruction>(U);          Type *SrcTy; @@ -687,7 +696,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,          else            // Take any load, we will use it only to update Alias Analysis            OrigLoad = cast<LoadInst>(UI->user_back()); -        OriginalLoads[std::make_pair(I, Indices)] = OrigLoad; +        OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;        }        // Add a parameter to the function for each element passed in. @@ -722,15 +731,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,    NF->copyAttributesFrom(F);    // Patch the pointer to LLVM function in debug info descriptor. -  auto DI = FunctionDIs.find(F); -  if (DI != FunctionDIs.end()) { -    DISubprogram *SP = DI->second; -    SP->replaceFunction(NF); -    // Ensure the map is updated so it can be reused on subsequent argument -    // promotions of the same function. 
-    FunctionDIs.erase(DI); -    FunctionDIs[NF] = SP; -  } +  NF->setSubprogram(F->getSubprogram()); +  F->setSubprogram(nullptr);    DEBUG(dbgs() << "ARG PROMOTION:  Promoting to:" << *NF << "\n"          << "From: " << *F); @@ -740,13 +742,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,    NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));    AttributesVec.clear(); -  F->getParent()->getFunctionList().insert(F, NF); +  F->getParent()->getFunctionList().insert(F->getIterator(), NF);    NF->takeName(F); -  // Get the alias analysis information that we need to update to reflect our -  // changes. -  AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); -    // Get the callgraph information that we need to update to reflect our    // changes.    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); @@ -775,7 +773,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,      ArgIndex = 1;      for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();           I != E; ++I, ++AI, ++ArgIndex) -      if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { +      if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {          Args.push_back(*AI);          // Unmodified argument          if (CallPAL.hasAttributes(ArgIndex)) { @@ -783,7 +781,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,            AttributesVec.              push_back(AttributeSet::get(F->getContext(), Args.size(), B));          } -      } else if (ByValArgsToTransform.count(I)) { +      } else if (ByValArgsToTransform.count(&*I)) {          // Emit a GEP and load for each element of the struct.          Type *AgTy = cast<PointerType>(I->getType())->getElementType();          StructType *STy = cast<StructType>(AgTy); @@ -798,14 +796,14 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,          }        } else if (!I->use_empty()) {          // Non-dead argument: insert GEPs and loads as appropriate. -        ScalarizeTable &ArgIndices = ScalarizedElements[I]; +        ScalarizeTable &ArgIndices = ScalarizedElements[&*I];          // Store the Value* version of the indices in here, but declare it now          // for reuse.          std::vector<Value*> Ops;          for (ScalarizeTable::iterator SI = ArgIndices.begin(),                 E = ArgIndices.end(); SI != E; ++SI) {            Value *V = *AI; -          LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)]; +          LoadInst *OrigLoad = OriginalLoads[std::make_pair(&*I, SI->second)];            if (!SI->second.empty()) {              Ops.reserve(SI->second.size());              Type *ElTy = V->getType(); @@ -873,10 +871,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,      Args.clear();      AttributesVec.clear(); -    // Update the alias analysis implementation to know that we are replacing -    // the old call with a new one. -    AA.replaceWithNewValue(Call, New); -      // Update the callgraph to know that the callsite has been transformed.      
CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];      CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); @@ -901,20 +895,19 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,    //    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),         I2 = NF->arg_begin(); I != E; ++I) { -    if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { +    if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {        // If this is an unmodified argument, move the name and users over to the        // new version. -      I->replaceAllUsesWith(I2); -      I2->takeName(I); -      AA.replaceWithNewValue(I, I2); +      I->replaceAllUsesWith(&*I2); +      I2->takeName(&*I);        ++I2;        continue;      } -    if (ByValArgsToTransform.count(I)) { +    if (ByValArgsToTransform.count(&*I)) {        // In the callee, we create an alloca, and store each of the new incoming        // arguments into the alloca. -      Instruction *InsertPt = NF->begin()->begin(); +      Instruction *InsertPt = &NF->begin()->front();        // Just add all the struct element types.        Type *AgTy = cast<PointerType>(I->getType())->getElementType(); @@ -929,13 +922,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,              AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),              InsertPt);          I2->setName(I->getName()+"."+Twine(i)); -        new StoreInst(I2++, Idx, InsertPt); +        new StoreInst(&*I2++, Idx, InsertPt);        }        // Anything that used the arg should now use the alloca.        I->replaceAllUsesWith(TheAlloca); -      TheAlloca->takeName(I); -      AA.replaceWithNewValue(I, TheAlloca); +      TheAlloca->takeName(&*I);        // If the alloca is used in a call, we must clear the tail flag since        // the callee now uses an alloca from the caller. @@ -948,23 +940,20 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,        continue;      } -    if (I->use_empty()) { -      AA.deleteValue(I); +    if (I->use_empty())        continue; -    }      // Otherwise, if we promoted this argument, then all users are load      // instructions (or GEPs with only load users), and all loads should be      // using the new argument that we added. -    ScalarizeTable &ArgIndices = ScalarizedElements[I]; +    ScalarizeTable &ArgIndices = ScalarizedElements[&*I];      while (!I->use_empty()) {        if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {          assert(ArgIndices.begin()->second.empty() &&                 "Load element should sort to front!");          I2->setName(I->getName()+".val"); -        LI->replaceAllUsesWith(I2); -        AA.replaceWithNewValue(LI, I2); +        LI->replaceAllUsesWith(&*I2);          LI->eraseFromParent();          DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()                << "' in function '" << F->getName() << "'\n"); @@ -1000,11 +989,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,          // the argument specified by ArgNo.          
while (!GEP->use_empty()) {            LoadInst *L = cast<LoadInst>(GEP->user_back()); -          L->replaceAllUsesWith(TheArg); -          AA.replaceWithNewValue(L, TheArg); +          L->replaceAllUsesWith(&*TheArg);            L->eraseFromParent();          } -        AA.deleteValue(GEP);          GEP->eraseFromParent();        }      } @@ -1013,10 +1000,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,      std::advance(I2, ArgIndices.size());    } -  // Tell the alias analysis that the old function is about to disappear. -  AA.replaceWithNewValue(F, NF); - -      NF_CGN->stealCalledFunctionsFrom(CG[F]);    // Now that the old function is dead, delete it.  If there is a dangling @@ -1032,6 +1015,5 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,  }  bool ArgPromotion::doInitialization(CallGraph &CG) { -  FunctionDIs = makeSubprogramMap(CG.getModule());    return CallGraphSCCPass::doInitialization(CG);  } diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 8ce7646621ff..0aa49d6fde01 100644 --- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -119,7 +119,7 @@ bool ConstantMerge::runOnModule(Module &M) {      // First: Find the canonical constants others will be merged with.      for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();           GVI != E; ) { -      GlobalVariable *GV = GVI++; +      GlobalVariable *GV = &*GVI++;        // If this GV is dead, remove it.        GV->removeDeadConstantUsers(); @@ -160,7 +160,7 @@ bool ConstantMerge::runOnModule(Module &M) {      // invalidating the Constant* pointers in CMap.      for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();           GVI != E; ) { -      GlobalVariable *GV = GVI++; +      GlobalVariable *GV = &*GVI++;        // Only process constants with initializers in the default address space.        if (!GV->isConstant() || !GV->hasDefinitiveInitializer() || diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp new file mode 100644 index 000000000000..5bbb7513005c --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -0,0 +1,166 @@ +//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass exports all llvm.bitset's found in the module in the form of a +// __cfi_check function, which can be used to verify cross-DSO call targets. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cross-dso-cfi" + +STATISTIC(TypeIds, "Number of unique type identifiers"); + +namespace { + +struct CrossDSOCFI : public ModulePass { +  static char ID; +  CrossDSOCFI() : ModulePass(ID) { +    initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry()); +  } + +  Module *M; +  MDNode *VeryLikelyWeights; + +  ConstantInt *extractBitSetTypeId(MDNode *MD); +  void buildCFICheck(); + +  bool doInitialization(Module &M) override; +  bool runOnModule(Module &M) override; +}; + +} // anonymous namespace + +INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, +                      false) +INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false) +char CrossDSOCFI::ID = 0; + +ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; } + +bool CrossDSOCFI::doInitialization(Module &Mod) { +  M = &Mod; +  VeryLikelyWeights = +      MDBuilder(M->getContext()).createBranchWeights((1U << 20) - 1, 1); + +  return false; +} + +/// extractBitSetTypeId - Extracts TypeId from a hash-based bitset MDNode. +ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) { +  // This check excludes vtables for classes inside anonymous namespaces. +  auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(0)); +  if (!TM) +    return nullptr; +  auto C = dyn_cast_or_null<ConstantInt>(TM->getValue()); +  if (!C) return nullptr; +  // We are looking for i64 constants. +  if (C->getBitWidth() != 64) return nullptr; + +  // Sanity check. +  auto FM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(1)); +  // Can be null if a function was removed by an optimization. +  if (FM) { +    auto F = dyn_cast<Function>(FM->getValue()); +    // But can never be a function declaration. +    assert(!F || !F->isDeclaration()); +    (void)F; // Suppress unused variable warning in the no-asserts build. +  } +  return C; +} + +/// buildCFICheck - emits __cfi_check for the current module. +void CrossDSOCFI::buildCFICheck() { +  // FIXME: verify that __cfi_check ends up near the end of the code section, +  // but before the jump slots created in LowerBitSets. 
+  llvm::DenseSet<uint64_t> BitSetIds; +  NamedMDNode *BitSetNM = M->getNamedMetadata("llvm.bitsets"); + +  if (BitSetNM) +    for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) +      if (ConstantInt *TypeId = extractBitSetTypeId(BitSetNM->getOperand(I))) +        BitSetIds.insert(TypeId->getZExtValue()); + +  LLVMContext &Ctx = M->getContext(); +  Constant *C = M->getOrInsertFunction( +      "__cfi_check", +      FunctionType::get( +          Type::getVoidTy(Ctx), +          {Type::getInt64Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))}, +          false)); +  Function *F = dyn_cast<Function>(C); +  F->setAlignment(4096); +  auto args = F->arg_begin(); +  Argument &CallSiteTypeId = *(args++); +  CallSiteTypeId.setName("CallSiteTypeId"); +  Argument &Addr = *(args++); +  Addr.setName("Addr"); +  assert(args == F->arg_end()); + +  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); + +  BasicBlock *TrapBB = BasicBlock::Create(Ctx, "trap", F); +  IRBuilder<> IRBTrap(TrapBB); +  Function *TrapFn = Intrinsic::getDeclaration(M, Intrinsic::trap); +  llvm::CallInst *TrapCall = IRBTrap.CreateCall(TrapFn); +  TrapCall->setDoesNotReturn(); +  TrapCall->setDoesNotThrow(); +  IRBTrap.CreateUnreachable(); + +  BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F); +  IRBuilder<> IRBExit(ExitBB); +  IRBExit.CreateRetVoid(); + +  IRBuilder<> IRB(BB); +  SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, BitSetIds.size()); +  for (uint64_t TypeId : BitSetIds) { +    ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); +    BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); +    IRBuilder<> IRBTest(TestBB); +    Function *BitsetTestFn = +        Intrinsic::getDeclaration(M, Intrinsic::bitset_test); + +    Value *Test = IRBTest.CreateCall( +        BitsetTestFn, {&Addr, MetadataAsValue::get( +                                  Ctx, ConstantAsMetadata::get(CaseTypeId))}); +    BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB); +    BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights); + +    SI->addCase(CaseTypeId, TestBB); +    ++TypeIds; +  } +} + +bool CrossDSOCFI::runOnModule(Module &M) { +  if (M.getModuleFlag("Cross-DSO CFI") == nullptr) +    return false; +  buildCFICheck(); +  return true; +} diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d0447640259e..4de3d95ab11d 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -35,6 +35,7 @@  #include "llvm/Pass.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include <map>  #include <set>  #include <tuple> @@ -121,14 +122,6 @@ namespace {      typedef SmallVector<RetOrArg, 5> UseVector; -    // Map each LLVM function to corresponding metadata with debug info. If -    // the function is replaced with another one, we should patch the pointer -    // to LLVM function in metadata. -    // As the code generation for module is finished (and DIBuilder is -    // finalized) we assume that subprogram descriptors won't be changed, and -    // they are stored in map for short duration anyway. -    DenseMap<const Function *, DISubprogram *> FunctionDIs; -    protected:      // DAH uses this to specify a different ID.      
explicit DAE(char &ID) : ModulePass(ID) {} @@ -198,6 +191,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {    if (Fn.hasAddressTaken())      return false; +  // Don't touch naked functions. The assembly might be using an argument, or +  // otherwise rely on the frame layout in a way that this analysis will not +  // see. +  if (Fn.hasFnAttribute(Attribute::Naked)) { +    return false; +  } +    // Okay, we know we can transform this function if safe.  Scan its body    // looking for calls marked musttail or calls to llvm.vastart.    for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { @@ -229,7 +229,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {    // Create the new function body and insert it into the module...    Function *NF = Function::Create(NFTy, Fn.getLinkage());    NF->copyAttributesFrom(&Fn); -  Fn.getParent()->getFunctionList().insert(&Fn, NF); +  Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);    NF->takeName(&Fn);    // Loop over all of the callers of the function, transforming the call sites @@ -296,20 +296,12 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {    for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),         I2 = NF->arg_begin(); I != E; ++I, ++I2) {      // Move the name and users over to the new version. -    I->replaceAllUsesWith(I2); -    I2->takeName(I); +    I->replaceAllUsesWith(&*I2); +    I2->takeName(&*I);    }    // Patch the pointer to LLVM function in debug info descriptor. -  auto DI = FunctionDIs.find(&Fn); -  if (DI != FunctionDIs.end()) { -    DISubprogram *SP = DI->second; -    SP->replaceFunction(NF); -    // Ensure the map is updated so it can be reused on non-varargs argument -    // eliminations of the same function. -    FunctionDIs.erase(DI); -    FunctionDIs[NF] = SP; -  } +  NF->setSubprogram(Fn.getSubprogram());    // Fix up any BlockAddresses that refer to the function.    Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType())); @@ -345,16 +337,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)    if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())      return false; +  // Don't touch naked functions. The assembly might be using an argument, or +  // otherwise rely on the frame layout in a way that this analysis will not +  // see. +  if (Fn.hasFnAttribute(Attribute::Naked)) +    return false; +    if (Fn.use_empty())      return false;    SmallVector<unsigned, 8> UnusedArgs; -  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();  -       I != E; ++I) { -    Argument *Arg = I; - -    if (Arg->use_empty() && !Arg->hasByValOrInAllocaAttr()) -      UnusedArgs.push_back(Arg->getArgNo()); +  for (Argument &Arg : Fn.args()) { +    if (Arg.use_empty() && !Arg.hasByValOrInAllocaAttr()) +      UnusedArgs.push_back(Arg.getArgNo());    }    if (UnusedArgs.empty()) @@ -485,6 +480,10 @@ DAE::Liveness DAE::SurveyUse(const Use *U,        if (F) {          // Used in a direct call. +        // The function argument is live if it is used as a bundle operand. +        if (CS.isBundleOperand(U)) +          return Live; +          // Find the argument number. We know for sure that this use is an          // argument, since if it was the function argument this would be an          // indirect call and the we know can't be looking at a value of the @@ -543,6 +542,14 @@ void DAE::SurveyFunction(const Function &F) {      return;    } +  // Don't touch naked functions. 
The assembly might be using an argument, or +  // otherwise rely on the frame layout in a way that this analysis will not +  // see. +  if (F.hasFnAttribute(Attribute::Naked)) { +    MarkLive(F); +    return; +  } +    unsigned RetCount = NumRetVals(&F);    // Assume all return values are dead    typedef SmallVector<Liveness, 5> RetVals; @@ -648,7 +655,7 @@ void DAE::SurveyFunction(const Function &F) {      } else {        // See what the effect of this use is (recording any uses that cause        // MaybeLive in MaybeLiveArgUses).  -      Result = SurveyUses(AI, MaybeLiveArgUses); +      Result = SurveyUses(&*AI, MaybeLiveArgUses);      }      // Mark the result. @@ -878,7 +885,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {    NF->setAttributes(NewPAL);    // Insert the new function before the old function, so we won't be processing    // it again. -  F->getParent()->getFunctionList().insert(F, NF); +  F->getParent()->getFunctionList().insert(F->getIterator(), NF);    NF->takeName(F);    // Loop over all of the callers of the function, transforming the call sites @@ -946,7 +953,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {      Instruction *New;      if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {        New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), -                               Args, "", Call); +                               Args, "", Call->getParent());        cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());        cast<InvokeInst>(New)->setAttributes(NewCallPAL);      } else { @@ -976,9 +983,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {                 " must have been a struct or an array!");          Instruction *InsertPt = Call;          if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { -          BasicBlock::iterator IP = II->getNormalDest()->begin(); -          while (isa<PHINode>(IP)) ++IP; -          InsertPt = IP; +          BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest()); +          InsertPt = &*NewEdge->getFirstInsertionPt();          }          // We used to return a struct or array. Instead of doing smart stuff @@ -1026,8 +1032,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {      if (ArgAlive[i]) {        // If this is a live argument, move the name and users over to the new        // version. -      I->replaceAllUsesWith(I2); -      I2->takeName(I); +      I->replaceAllUsesWith(&*I2); +      I2->takeName(&*I);        ++I2;      } else {        // If this argument is dead, replace any uses of it with null constants @@ -1079,9 +1085,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {        }    // Patch the pointer to LLVM function in debug info descriptor. -  auto DI = FunctionDIs.find(F); -  if (DI != FunctionDIs.end()) -    DI->second->replaceFunction(NF); +  NF->setSubprogram(F->getSubprogram());    // Now that the old function is dead, delete it.    F->eraseFromParent(); @@ -1092,9 +1096,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {  bool DAE::runOnModule(Module &M) {    bool Changed = false; -  // Collect debug info descriptors for functions. -  FunctionDIs = makeSubprogramMap(M); -    // First pass: Do a simple check to see if any functions can have their "..."    // removed.  We can do this if they never call va_start.  
This loop cannot be    // fused with the next loop, because deleting a function invalidates @@ -1119,7 +1120,7 @@ bool DAE::runOnModule(Module &M) {    for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {      // Increment now, because the function will probably get removed (ie.      // replaced by a new one). -    Function *F = I++; +    Function *F = &*I++;      Changed |= RemoveDeadStuffFromFunction(F);    } diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index 67ba72d6a360..af313a6b001d 100644 --- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -1,4 +1,5 @@ -//===-- ElimAvailExtern.cpp - DCE unreachable internal functions ----------------===// +//===-- ElimAvailExtern.cpp - DCE unreachable internal functions +//----------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -15,9 +16,7 @@  #include "llvm/Transforms/IPO.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h"  #include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/CtorUtils.h"  #include "llvm/Transforms/Utils/GlobalStatus.h"  #include "llvm/Pass.h"  using namespace llvm; @@ -28,18 +27,18 @@ STATISTIC(NumFunctions, "Number of functions removed");  STATISTIC(NumVariables, "Number of global variables removed");  namespace { -  struct EliminateAvailableExternally : public ModulePass { -    static char ID; // Pass identification, replacement for typeid -    EliminateAvailableExternally() : ModulePass(ID) { -      initializeEliminateAvailableExternallyPass( -          *PassRegistry::getPassRegistry()); -    } +struct EliminateAvailableExternally : public ModulePass { +  static char ID; // Pass identification, replacement for typeid +  EliminateAvailableExternally() : ModulePass(ID) { +    initializeEliminateAvailableExternallyPass( +        *PassRegistry::getPassRegistry()); +  } -    // run - Do the EliminateAvailableExternally pass on the specified module, -    // optionally updating the specified callgraph to reflect the changes. -    // -    bool runOnModule(Module &M) override; -  }; +  // run - Do the EliminateAvailableExternally pass on the specified module, +  // optionally updating the specified callgraph to reflect the changes. +  // +  bool runOnModule(Module &M) override; +};  }  char EliminateAvailableExternally::ID = 0; @@ -54,30 +53,31 @@ bool EliminateAvailableExternally::runOnModule(Module &M) {    bool Changed = false;    // Drop initializers of available externally global variables. -  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); -       I != E; ++I) { -    if (!I->hasAvailableExternallyLinkage()) +  for (GlobalVariable &GV : M.globals()) { +    if (!GV.hasAvailableExternallyLinkage())        continue; -    if (I->hasInitializer()) { -      Constant *Init = I->getInitializer(); -      I->setInitializer(nullptr); +    if (GV.hasInitializer()) { +      Constant *Init = GV.getInitializer(); +      GV.setInitializer(nullptr);        if (isSafeToDestroyConstant(Init))          Init->destroyConstant();      } -    I->removeDeadConstantUsers(); -    I->setLinkage(GlobalValue::ExternalLinkage); +    GV.removeDeadConstantUsers(); +    GV.setLinkage(GlobalValue::ExternalLinkage);      NumVariables++; +    Changed = true;    }    // Drop the bodies of available externally functions. 
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    if (!I->hasAvailableExternallyLinkage()) +  for (Function &F : M) { +    if (!F.hasAvailableExternallyLinkage())        continue; -    if (!I->isDeclaration()) +    if (!F.isDeclaration())        // This will set the linkage to external -      I->deleteBody(); -    I->removeDeadConstantUsers(); +      F.deleteBody(); +    F.removeDeadConstantUsers();      NumFunctions++; +    Changed = true;    }    return Changed; diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp index b9462f2ffc72..1a3b9253d72f 100644 --- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -83,7 +83,7 @@ namespace {        for (Module::global_iterator I = M.global_begin(), E = M.global_end();             I != E; ++I) {          bool Delete = -          deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); +            deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration();          if (!Delete) {            if (I->hasAvailableExternallyLinkage())              continue; @@ -103,7 +103,7 @@ namespace {        // Visit the Functions.        for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {          bool Delete = -          deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); +            deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration();          if (!Delete) {            if (I->hasAvailableExternallyLinkage())              continue; @@ -124,7 +124,7 @@ namespace {          Module::alias_iterator CurI = I;          ++I; -        bool Delete = deleteStuff == (bool)Named.count(CurI); +        bool Delete = deleteStuff == (bool)Named.count(&*CurI);          makeVisible(*CurI, Delete);          if (Delete) { @@ -143,7 +143,7 @@ namespace {            }            CurI->replaceAllUsesWith(Declaration); -          delete CurI; +          delete &*CurI;          }        } diff --git a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp new file mode 100644 index 000000000000..816291dac9e8 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -0,0 +1,121 @@ +//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "forceattrs" + +static cl::list<std::string> +    ForceAttributes("force-attribute", cl::Hidden, +                    cl::desc("Add an attribute to a function. This should be a " +                             "pair of 'function-name:attribute-name', for " +                             "example -force-add-attribute=foo:noinline. 
This " +                             "option can be specified multiple times.")); + +static Attribute::AttrKind parseAttrKind(StringRef Kind) { +  return StringSwitch<Attribute::AttrKind>(Kind) +      .Case("alwaysinline", Attribute::AlwaysInline) +      .Case("builtin", Attribute::Builtin) +      .Case("cold", Attribute::Cold) +      .Case("convergent", Attribute::Convergent) +      .Case("inlinehint", Attribute::InlineHint) +      .Case("jumptable", Attribute::JumpTable) +      .Case("minsize", Attribute::MinSize) +      .Case("naked", Attribute::Naked) +      .Case("nobuiltin", Attribute::NoBuiltin) +      .Case("noduplicate", Attribute::NoDuplicate) +      .Case("noimplicitfloat", Attribute::NoImplicitFloat) +      .Case("noinline", Attribute::NoInline) +      .Case("nonlazybind", Attribute::NonLazyBind) +      .Case("noredzone", Attribute::NoRedZone) +      .Case("noreturn", Attribute::NoReturn) +      .Case("norecurse", Attribute::NoRecurse) +      .Case("nounwind", Attribute::NoUnwind) +      .Case("optnone", Attribute::OptimizeNone) +      .Case("optsize", Attribute::OptimizeForSize) +      .Case("readnone", Attribute::ReadNone) +      .Case("readonly", Attribute::ReadOnly) +      .Case("argmemonly", Attribute::ArgMemOnly) +      .Case("returns_twice", Attribute::ReturnsTwice) +      .Case("safestack", Attribute::SafeStack) +      .Case("sanitize_address", Attribute::SanitizeAddress) +      .Case("sanitize_memory", Attribute::SanitizeMemory) +      .Case("sanitize_thread", Attribute::SanitizeThread) +      .Case("ssp", Attribute::StackProtect) +      .Case("sspreq", Attribute::StackProtectReq) +      .Case("sspstrong", Attribute::StackProtectStrong) +      .Case("uwtable", Attribute::UWTable) +      .Default(Attribute::None); +} + +/// If F has any forced attributes given on the command line, add them. +static void addForcedAttributes(Function &F) { +  for (auto &S : ForceAttributes) { +    auto KV = StringRef(S).split(':'); +    if (KV.first != F.getName()) +      continue; + +    auto Kind = parseAttrKind(KV.second); +    if (Kind == Attribute::None) { +      DEBUG(dbgs() << "ForcedAttribute: " << KV.second +                   << " unknown or not handled!\n"); +      continue; +    } +    if (F.hasFnAttribute(Kind)) +      continue; +    F.addFnAttr(Kind); +  } +} + +PreservedAnalyses ForceFunctionAttrsPass::run(Module &M) { +  if (ForceAttributes.empty()) +    return PreservedAnalyses::all(); + +  for (Function &F : M.functions()) +    addForcedAttributes(F); + +  // Just conservatively invalidate analyses, this isn't likely to be important. +  return PreservedAnalyses::none(); +} + +namespace { +struct ForceFunctionAttrsLegacyPass : public ModulePass { +  static char ID; // Pass identification, replacement for typeid +  ForceFunctionAttrsLegacyPass() : ModulePass(ID) { +    initializeForceFunctionAttrsLegacyPassPass( +        *PassRegistry::getPassRegistry()); +  } + +  bool runOnModule(Module &M) override { +    if (ForceAttributes.empty()) +      return false; + +    for (Function &F : M.functions()) +      addForcedAttributes(F); + +    // Conservatively assume we changed something. 
+    return true; +  } +}; +} + +char ForceFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs", +                "Force set function attributes", false, false) + +Pass *llvm::createForceFunctionAttrsLegacyPass() { +  return new ForceFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index bb5e64aef338..6dcfb3f83004 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -23,14 +23,21 @@  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/CallGraphSCCPass.h"  #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/InstIterator.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  using namespace llvm; @@ -42,230 +49,191 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture");  STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");  STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");  STATISTIC(NumNoAlias, "Number of function returns marked noalias"); -STATISTIC(NumAnnotated, "Number of attributes added to library functions"); +STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); +STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");  namespace { -  struct FunctionAttrs : public CallGraphSCCPass { -    static char ID; // Pass identification, replacement for typeid -    FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) { -      initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); -    } - -    // runOnSCC - Analyze the SCC, performing the transformation if possible. -    bool runOnSCC(CallGraphSCC &SCC) override; - -    // AddReadAttrs - Deduce readonly/readnone attributes for the SCC. -    bool AddReadAttrs(const CallGraphSCC &SCC); - -    // AddArgumentAttrs - Deduce nocapture attributes for the SCC. -    bool AddArgumentAttrs(const CallGraphSCC &SCC); - -    // IsFunctionMallocLike - Does this function allocate new memory? -    bool IsFunctionMallocLike(Function *F, -                              SmallPtrSet<Function*, 8> &) const; - -    // AddNoAliasAttrs - Deduce noalias attributes for the SCC. -    bool AddNoAliasAttrs(const CallGraphSCC &SCC); - -    // Utility methods used by inferPrototypeAttributes to add attributes -    // and maintain annotation statistics. 
- -    void setDoesNotAccessMemory(Function &F) { -      if (!F.doesNotAccessMemory()) { -        F.setDoesNotAccessMemory(); -        ++NumAnnotated; -      } -    } - -    void setOnlyReadsMemory(Function &F) { -      if (!F.onlyReadsMemory()) { -        F.setOnlyReadsMemory(); -        ++NumAnnotated; -      } -    } - -    void setDoesNotThrow(Function &F) { -      if (!F.doesNotThrow()) { -        F.setDoesNotThrow(); -        ++NumAnnotated; -      } -    } - -    void setDoesNotCapture(Function &F, unsigned n) { -      if (!F.doesNotCapture(n)) { -        F.setDoesNotCapture(n); -        ++NumAnnotated; -      } -    } - -    void setOnlyReadsMemory(Function &F, unsigned n) { -      if (!F.onlyReadsMemory(n)) { -        F.setOnlyReadsMemory(n); -        ++NumAnnotated; -      } -    } - -    void setDoesNotAlias(Function &F, unsigned n) { -      if (!F.doesNotAlias(n)) { -        F.setDoesNotAlias(n); -        ++NumAnnotated; -      } -    } - -    // inferPrototypeAttributes - Analyze the name and prototype of the -    // given function and set any applicable attributes.  Returns true -    // if any attributes were set and false otherwise. -    bool inferPrototypeAttributes(Function &F); +typedef SmallSetVector<Function *, 8> SCCNodeSet; +} -    // annotateLibraryCalls - Adds attributes to well-known standard library -    // call declarations. -    bool annotateLibraryCalls(const CallGraphSCC &SCC); +namespace { +struct FunctionAttrs : public CallGraphSCCPass { +  static char ID; // Pass identification, replacement for typeid +  FunctionAttrs() : CallGraphSCCPass(ID) { +    initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); +  } -    void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.setPreservesCFG(); -      AU.addRequired<AliasAnalysis>(); -      AU.addRequired<TargetLibraryInfoWrapperPass>(); -      CallGraphSCCPass::getAnalysisUsage(AU); -    } +  bool runOnSCC(CallGraphSCC &SCC) override; +  bool doInitialization(CallGraph &CG) override { +    Revisit.clear(); +    return false; +  } +  bool doFinalization(CallGraph &CG) override; +   +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.setPreservesCFG(); +    AU.addRequired<AssumptionCacheTracker>(); +    AU.addRequired<TargetLibraryInfoWrapperPass>(); +    CallGraphSCCPass::getAnalysisUsage(AU); +  } -  private: -    AliasAnalysis *AA; -    TargetLibraryInfo *TLI; -  }; +private: +  TargetLibraryInfo *TLI; +  SmallVector<WeakVH,16> Revisit; +};  }  char FunctionAttrs::ID = 0;  INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", -                "Deduce function attributes", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +                      "Deduce function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", -                "Deduce function attributes", false, false) +                    "Deduce function attributes", false, false)  Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); } +namespace { +/// The three kinds of memory access relevant to 'readonly' and +/// 'readnone' attributes. +enum MemoryAccessKind { +  MAK_ReadNone = 0, +  MAK_ReadOnly = 1, +  MAK_MayWrite = 2 +}; +} -/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC. 
-bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { -  SmallPtrSet<Function*, 8> SCCNodes; - -  // Fill SCCNodes with the elements of the SCC.  Used for quickly -  // looking up whether a given CallGraphNode is in this SCC. -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) -    SCCNodes.insert((*I)->getFunction()); +static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, +                                                  const SCCNodeSet &SCCNodes) { +  FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); +  if (MRB == FMRB_DoesNotAccessMemory) +    // Already perfect! +    return MAK_ReadNone; + +  // Definitions with weak linkage may be overridden at linktime with +  // something that writes memory, so treat them like declarations. +  if (F.isDeclaration() || F.mayBeOverridden()) { +    if (AliasAnalysis::onlyReadsMemory(MRB)) +      return MAK_ReadOnly; + +    // Conservatively assume it writes to memory. +    return MAK_MayWrite; +  } -  // Check if any of the functions in the SCC read or write memory.  If they -  // write memory then they can't be marked readnone or readonly. +  // Scan the function body for instructions that may read or write memory.    bool ReadsMemory = false; -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); - -    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) -      // External node or node we don't want to optimize - assume it may write -      // memory and give up. -      return false; +  for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { +    Instruction *I = &*II; + +    // Some instructions can be ignored even if they read or write memory. +    // Detect these now, skipping to the next instruction if one is found. +    CallSite CS(cast<Value>(I)); +    if (CS) { +      // Ignore calls to functions in the same SCC. +      if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) +        continue; +      FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS); -    AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F); -    if (MRB == AliasAnalysis::DoesNotAccessMemory) -      // Already perfect! -      continue; +      // If the call doesn't access memory, we're done. +      if (!(MRB & MRI_ModRef)) +        continue; -    // Definitions with weak linkage may be overridden at linktime with -    // something that writes memory, so treat them like declarations. -    if (F->isDeclaration() || F->mayBeOverridden()) { -      if (!AliasAnalysis::onlyReadsMemory(MRB)) -        // May write memory.  Just give up. -        return false; +      if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) { +        // The call could access any memory. If that includes writes, give up. +        if (MRB & MRI_Mod) +          return MAK_MayWrite; +        // If it reads, note it. +        if (MRB & MRI_Ref) +          ReadsMemory = true; +        continue; +      } -      ReadsMemory = true; -      continue; -    } +      // Check whether all pointer arguments point to local memory, and +      // ignore calls that only access local memory. +      for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); +           CI != CE; ++CI) { +        Value *Arg = *CI; +        if (!Arg->getType()->isPtrOrPtrVectorTy()) +          continue; -    // Scan the function body for instructions that may read or write memory. 
-    for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { -      Instruction *I = &*II; +        AAMDNodes AAInfo; +        I->getAAMetadata(AAInfo); +        MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); -      // Some instructions can be ignored even if they read or write memory. -      // Detect these now, skipping to the next instruction if one is found. -      CallSite CS(cast<Value>(I)); -      if (CS) { -        // Ignore calls to functions in the same SCC. -        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) +        // Skip accesses to local or constant memory as they don't impact the +        // externally visible mod/ref behavior. +        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))            continue; -        AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS); -        // If the call doesn't access arbitrary memory, we may be able to -        // figure out something. -        if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { -          // If the call does access argument pointees, check each argument. -          if (AliasAnalysis::doesAccessArgPointees(MRB)) -            // Check whether all pointer arguments point to local memory, and -            // ignore calls that only access local memory. -            for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); -                 CI != CE; ++CI) { -              Value *Arg = *CI; -              if (Arg->getType()->isPointerTy()) { -                AAMDNodes AAInfo; -                I->getAAMetadata(AAInfo); - -                MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); -                if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) { -                  if (MRB & AliasAnalysis::Mod) -                    // Writes non-local memory.  Give up. -                    return false; -                  if (MRB & AliasAnalysis::Ref) -                    // Ok, it reads non-local memory. -                    ReadsMemory = true; -                } -              } -            } -          continue; -        } -        // The call could access any memory. If that includes writes, give up. -        if (MRB & AliasAnalysis::Mod) -          return false; -        // If it reads, note it. -        if (MRB & AliasAnalysis::Ref) + +        if (MRB & MRI_Mod) +          // Writes non-local memory.  Give up. +          return MAK_MayWrite; +        if (MRB & MRI_Ref) +          // Ok, it reads non-local memory.            ReadsMemory = true; -        continue; -      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { -        // Ignore non-volatile loads from local memory. (Atomic is okay here.) -        if (!LI->isVolatile()) { -          MemoryLocation Loc = MemoryLocation::get(LI); -          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) -            continue; -        } -      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { -        // Ignore non-volatile stores to local memory. (Atomic is okay here.) -        if (!SI->isVolatile()) { -          MemoryLocation Loc = MemoryLocation::get(SI); -          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) -            continue; -        } -      } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { -        // Ignore vaargs on local memory. 
-        MemoryLocation Loc = MemoryLocation::get(VI); -        if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) +      } +      continue; +    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { +      // Ignore non-volatile loads from local memory. (Atomic is okay here.) +      if (!LI->isVolatile()) { +        MemoryLocation Loc = MemoryLocation::get(LI); +        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) +          continue; +      } +    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { +      // Ignore non-volatile stores to local memory. (Atomic is okay here.) +      if (!SI->isVolatile()) { +        MemoryLocation Loc = MemoryLocation::get(SI); +        if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))            continue;        } +    } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { +      // Ignore vaargs on local memory. +      MemoryLocation Loc = MemoryLocation::get(VI); +      if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) +        continue; +    } -      // Any remaining instructions need to be taken seriously!  Check if they -      // read or write memory. -      if (I->mayWriteToMemory()) -        // Writes memory.  Just give up. -        return false; +    // Any remaining instructions need to be taken seriously!  Check if they +    // read or write memory. +    if (I->mayWriteToMemory()) +      // Writes memory.  Just give up. +      return MAK_MayWrite; + +    // If this instruction may read memory, remember that. +    ReadsMemory |= I->mayReadFromMemory(); +  } + +  return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone; +} -      // If this instruction may read memory, remember that. -      ReadsMemory |= I->mayReadFromMemory(); +/// Deduce readonly/readnone attributes for the SCC. +template <typename AARGetterT> +static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { +  // Check if any of the functions in the SCC read or write memory.  If they +  // write memory then they can't be marked readnone or readonly. +  bool ReadsMemory = false; +  for (Function *F : SCCNodes) { +    // Call the callable parameter to look up AA results for this function. +    AAResults &AAR = AARGetter(*F); + +    switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) { +    case MAK_MayWrite: +      return false; +    case MAK_ReadOnly: +      ReadsMemory = true; +      break; +    case MAK_ReadNone: +      // Nothing to do! +      break;      }    }    // Success!  Functions in this SCC do not access memory, or only read memory.    // Give them the appropriate attribute.    bool MadeChange = false; -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); - +  for (Function *F : SCCNodes) {      if (F->doesNotAccessMemory())        // Already perfect!        continue; @@ -278,11 +246,10 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {      // Clear out any existing attributes.      AttrBuilder B; -    B.addAttribute(Attribute::ReadOnly) -      .addAttribute(Attribute::ReadNone); -    F->removeAttributes(AttributeSet::FunctionIndex, -                        AttributeSet::get(F->getContext(), -                                          AttributeSet::FunctionIndex, B)); +    B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); +    F->removeAttributes( +        AttributeSet::FunctionIndex, +        AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B));      // Add in the new attribute.      
F->addAttribute(AttributeSet::FunctionIndex, @@ -298,124 +265,140 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {  }  namespace { -  // For a given pointer Argument, this retains a list of Arguments of functions -  // in the same SCC that the pointer data flows into. We use this to build an -  // SCC of the arguments. -  struct ArgumentGraphNode { -    Argument *Definition; -    SmallVector<ArgumentGraphNode*, 4> Uses; -  }; - -  class ArgumentGraph { -    // We store pointers to ArgumentGraphNode objects, so it's important that -    // that they not move around upon insert. -    typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy; +/// For a given pointer Argument, this retains a list of Arguments of functions +/// in the same SCC that the pointer data flows into. We use this to build an +/// SCC of the arguments. +struct ArgumentGraphNode { +  Argument *Definition; +  SmallVector<ArgumentGraphNode *, 4> Uses; +}; + +class ArgumentGraph { +  // We store pointers to ArgumentGraphNode objects, so it's important that +  // that they not move around upon insert. +  typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy; + +  ArgumentMapTy ArgumentMap; + +  // There is no root node for the argument graph, in fact: +  //   void f(int *x, int *y) { if (...) f(x, y); } +  // is an example where the graph is disconnected. The SCCIterator requires a +  // single entry point, so we maintain a fake ("synthetic") root node that +  // uses every node. Because the graph is directed and nothing points into +  // the root, it will not participate in any SCCs (except for its own). +  ArgumentGraphNode SyntheticRoot; + +public: +  ArgumentGraph() { SyntheticRoot.Definition = nullptr; } + +  typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator; + +  iterator begin() { return SyntheticRoot.Uses.begin(); } +  iterator end() { return SyntheticRoot.Uses.end(); } +  ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } + +  ArgumentGraphNode *operator[](Argument *A) { +    ArgumentGraphNode &Node = ArgumentMap[A]; +    Node.Definition = A; +    SyntheticRoot.Uses.push_back(&Node); +    return &Node; +  } +}; -    ArgumentMapTy ArgumentMap; +/// This tracker checks whether callees are in the SCC, and if so it does not +/// consider that a capture, instead adding it to the "Uses" list and +/// continuing with the analysis. +struct ArgumentUsesTracker : public CaptureTracker { +  ArgumentUsesTracker(const SCCNodeSet &SCCNodes) +      : Captured(false), SCCNodes(SCCNodes) {} -    // There is no root node for the argument graph, in fact: -    //   void f(int *x, int *y) { if (...) f(x, y); } -    // is an example where the graph is disconnected. The SCCIterator requires a -    // single entry point, so we maintain a fake ("synthetic") root node that -    // uses every node. Because the graph is directed and nothing points into -    // the root, it will not participate in any SCCs (except for its own). 
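For context, a minimal stand-alone C++ sketch (hypothetical code, not taken from this patch) of the disconnected-graph situation the synthetic root exists to handle: two functions whose pointer parameters flow only into each other, so those parameters form a single argument-SCC with no natural entry point.

#include <cstdio>

static void g(int *q);

// f's parameter p flows only into g's parameter q, and vice versa, so the
// two arguments end up in one argument-SCC of the graph described above.
static void f(int *p) {
  if (*p > 0) {
    --*p;
    g(p);
  }
}

static void g(int *q) {
  if (*q > 1)
    f(q);
}

int main() {
  int n = 5;
  f(&n);
  std::printf("%d\n", n);
  return 0;
}

Neither call stores or returns its pointer, so once the whole argument-SCC is analyzed together both parameters can safely be treated as non-capturing.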
-    ArgumentGraphNode SyntheticRoot; +  void tooManyUses() override { Captured = true; } -  public: -    ArgumentGraph() { SyntheticRoot.Definition = nullptr; } +  bool captured(const Use *U) override { +    CallSite CS(U->getUser()); +    if (!CS.getInstruction()) { +      Captured = true; +      return true; +    } -    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator; +    Function *F = CS.getCalledFunction(); +    if (!F || F->isDeclaration() || F->mayBeOverridden() || +        !SCCNodes.count(F)) { +      Captured = true; +      return true; +    } -    iterator begin() { return SyntheticRoot.Uses.begin(); } -    iterator end() { return SyntheticRoot.Uses.end(); } -    ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } +    // Note: the callee and the two successor blocks *follow* the argument +    // operands.  This means there is no need to adjust UseIndex to account for +    // these. -    ArgumentGraphNode *operator[](Argument *A) { -      ArgumentGraphNode &Node = ArgumentMap[A]; -      Node.Definition = A; -      SyntheticRoot.Uses.push_back(&Node); -      return &Node; -    } -  }; +    unsigned UseIndex = +        std::distance(const_cast<const Use *>(CS.arg_begin()), U); -  // This tracker checks whether callees are in the SCC, and if so it does not -  // consider that a capture, instead adding it to the "Uses" list and -  // continuing with the analysis. -  struct ArgumentUsesTracker : public CaptureTracker { -    ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes) -      : Captured(false), SCCNodes(SCCNodes) {} +    assert(UseIndex < CS.data_operands_size() && +           "Indirect function calls should have been filtered above!"); -    void tooManyUses() override { Captured = true; } +    if (UseIndex >= CS.getNumArgOperands()) { +      // Data operand, but not a argument operand -- must be a bundle operand +      assert(CS.hasOperandBundles() && "Must be!"); -    bool captured(const Use *U) override { -      CallSite CS(U->getUser()); -      if (!CS.getInstruction()) { Captured = true; return true; } +      // CaptureTracking told us that we're being captured by an operand bundle +      // use.  In this case it does not matter if the callee is within our SCC +      // or not -- we've been captured in some unknown way, and we have to be +      // conservative. +      Captured = true; +      return true; +    } -      Function *F = CS.getCalledFunction(); -      if (!F || !SCCNodes.count(F)) { Captured = true; return true; } - -      bool Found = false; -      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); -      for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end(); -           PI != PE; ++PI, ++AI) { -        if (AI == AE) { -          assert(F->isVarArg() && "More params than args in non-varargs call"); -          Captured = true; -          return true; -        } -        if (PI == U) { -          Uses.push_back(AI); -          Found = true; -          break; -        } -      } -      assert(Found && "Capturing call-site captured nothing?"); -      (void)Found; -      return false; +    if (UseIndex >= F->arg_size()) { +      assert(F->isVarArg() && "More params than args in non-varargs call"); +      Captured = true; +      return true;      } -    bool Captured;  // True only if certainly captured (used outside our SCC). -    SmallVector<Argument*, 4> Uses;  // Uses within our SCC. 
+    Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); +    return false; +  } -    const SmallPtrSet<Function*, 8> &SCCNodes; -  }; +  bool Captured; // True only if certainly captured (used outside our SCC). +  SmallVector<Argument *, 4> Uses; // Uses within our SCC. + +  const SCCNodeSet &SCCNodes; +};  }  namespace llvm { -  template<> struct GraphTraits<ArgumentGraphNode*> { -    typedef ArgumentGraphNode NodeType; -    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType; +template <> struct GraphTraits<ArgumentGraphNode *> { +  typedef ArgumentGraphNode NodeType; +  typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; -    static inline NodeType *getEntryNode(NodeType *A) { return A; } -    static inline ChildIteratorType child_begin(NodeType *N) { -      return N->Uses.begin(); -    } -    static inline ChildIteratorType child_end(NodeType *N) { -      return N->Uses.end(); -    } -  }; -  template<> struct GraphTraits<ArgumentGraph*> -    : public GraphTraits<ArgumentGraphNode*> { -    static NodeType *getEntryNode(ArgumentGraph *AG) { -      return AG->getEntryNode(); -    } -    static ChildIteratorType nodes_begin(ArgumentGraph *AG) { -      return AG->begin(); -    } -    static ChildIteratorType nodes_end(ArgumentGraph *AG) { -      return AG->end(); -    } -  }; +  static inline NodeType *getEntryNode(NodeType *A) { return A; } +  static inline ChildIteratorType child_begin(NodeType *N) { +    return N->Uses.begin(); +  } +  static inline ChildIteratorType child_end(NodeType *N) { +    return N->Uses.end(); +  } +}; +template <> +struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { +  static NodeType *getEntryNode(ArgumentGraph *AG) { +    return AG->getEntryNode(); +  } +  static ChildIteratorType nodes_begin(ArgumentGraph *AG) { +    return AG->begin(); +  } +  static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); } +};  } -// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. +/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.  static Attribute::AttrKind  determinePointerReadAttrs(Argument *A, -                          const SmallPtrSet<Argument*, 8> &SCCNodes) { -                                                        -  SmallVector<Use*, 32> Worklist; -  SmallSet<Use*, 32> Visited; -  int Count = 0; +                          const SmallPtrSet<Argument *, 8> &SCCNodes) { + +  SmallVector<Use *, 32> Worklist; +  SmallSet<Use *, 32> Visited;    // inalloca arguments are always clobbered by the call.    if (A->hasInAllocaAttr()) @@ -425,9 +408,6 @@ determinePointerReadAttrs(Argument *A,    // We don't need to track IsWritten. If A is written to, return immediately.    
for (Use &U : A->uses()) { -    if (Count++ >= 20) -      return Attribute::None; -      Visited.insert(&U);      Worklist.push_back(&U);    } @@ -435,7 +415,6 @@ determinePointerReadAttrs(Argument *A,    while (!Worklist.empty()) {      Use *U = Worklist.pop_back_val();      Instruction *I = cast<Instruction>(U->getUser()); -    Value *V = U->get();      switch (I->getOpcode()) {      case Instruction::BitCast: @@ -479,24 +458,44 @@ determinePointerReadAttrs(Argument *A,          return Attribute::None;        } -      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); -      CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); -      for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) { -        if (A->get() == V) { -          if (AI == AE) { -            assert(F->isVarArg() && -                   "More params than args in non-varargs call."); -            return Attribute::None; -          } -          Captures &= !CS.doesNotCapture(A - B); -          if (SCCNodes.count(AI)) -            continue; -          if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B)) -            return Attribute::None; -          if (!CS.doesNotAccessMemory(A - B)) -            IsRead = true; -        } +      // Note: the callee and the two successor blocks *follow* the argument +      // operands.  This means there is no need to adjust UseIndex to account +      // for these. + +      unsigned UseIndex = std::distance(CS.arg_begin(), U); + +      // U cannot be the callee operand use: since we're exploring the +      // transitive uses of an Argument, having such a use be a callee would +      // imply the CallSite is an indirect call or invoke; and we'd take the +      // early exit above. +      assert(UseIndex < CS.data_operands_size() && +             "Data operand use expected!"); + +      bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands(); + +      if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { +        assert(F->isVarArg() && "More params than args in non-varargs call"); +        return Attribute::None;        } + +      Captures &= !CS.doesNotCapture(UseIndex); + +      // Since the optimizer (by design) cannot see the data flow corresponding +      // to a operand bundle use, these cannot participate in the optimistic SCC +      // analysis.  Instead, we model the operand bundle uses as arguments in +      // call to a function external to the SCC. +      if (!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex)) || +          IsOperandBundleUse) { + +        // The accessors used on CallSite here do the right thing for calls and +        // invokes with operand bundles. + +        if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex)) +          return Attribute::None; +        if (!CS.doesNotAccessMemory(UseIndex)) +          IsRead = true; +      } +        AddUsersToWorklistIfCapturing();        break;      } @@ -517,21 +516,10 @@ determinePointerReadAttrs(Argument *A,    return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;  } -/// AddArgumentAttrs - Deduce nocapture attributes for the SCC. -bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { +/// Deduce nocapture attributes for the SCC. +static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {    bool Changed = false; -  SmallPtrSet<Function*, 8> SCCNodes; - -  // Fill SCCNodes with the elements of the SCC.  Used for quickly -  // looking up whether a given CallGraphNode is in this SCC. 
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); -    if (F && !F->isDeclaration() && !F->mayBeOverridden() && -        !F->hasFnAttribute(Attribute::OptimizeNone)) -      SCCNodes.insert(F); -  } -    ArgumentGraph AG;    AttrBuilder B; @@ -539,14 +527,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {    // Check each function in turn, determining which pointer arguments are not    // captured. -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); - -    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) -      // External node or function we're trying not to optimize - only a problem -      // for arguments that we pass to it. -      continue; - +  for (Function *F : SCCNodes) {      // Definitions with weak linkage may be overridden at linktime with      // something that captures pointers, so treat them like declarations.      if (F->isDeclaration() || F->mayBeOverridden()) @@ -556,8 +537,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {      // a value can't capture arguments. Don't analyze them.      if (F->onlyReadsMemory() && F->doesNotThrow() &&          F->getReturnType()->isVoidTy()) { -      for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); -           A != E; ++A) { +      for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; +           ++A) {          if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {            A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));            ++NumNoCapture; @@ -567,26 +548,30 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {        continue;      } -    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); -         A != E; ++A) { -      if (!A->getType()->isPointerTy()) continue; +    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; +         ++A) { +      if (!A->getType()->isPointerTy()) +        continue;        bool HasNonLocalUses = false;        if (!A->hasNoCaptureAttr()) {          ArgumentUsesTracker Tracker(SCCNodes); -        PointerMayBeCaptured(A, &Tracker); +        PointerMayBeCaptured(&*A, &Tracker);          if (!Tracker.Captured) {            if (Tracker.Uses.empty()) {              // If it's trivially not captured, mark it nocapture now. -            A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B)); +            A->addAttr( +                AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));              ++NumNoCapture;              Changed = true;            } else {              // If it's not trivially captured and not trivially not captured,              // then it must be calling into another function in our SCC. Save              // its particulars for Argument-SCC analysis later. 
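As a rough source-level illustration (hypothetical code, not from this patch) of the distinction the capture analysis draws: a pointer that is only read through is a nocapture candidate, while a pointer stored into a global escapes the function and counts as captured.

#include <cstdio>

static int *Saved = nullptr;

// p is only loaded from, never stored, returned, or otherwise leaked, so it
// is the kind of argument that can be marked nocapture.
static int readThrough(const int *p) {
  return *p;
}

// q escapes into a global that the caller can observe later, so it is
// considered captured.
static void stash(int *q) {
  Saved = q;
}

int main() {
  int a = 1, b = 2;
  std::printf("%d\n", readThrough(&a));
  stash(&b);
  return Saved == &b ? 0 : 1;
}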
-            ArgumentGraphNode *Node = AG[A]; -            for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(), -                     UE = Tracker.Uses.end(); UI != UE; ++UI) { +            ArgumentGraphNode *Node = AG[&*A]; +            for (SmallVectorImpl<Argument *>::iterator +                     UI = Tracker.Uses.begin(), +                     UE = Tracker.Uses.end(); +                 UI != UE; ++UI) {                Node->Uses.push_back(AG[*UI]);                if (*UI != A)                  HasNonLocalUses = true; @@ -600,9 +585,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {          // Note that we don't allow any calls at all here, or else our result          // will be dependent on the iteration order through the functions in the          // SCC. -        SmallPtrSet<Argument*, 8> Self; -        Self.insert(A); -        Attribute::AttrKind R = determinePointerReadAttrs(A, Self); +        SmallPtrSet<Argument *, 8> Self; +        Self.insert(&*A); +        Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);          if (R != Attribute::None) {            AttrBuilder B;            B.addAttribute(R); @@ -621,10 +606,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {    // made.  If the definition doesn't have a 'nocapture' attribute by now, it    // captures. -  for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG); !I.isAtEnd(); ++I) { +  for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {      const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;      if (ArgumentSCC.size() == 1) { -      if (!ArgumentSCC[0]->Definition) continue;  // synthetic root node +      if (!ArgumentSCC[0]->Definition) +        continue; // synthetic root node        // eg. "void f(int* x) { if (...) f(x); }"        if (ArgumentSCC[0]->Uses.size() == 1 && @@ -646,9 +632,10 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {            SCCCaptured = true;        }      } -    if (SCCCaptured) continue; +    if (SCCCaptured) +      continue; -    SmallPtrSet<Argument*, 8> ArgumentSCCNodes; +    SmallPtrSet<Argument *, 8> ArgumentSCCNodes;      // Fill ArgumentSCCNodes with the elements of the ArgumentSCC.  Used for      // quickly looking up whether a given Argument is in this ArgumentSCC.      
for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) { @@ -658,8 +645,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {      for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();           I != E && !SCCCaptured; ++I) {        ArgumentGraphNode *N = *I; -      for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(), -             UE = N->Uses.end(); UI != UE; ++UI) { +      for (SmallVectorImpl<ArgumentGraphNode *>::iterator UI = N->Uses.begin(), +                                                          UE = N->Uses.end(); +           UI != UE; ++UI) {          Argument *A = (*UI)->Definition;          if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))            continue; @@ -667,7 +655,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {          break;        }      } -    if (SCCCaptured) continue; +    if (SCCCaptured) +      continue;      for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {        Argument *A = ArgumentSCC[i]->Definition; @@ -704,8 +693,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {      if (ReadAttr != Attribute::None) {        AttrBuilder B, R;        B.addAttribute(ReadAttr); -      R.addAttribute(Attribute::ReadOnly) -        .addAttribute(Attribute::ReadNone); +      R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);        for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {          Argument *A = ArgumentSCC[i]->Definition;          // Clear out existing readonly/readnone attributes @@ -720,10 +708,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {    return Changed;  } -/// IsFunctionMallocLike - A function is malloc-like if it returns either null -/// or a pointer that doesn't alias any other pointer visible to the caller. -bool FunctionAttrs::IsFunctionMallocLike(Function *F, -                              SmallPtrSet<Function*, 8> &SCCNodes) const { +/// Tests whether a function is "malloc-like". +/// +/// A function is "malloc-like" if it returns either null or a pointer that +/// doesn't alias any other pointer visible to the caller. +static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {    SmallSetVector<Value *, 8> FlowsToReturn;    for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)      if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) @@ -744,39 +733,38 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,      if (Instruction *RVI = dyn_cast<Instruction>(RetVal))        switch (RVI->getOpcode()) { -        // Extend the analysis by looking upwards. -        case Instruction::BitCast: -        case Instruction::GetElementPtr: -        case Instruction::AddrSpaceCast: -          FlowsToReturn.insert(RVI->getOperand(0)); -          continue; -        case Instruction::Select: { -          SelectInst *SI = cast<SelectInst>(RVI); -          FlowsToReturn.insert(SI->getTrueValue()); -          FlowsToReturn.insert(SI->getFalseValue()); -          continue; -        } -        case Instruction::PHI: { -          PHINode *PN = cast<PHINode>(RVI); -          for (Value *IncValue : PN->incoming_values()) -            FlowsToReturn.insert(IncValue); -          continue; -        } +      // Extend the analysis by looking upwards. 
+      case Instruction::BitCast: +      case Instruction::GetElementPtr: +      case Instruction::AddrSpaceCast: +        FlowsToReturn.insert(RVI->getOperand(0)); +        continue; +      case Instruction::Select: { +        SelectInst *SI = cast<SelectInst>(RVI); +        FlowsToReturn.insert(SI->getTrueValue()); +        FlowsToReturn.insert(SI->getFalseValue()); +        continue; +      } +      case Instruction::PHI: { +        PHINode *PN = cast<PHINode>(RVI); +        for (Value *IncValue : PN->incoming_values()) +          FlowsToReturn.insert(IncValue); +        continue; +      } -        // Check whether the pointer came from an allocation. -        case Instruction::Alloca: +      // Check whether the pointer came from an allocation. +      case Instruction::Alloca: +        break; +      case Instruction::Call: +      case Instruction::Invoke: { +        CallSite CS(RVI); +        if (CS.paramHasAttr(0, Attribute::NoAlias)) +          break; +        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))            break; -        case Instruction::Call: -        case Instruction::Invoke: { -          CallSite CS(RVI); -          if (CS.paramHasAttr(0, Attribute::NoAlias)) -            break; -          if (CS.getCalledFunction() && -              SCCNodes.count(CS.getCalledFunction())) -            break; -        } // fall-through -        default: -          return false;  // Did not come from an allocation. +      } // fall-through +      default: +        return false; // Did not come from an allocation.        }      if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false)) @@ -786,24 +774,11 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,    return true;  } -/// AddNoAliasAttrs - Deduce noalias attributes for the SCC. -bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { -  SmallPtrSet<Function*, 8> SCCNodes; - -  // Fill SCCNodes with the elements of the SCC.  Used for quickly -  // looking up whether a given CallGraphNode is in this SCC. -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) -    SCCNodes.insert((*I)->getFunction()); - +/// Deduce noalias attributes for the SCC. +static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {    // Check each function in turn, determining which functions return noalias    // pointers. -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); - -    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) -      // External node or node we don't want to optimize - skip it; -      return false; - +  for (Function *F : SCCNodes) {      // Already noalias.      if (F->doesNotAlias(0))        continue; @@ -813,18 +788,17 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {      if (F->isDeclaration() || F->mayBeOverridden())        return false; -    // We annotate noalias return values, which are only applicable to  +    // We annotate noalias return values, which are only applicable to      // pointer types.      
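A small illustrative sketch (hypothetical code, not from this patch) of what "malloc-like" means at the source level: the function returns either null or a freshly allocated pointer that the caller cannot already alias, which is exactly the shape that lets its return value be marked noalias.

#include <cstdlib>

// Returns either null or memory that nothing visible to the caller aliases,
// so the return value behaves like malloc's and could be marked noalias.
static int *makeBuffer(std::size_t n) {
  if (n == 0)
    return nullptr;
  return static_cast<int *>(std::malloc(n * sizeof(int)));
}

int main() {
  int *buf = makeBuffer(8);
  std::free(buf);   // freeing a null pointer is also well defined
  return 0;
}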
if (!F->getReturnType()->isPointerTy())        continue; -    if (!IsFunctionMallocLike(F, SCCNodes)) +    if (!isFunctionMallocLike(F, SCCNodes))        return false;    }    bool MadeChange = false; -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); +  for (Function *F : SCCNodes) {      if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy())        continue; @@ -836,880 +810,249 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {    return MadeChange;  } -/// inferPrototypeAttributes - Analyze the name and prototype of the -/// given function and set any applicable attributes.  Returns true -/// if any attributes were set and false otherwise. -bool FunctionAttrs::inferPrototypeAttributes(Function &F) { -  if (F.hasFnAttribute(Attribute::OptimizeNone)) -    return false; +/// Tests whether this function is known to not return null. +/// +/// Requires that the function returns a pointer. +/// +/// Returns true if it believes the function will not return a null, and sets +/// \p Speculative based on whether the returned conclusion is a speculative +/// conclusion due to SCC calls. +static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, +                            const TargetLibraryInfo &TLI, bool &Speculative) { +  assert(F->getReturnType()->isPointerTy() && +         "nonnull only meaningful on pointer types"); +  Speculative = false; -  FunctionType *FTy = F.getFunctionType(); -  LibFunc::Func TheLibFunc; -  if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc))) -    return false; +  SmallSetVector<Value *, 8> FlowsToReturn; +  for (BasicBlock &BB : *F) +    if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) +      FlowsToReturn.insert(Ret->getReturnValue()); -  switch (TheLibFunc) { -  case LibFunc::strlen: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::strchr: -  case LibFunc::strrchr: -    if (FTy->getNumParams() != 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isIntegerTy()) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    break; -  case LibFunc::strtol: -  case LibFunc::strtod: -  case LibFunc::strtof: -  case LibFunc::strtoul: -  case LibFunc::strtoll: -  case LibFunc::strtold: -  case LibFunc::strtoull: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::strcpy: -  case LibFunc::stpcpy: -  case LibFunc::strcat: -  case LibFunc::strncat: -  case LibFunc::strncpy: -  case LibFunc::stpncpy: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::strxfrm: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::strcmp: //0,1 -    case LibFunc::strspn: // 0,1 -    case LibFunc::strncmp: // 0,1 -    case LibFunc::strcspn: //0,1 -    case LibFunc::strcoll: //0,1 -    case 
LibFunc::strcasecmp:  // 0,1 -    case LibFunc::strncasecmp: //  -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::strstr: -  case LibFunc::strpbrk: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::strtok: -  case LibFunc::strtok_r: -    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::scanf: -    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::setbuf: -  case LibFunc::setvbuf: -    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::strdup: -  case LibFunc::strndup: -    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::stat: -  case LibFunc::statvfs: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::sscanf: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::sprintf: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::snprintf: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 3); -    setOnlyReadsMemory(F, 3); -    break; -  case LibFunc::setitimer: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(1)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setDoesNotCapture(F, 3); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::system: -    if (FTy->getNumParams() != 1 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    // May throw; "system" is a valid pthread cancellation point. 
-    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::malloc: -    if (FTy->getNumParams() != 1 || -        !FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::memcmp: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::memchr: -  case LibFunc::memrchr: -    if (FTy->getNumParams() != 3) -      return false; -    setOnlyReadsMemory(F); -    setDoesNotThrow(F); -    break; -  case LibFunc::modf: -  case LibFunc::modff: -  case LibFunc::modfl: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::memcpy: -  case LibFunc::memccpy: -  case LibFunc::memmove: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::memalign: -    if (!FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::mkdir: -    if (FTy->getNumParams() == 0 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::mktime: -    if (FTy->getNumParams() == 0 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::realloc: -    if (FTy->getNumParams() != 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::read: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    // May throw; "read" is a valid pthread cancellation point. 
-    setDoesNotCapture(F, 2); -    break; -  case LibFunc::rewind: -    if (FTy->getNumParams() < 1 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::rmdir: -  case LibFunc::remove: -  case LibFunc::realpath: -    if (FTy->getNumParams() < 1 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::rename: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::readlink: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::write: -    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    // May throw; "write" is a valid pthread cancellation point. -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::bcopy: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::bcmp: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setOnlyReadsMemory(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::bzero: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::calloc: -    if (FTy->getNumParams() != 2 || -        !FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::chmod: -  case LibFunc::chown: -    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::ctermid: -  case LibFunc::clearerr: -  case LibFunc::closedir: -    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::atoi: -  case LibFunc::atol: -  case LibFunc::atof: -  case LibFunc::atoll: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setOnlyReadsMemory(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::access: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::fopen: -    if (FTy->getNumParams() != 2 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy() 
|| -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::fdopen: -    if (FTy->getNumParams() != 2 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::feof: -  case LibFunc::free: -  case LibFunc::fseek: -  case LibFunc::ftell: -  case LibFunc::fgetc: -  case LibFunc::fseeko: -  case LibFunc::ftello: -  case LibFunc::fileno: -  case LibFunc::fflush: -  case LibFunc::fclose: -  case LibFunc::fsetpos: -  case LibFunc::flockfile: -  case LibFunc::funlockfile: -  case LibFunc::ftrylockfile: -    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::ferror: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F); -    break; -  case LibFunc::fputc: -  case LibFunc::fstat: -  case LibFunc::frexp: -  case LibFunc::frexpf: -  case LibFunc::frexpl: -  case LibFunc::fstatvfs: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::fgets: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 3); -    break; -  case LibFunc::fread: -    if (FTy->getNumParams() != 4 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(3)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 4); -    break; -  case LibFunc::fwrite: -    if (FTy->getNumParams() != 4 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(3)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 4); -    break; -  case LibFunc::fputs: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::fscanf: -  case LibFunc::fprintf: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::fgetpos: -    if (FTy->getNumParams() < 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::getc: -  case LibFunc::getlogin_r: -  case LibFunc::getc_unlocked: -    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 
1); -    break; -  case LibFunc::getenv: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setOnlyReadsMemory(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::gets: -  case LibFunc::getchar: -    setDoesNotThrow(F); -    break; -  case LibFunc::getitimer: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::getpwnam: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::ungetc: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::uname: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::unlink: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::unsetenv: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::utime: -  case LibFunc::utimes: -    if (FTy->getNumParams() != 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::putc: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::puts: -  case LibFunc::printf: -  case LibFunc::perror: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::pread: -    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    // May throw; "pread" is a valid pthread cancellation point. -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::pwrite: -    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    // May throw; "pwrite" is a valid pthread cancellation point. 
-    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::putchar: -    setDoesNotThrow(F); -    break; -  case LibFunc::popen: -    if (FTy->getNumParams() != 2 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::pclose: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::vscanf: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::vsscanf: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(1)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::vfscanf: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(1)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::valloc: -    if (!FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::vprintf: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::vfprintf: -  case LibFunc::vsprintf: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::vsnprintf: -    if (FTy->getNumParams() != 4 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(2)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 3); -    setOnlyReadsMemory(F, 3); -    break; -  case LibFunc::open: -    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    // May throw; "open" is a valid pthread cancellation point. 
-    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::opendir: -    if (FTy->getNumParams() != 1 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::tmpfile: -    if (!FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::times: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::htonl: -  case LibFunc::htons: -  case LibFunc::ntohl: -  case LibFunc::ntohs: -    setDoesNotThrow(F); -    setDoesNotAccessMemory(F); -    break; -  case LibFunc::lstat: -    if (FTy->getNumParams() != 2 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::lchown: -    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::qsort: -    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) -      return false; -    // May throw; places call through function pointer. -    setDoesNotCapture(F, 4); -    break; -  case LibFunc::dunder_strdup: -  case LibFunc::dunder_strndup: -    if (FTy->getNumParams() < 1 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::dunder_strtok_r: -    if (FTy->getNumParams() != 3 || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::under_IO_getc: -    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::under_IO_putc: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::dunder_isoc99_scanf: -    if (FTy->getNumParams() < 1 || -        !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::stat64: -  case LibFunc::lstat64: -  case LibFunc::statvfs64: -    if (FTy->getNumParams() < 1 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::dunder_isoc99_sscanf: -    if (FTy->getNumParams() < 1 || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    
break; -  case LibFunc::fopen64: -    if (FTy->getNumParams() != 2 || -        !FTy->getReturnType()->isPointerTy() || -        !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    setOnlyReadsMemory(F, 1); -    setOnlyReadsMemory(F, 2); -    break; -  case LibFunc::fseeko64: -  case LibFunc::ftello64: -    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    break; -  case LibFunc::tmpfile64: -    if (!FTy->getReturnType()->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotAlias(F, 0); -    break; -  case LibFunc::fstat64: -  case LibFunc::fstatvfs64: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) -      return false; -    setDoesNotThrow(F); -    setDoesNotCapture(F, 2); -    break; -  case LibFunc::open64: -    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) +  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { +    Value *RetVal = FlowsToReturn[i]; + +    // If this value is locally known to be non-null, we're good +    if (isKnownNonNull(RetVal, &TLI)) +      continue; + +    // Otherwise, we need to look upwards since we can't make any local +    // conclusions. +    Instruction *RVI = dyn_cast<Instruction>(RetVal); +    if (!RVI)        return false; -    // May throw; "open" is a valid pthread cancellation point. -    setDoesNotCapture(F, 1); -    setOnlyReadsMemory(F, 1); -    break; -  case LibFunc::gettimeofday: -    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || -        !FTy->getParamType(1)->isPointerTy()) +    switch (RVI->getOpcode()) { +    // Extend the analysis by looking upwards. +    case Instruction::BitCast: +    case Instruction::GetElementPtr: +    case Instruction::AddrSpaceCast: +      FlowsToReturn.insert(RVI->getOperand(0)); +      continue; +    case Instruction::Select: { +      SelectInst *SI = cast<SelectInst>(RVI); +      FlowsToReturn.insert(SI->getTrueValue()); +      FlowsToReturn.insert(SI->getFalseValue()); +      continue; +    } +    case Instruction::PHI: { +      PHINode *PN = cast<PHINode>(RVI); +      for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) +        FlowsToReturn.insert(PN->getIncomingValue(i)); +      continue; +    } +    case Instruction::Call: +    case Instruction::Invoke: { +      CallSite CS(RVI); +      Function *Callee = CS.getCalledFunction(); +      // A call to a node within the SCC is assumed to return null until +      // proven otherwise +      if (Callee && SCCNodes.count(Callee)) { +        Speculative = true; +        continue; +      }        return false; -    // Currently some platforms have the restrict keyword on the arguments to -    // gettimeofday. To be conservative, do not add noalias to gettimeofday's -    // arguments. -    setDoesNotThrow(F); -    setDoesNotCapture(F, 1); -    setDoesNotCapture(F, 2); -    break; -  default: -    // Didn't mark any attributes. -    return false; +    } +    default: +      return false; // Unknown source, may be null +    }; +    llvm_unreachable("should have either continued or returned");    }    return true;  } -/// annotateLibraryCalls - Adds attributes to well-known standard library -/// call declarations. 
-bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { +/// Deduce nonnull attributes for the SCC. +static bool addNonNullAttrs(const SCCNodeSet &SCCNodes, +                            const TargetLibraryInfo &TLI) { +  // Speculative that all functions in the SCC return only nonnull +  // pointers.  We may refute this as we analyze functions. +  bool SCCReturnsNonNull = true; +    bool MadeChange = false; -  // Check each function in turn annotating well-known library function -  // declarations with attributes. -  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -    Function *F = (*I)->getFunction(); +  // Check each function in turn, determining which functions return nonnull +  // pointers. +  for (Function *F : SCCNodes) { +    // Already nonnull. +    if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, +                                        Attribute::NonNull)) +      continue; + +    // Definitions with weak linkage may be overridden at linktime, so +    // treat them like declarations. +    if (F->isDeclaration() || F->mayBeOverridden()) +      return false; + +    // We annotate nonnull return values, which are only applicable to +    // pointer types. +    if (!F->getReturnType()->isPointerTy()) +      continue; -    if (F && F->isDeclaration()) -      MadeChange |= inferPrototypeAttributes(*F); +    bool Speculative = false; +    if (isReturnNonNull(F, SCCNodes, TLI, Speculative)) { +      if (!Speculative) { +        // Mark the function eagerly since we may discover a function +        // which prevents us from speculating about the entire SCC +        DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); +        F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); +        ++NumNonNullReturn; +        MadeChange = true; +      } +      continue; +    } +    // At least one function returns something which could be null, can't +    // speculate any more. +    SCCReturnsNonNull = false; +  } + +  if (SCCReturnsNonNull) { +    for (Function *F : SCCNodes) { +      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, +                                          Attribute::NonNull) || +          !F->getReturnType()->isPointerTy()) +        continue; + +      DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); +      F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); +      ++NumNonNullReturn; +      MadeChange = true; +    }    }    return MadeChange;  } +static bool setDoesNotRecurse(Function &F) { +  if (F.doesNotRecurse()) +    return false; +  F.setDoesNotRecurse(); +  ++NumNoRecurse; +  return true; +} + +static bool addNoRecurseAttrs(const CallGraphSCC &SCC, +                              SmallVectorImpl<WeakVH> &Revisit) { +  // Try and identify functions that do not recurse. + +  // If the SCC contains multiple nodes we know for sure there is recursion. +  if (!SCC.isSingular()) +    return false; + +  const CallGraphNode *CGN = *SCC.begin(); +  Function *F = CGN->getFunction(); +  if (!F || F->isDeclaration() || F->doesNotRecurse()) +    return false; + +  // If all of the calls in F are identifiable and are to norecurse functions, F +  // is norecurse. This check also detects self-recursion as F is not currently +  // marked norecurse, so any called from F to F will not be marked norecurse. 
+  if (std::all_of(CGN->begin(), CGN->end(), +                  [](const CallGraphNode::CallRecord &CR) { +                    Function *F = CR.second->getFunction(); +                    return F && F->doesNotRecurse(); +                  })) +    // Function calls a potentially recursive function. +    return setDoesNotRecurse(*F); + +  // We know that F is not obviously recursive, but we haven't been able to +  // prove that it doesn't actually recurse. Add it to the Revisit list to try +  // again top-down later. +  Revisit.push_back(F); +  return false; +} + +static bool addNoRecurseAttrsTopDownOnly(Function *F) { +  // If F is internal and all uses are in norecurse functions, then F is also +  // norecurse. +  if (F->doesNotRecurse()) +    return false; +  if (F->hasInternalLinkage()) { +    for (auto *U : F->users()) +      if (auto *I = dyn_cast<Instruction>(U)) { +        if (!I->getParent()->getParent()->doesNotRecurse()) +          return false; +      } else { +        return false; +      } +    return setDoesNotRecurse(*F); +  } +  return false; +} +  bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { -  AA = &getAnalysis<AliasAnalysis>();    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +  bool Changed = false; -  bool Changed = annotateLibraryCalls(SCC); -  Changed |= AddReadAttrs(SCC); -  Changed |= AddArgumentAttrs(SCC); -  Changed |= AddNoAliasAttrs(SCC); +  // We compute dedicated AA results for each function in the SCC as needed. We +  // use a lambda referencing external objects so that they live long enough to +  // be queried, but we re-use them each time. +  Optional<BasicAAResult> BAR; +  Optional<AAResults> AAR; +  auto AARGetter = [&](Function &F) -> AAResults & { +    BAR.emplace(createLegacyPMBasicAAResult(*this, F)); +    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); +    return *AAR; +  }; + +  // Fill SCCNodes with the elements of the SCC. Used for quickly looking up +  // whether a given CallGraphNode is in this SCC. Also track whether there are +  // any external or opt-none nodes that will prevent us from optimizing any +  // part of the SCC. +  SCCNodeSet SCCNodes; +  bool ExternalNode = false; +  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { +    Function *F = (*I)->getFunction(); +    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) { +      // External node or function we're trying not to optimize - we both avoid +      // transform them and avoid leveraging information they provide. +      ExternalNode = true; +      continue; +    } + +    SCCNodes.insert(F); +  } + +  Changed |= addReadAttrs(SCCNodes, AARGetter); +  Changed |= addArgumentAttrs(SCCNodes); + +  // If we have no external nodes participating in the SCC, we can deduce some +  // more precise attributes as well. +  if (!ExternalNode) { +    Changed |= addNoAliasAttrs(SCCNodes); +    Changed |= addNonNullAttrs(SCCNodes, *TLI); +  } +   +  Changed |= addNoRecurseAttrs(SCC, Revisit); +  return Changed; +} + +bool FunctionAttrs::doFinalization(CallGraph &CG) { +  bool Changed = false; +  // When iterating over SCCs we visit functions in a bottom-up fashion. Some of +  // the rules we have for identifying norecurse functions work best with a +  // top-down walk, so look again at all the functions we previously marked as +  // worth revisiting, in top-down order. 
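To make the bottom-up norecurse rule concrete, a brief hypothetical C++ sketch (not from this patch): a call-free function is trivially non-recursive, a function whose only callees are already norecurse can be marked by the std::all_of check above, and a self-recursive function never qualifies.

#include <cstdio>

// Makes no calls at all, so it is trivially norecurse.
static int leaf(int x) { return x + 1; }

// Only calls leaf, which is already norecurse, so the bottom-up check can
// mark helper as well.
static int helper(int x) { return leaf(x) * 2; }

// Calls itself, so it can never be marked norecurse.
static int count(int n) { return n <= 0 ? 0 : 1 + count(n - 1); }

int main() {
  std::printf("%d %d\n", helper(3), count(4));
  return 0;
}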
+  for (auto &F : reverse(Revisit)) +    if (F) +      Changed |= addNoRecurseAttrsTopDownOnly(cast<Function>((Value*)F));    return Changed;  } diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp new file mode 100644 index 000000000000..d8b677b966f2 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -0,0 +1,433 @@ +//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Function import based on summaries. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/FunctionImport.h" + +#include "llvm/ADT/StringSet.h" +#include "llvm/IR/AutoUpgrade.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Object/FunctionIndexObjectFile.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/SourceMgr.h" + +#include <map> + +using namespace llvm; + +#define DEBUG_TYPE "function-import" + +/// Limit on instruction count of imported functions. +static cl::opt<unsigned> ImportInstrLimit( +    "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"), +    cl::desc("Only import functions with less than N instructions")); + +// Load lazily a module from \p FileName in \p Context. +static std::unique_ptr<Module> loadFile(const std::string &FileName, +                                        LLVMContext &Context) { +  SMDiagnostic Err; +  DEBUG(dbgs() << "Loading '" << FileName << "'\n"); +  std::unique_ptr<Module> Result = getLazyIRFileModule(FileName, Err, Context); +  if (!Result) { +    Err.print("function-import", errs()); +    return nullptr; +  } + +  Result->materializeMetadata(); +  UpgradeDebugInfo(*Result); + +  return Result; +} + +namespace { +/// Helper to load on demand a Module from file and cache it for subsequent +/// queries. It can be used with the FunctionImporter. +class ModuleLazyLoaderCache { +  /// Cache of lazily loaded module for import. +  StringMap<std::unique_ptr<Module>> ModuleMap; + +  /// Retrieve a Module from the cache or lazily load it on demand. +  std::function<std::unique_ptr<Module>(StringRef FileName)> createLazyModule; + +public: +  /// Create the loader, Module will be initialized in \p Context. +  ModuleLazyLoaderCache(std::function< +      std::unique_ptr<Module>(StringRef FileName)> createLazyModule) +      : createLazyModule(createLazyModule) {} + +  /// Retrieve a Module from the cache or lazily load it on demand. +  Module &operator()(StringRef FileName); + +  std::unique_ptr<Module> takeModule(StringRef FileName) { +    auto I = ModuleMap.find(FileName); +    assert(I != ModuleMap.end()); +    std::unique_ptr<Module> Ret = std::move(I->second); +    ModuleMap.erase(I); +    return Ret; +  } +}; + +// Get a Module for \p FileName from the cache, or load it lazily. 
+Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) { +  auto &Module = ModuleMap[Identifier]; +  if (!Module) +    Module = createLazyModule(Identifier); +  return *Module; +} +} // anonymous namespace + +/// Walk through the instructions in \p F looking for external +/// calls not already in the \p CalledFunctions set. If any are +/// found they are added to the \p Worklist for importing. +static void findExternalCalls(const Module &DestModule, Function &F, +                              const FunctionInfoIndex &Index, +                              StringSet<> &CalledFunctions, +                              SmallVector<StringRef, 64> &Worklist) { +  // We need to suffix internal function calls imported from other modules, +  // prepare the suffix ahead of time. +  std::string Suffix; +  if (F.getParent() != &DestModule) +    Suffix = +        (Twine(".llvm.") + +         Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str(); + +  for (auto &BB : F) { +    for (auto &I : BB) { +      if (isa<CallInst>(I)) { +        auto CalledFunction = cast<CallInst>(I).getCalledFunction(); +        // Insert any new external calls that have not already been +        // added to set/worklist. +        if (!CalledFunction || !CalledFunction->hasName()) +          continue; +        // Ignore intrinsics early +        if (CalledFunction->isIntrinsic()) { +          assert(CalledFunction->getIntrinsicID() != 0); +          continue; +        } +        auto ImportedName = CalledFunction->getName(); +        auto Renamed = (ImportedName + Suffix).str(); +        // Rename internal functions +        if (CalledFunction->hasInternalLinkage()) { +          ImportedName = Renamed; +        } +        auto It = CalledFunctions.insert(ImportedName); +        if (!It.second) { +          // This is a call to a function we already considered, skip. +          continue; +        } +        // Ignore functions already present in the destination module +        auto *SrcGV = DestModule.getNamedValue(ImportedName); +        if (SrcGV) { +          assert(isa<Function>(SrcGV) && "Name collision during import"); +          if (!cast<Function>(SrcGV)->isDeclaration()) { +            DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring " +                         << ImportedName << " already in DestinationModule\n"); +            continue; +          } +        } + +        Worklist.push_back(It.first->getKey()); +        DEBUG(dbgs() << DestModule.getModuleIdentifier() +                     << ": Adding callee for : " << ImportedName << " : " +                     << F.getName() << "\n"); +      } +    } +  } +} + +// Helper function: given a worklist and an index, will process all the worklist +// and decide what to import based on the summary information. +// +// Nothing is actually imported, functions are materialized in their source +// module and analyzed there. +// +// \p ModuleToFunctionsToImportMap is filled with the set of Function to import +// per Module. 
+static void GetImportList(Module &DestModule, +                          SmallVector<StringRef, 64> &Worklist, +                          StringSet<> &CalledFunctions, +                          std::map<StringRef, DenseSet<const GlobalValue *>> +                              &ModuleToFunctionsToImportMap, +                          const FunctionInfoIndex &Index, +                          ModuleLazyLoaderCache &ModuleLoaderCache) { +  while (!Worklist.empty()) { +    auto CalledFunctionName = Worklist.pop_back_val(); +    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for " +                 << CalledFunctionName << "\n"); + +    // Try to get a summary for this function call. +    auto InfoList = Index.findFunctionInfoList(CalledFunctionName); +    if (InfoList == Index.end()) { +      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for " +                   << CalledFunctionName << " Ignoring.\n"); +      continue; +    } +    assert(!InfoList->second.empty() && "No summary, error at import?"); + +    // Comdat can have multiple entries, FIXME: what do we do with them? +    auto &Info = InfoList->second[0]; +    assert(Info && "Nullptr in list, error importing summaries?\n"); + +    auto *Summary = Info->functionSummary(); +    if (!Summary) { +      // FIXME: in case we are lazyloading summaries, we can do it now. +      DEBUG(dbgs() << DestModule.getModuleIdentifier() +                   << ": Missing summary for  " << CalledFunctionName +                   << ", error at import?\n"); +      llvm_unreachable("Missing summary"); +    } + +    if (Summary->instCount() > ImportInstrLimit) { +      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of " +                   << CalledFunctionName << " with " << Summary->instCount() +                   << " instructions (limit " << ImportInstrLimit << ")\n"); +      continue; +    } + +    // Get the module path from the summary. +    auto ModuleIdentifier = Summary->modulePath(); +    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing " +                 << CalledFunctionName << " from " << ModuleIdentifier << "\n"); + +    auto &SrcModule = ModuleLoaderCache(ModuleIdentifier); + +    // The function that we will import! +    GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName); + +    if (!SGV) { +      // The destination module is referencing function using their renamed name +      // when importing a function that was originally local in the source +      // module. The source module we have might not have been renamed so we try +      // to remove the suffix added during the renaming to recover the original +      // name in the source module. +      std::pair<StringRef, StringRef> Split = +          CalledFunctionName.split(".llvm."); +      SGV = SrcModule.getNamedValue(Split.first); +      assert(SGV && "Can't find function to import in source module"); +    } +    if (!SGV) { +      report_fatal_error(Twine("Can't load function '") + CalledFunctionName + +                         "' in Module '" + SrcModule.getModuleIdentifier() + +                         "', error in the summary?\n"); +    } + +    Function *F = dyn_cast<Function>(SGV); +    if (!F && isa<GlobalAlias>(SGV)) { +      auto *SGA = dyn_cast<GlobalAlias>(SGV); +      F = dyn_cast<Function>(SGA->getBaseObject()); +      CalledFunctionName = F->getName(); +    } +    assert(F && "Imported Function is ... 
not a Function"); + +    // We cannot import weak_any functions/aliases without possibly affecting +    // the order they are seen and selected by the linker, changing program +    // semantics. +    if (SGV->hasWeakAnyLinkage()) { +      DEBUG(dbgs() << DestModule.getModuleIdentifier() +                   << ": Ignoring import request for weak-any " +                   << (isa<Function>(SGV) ? "function " : "alias ") +                   << CalledFunctionName << " from " +                   << SrcModule.getModuleIdentifier() << "\n"); +      continue; +    } + +    // Add the function to the import list +    auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()]; +    Entry.insert(F); + +    // Process the newly imported functions and add callees to the worklist. +    F->materialize(); +    findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist); +  } +} + +// Automatically import functions in Module \p DestModule based on the summaries +// index. +// +// The current implementation imports every called functions that exists in the +// summaries index. +bool FunctionImporter::importFunctions(Module &DestModule) { +  DEBUG(dbgs() << "Starting import for Module " +               << DestModule.getModuleIdentifier() << "\n"); +  unsigned ImportedCount = 0; + +  /// First step is collecting the called external functions. +  StringSet<> CalledFunctions; +  SmallVector<StringRef, 64> Worklist; +  for (auto &F : DestModule) { +    if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone)) +      continue; +    findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist); +  } +  if (Worklist.empty()) +    return false; + +  /// Second step: for every call to an external function, try to import it. + +  // Linker that will be used for importing function +  Linker TheLinker(DestModule); + +  // Map of Module -> List of Function to import from the Module +  std::map<StringRef, DenseSet<const GlobalValue *>> +      ModuleToFunctionsToImportMap; + +  // Analyze the summaries and get the list of functions to import by +  // populating ModuleToFunctionsToImportMap +  ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader); +  GetImportList(DestModule, Worklist, CalledFunctions, +                ModuleToFunctionsToImportMap, Index, ModuleLoaderCache); +  assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList"); + +  StringMap<std::unique_ptr<DenseMap<unsigned, MDNode *>>> +      ModuleToTempMDValsMap; + +  // Do the actual import of functions now, one Module at a time +  for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) { +    // Get the module for the import +    auto &FunctionsToImport = FunctionsToImportPerModule.second; +    std::unique_ptr<Module> SrcModule = +        ModuleLoaderCache.takeModule(FunctionsToImportPerModule.first); +    assert(&DestModule.getContext() == &SrcModule->getContext() && +           "Context mismatch"); + +    // Save the mapping of value ids to temporary metadata created when +    // importing this function. If we have already imported from this module, +    // add new temporary metadata to the existing mapping. +    auto &TempMDVals = ModuleToTempMDValsMap[SrcModule->getModuleIdentifier()]; +    if (!TempMDVals) +      TempMDVals = llvm::make_unique<DenseMap<unsigned, MDNode *>>(); + +    // Link in the specified functions. 
+    if (TheLinker.linkInModule(std::move(SrcModule), Linker::Flags::None, +                               &Index, &FunctionsToImport, TempMDVals.get())) +      report_fatal_error("Function Import: link error"); + +    ImportedCount += FunctionsToImport.size(); +  } + +  // Now link in metadata for all modules from which we imported functions. +  for (StringMapEntry<std::unique_ptr<DenseMap<unsigned, MDNode *>>> &SME : +       ModuleToTempMDValsMap) { +    // Load the specified source module. +    auto &SrcModule = ModuleLoaderCache(SME.getKey()); + +    // Link in all necessary metadata from this module. +    if (TheLinker.linkInMetadata(SrcModule, SME.getValue().get())) +      return false; +  } + +  DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " +               << DestModule.getModuleIdentifier() << "\n"); +  return ImportedCount; +} + +/// Summary file to use for function importing when using -function-import from +/// the command line. +static cl::opt<std::string> +    SummaryFile("summary-file", +                cl::desc("The summary file to use for function importing.")); + +static void diagnosticHandler(const DiagnosticInfo &DI) { +  raw_ostream &OS = errs(); +  DiagnosticPrinterRawOStream DP(OS); +  DI.print(DP); +  OS << '\n'; +} + +/// Parse the function index out of an IR file and return the function +/// index object if found, or nullptr if not. +static std::unique_ptr<FunctionInfoIndex> +getFunctionIndexForFile(StringRef Path, std::string &Error, +                        DiagnosticHandlerFunction DiagnosticHandler) { +  std::unique_ptr<MemoryBuffer> Buffer; +  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = +      MemoryBuffer::getFile(Path); +  if (std::error_code EC = BufferOrErr.getError()) { +    Error = EC.message(); +    return nullptr; +  } +  Buffer = std::move(BufferOrErr.get()); +  ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr = +      object::FunctionIndexObjectFile::create(Buffer->getMemBufferRef(), +                                              DiagnosticHandler); +  if (std::error_code EC = ObjOrErr.getError()) { +    Error = EC.message(); +    return nullptr; +  } +  return (*ObjOrErr)->takeIndex(); +} + +namespace { +/// Pass that performs cross-module function import provided a summary file. +class FunctionImportPass : public ModulePass { +  /// Optional function summary index to use for importing, otherwise +  /// the summary-file option must be specified. 
+  const FunctionInfoIndex *Index; + +public: +  /// Pass identification, replacement for typeid +  static char ID; + +  /// Specify pass name for debug output +  const char *getPassName() const override { +    return "Function Importing"; +  } + +  explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr) +      : ModulePass(ID), Index(Index) {} + +  bool runOnModule(Module &M) override { +    if (SummaryFile.empty() && !Index) +      report_fatal_error("error: -function-import requires -summary-file or " +                         "file from frontend\n"); +    std::unique_ptr<FunctionInfoIndex> IndexPtr; +    if (!SummaryFile.empty()) { +      if (Index) +        report_fatal_error("error: -summary-file and index from frontend\n"); +      std::string Error; +      IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); +      if (!IndexPtr) { +        errs() << "Error loading file '" << SummaryFile << "': " << Error +               << "\n"; +        return false; +      } +      Index = IndexPtr.get(); +    } + +    // Perform the import now. +    auto ModuleLoader = [&M](StringRef Identifier) { +      return loadFile(Identifier, M.getContext()); +    }; +    FunctionImporter Importer(*Index, ModuleLoader); +    return Importer.importFunctions(M); + +    return false; +  } +}; +} // anonymous namespace + +char FunctionImportPass::ID = 0; +INITIALIZE_PASS_BEGIN(FunctionImportPass, "function-import", +                      "Summary Based Function Import", false, false) +INITIALIZE_PASS_END(FunctionImportPass, "function-import", +                    "Summary Based Function Import", false, false) + +namespace llvm { +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) { +  return new FunctionImportPass(Index); +} +} diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 61d0ff94a343..9b276ed28e2e 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -92,33 +92,28 @@ bool GlobalDCE::runOnModule(Module &M) {        ComdatMembers.insert(std::make_pair(C, &GA));    // Loop over the module, adding globals which are obviously necessary. -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    Changed |= RemoveUnusedGlobalValue(*I); +  for (Function &F : M) { +    Changed |= RemoveUnusedGlobalValue(F);      // Functions with external linkage are needed if they have a body -    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { -      if (!I->isDiscardableIfUnused()) -        GlobalIsNeeded(I); -    } +    if (!F.isDeclaration() && !F.hasAvailableExternallyLinkage()) +      if (!F.isDiscardableIfUnused()) +        GlobalIsNeeded(&F);    } -  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); -       I != E; ++I) { -    Changed |= RemoveUnusedGlobalValue(*I); +  for (GlobalVariable &GV : M.globals()) { +    Changed |= RemoveUnusedGlobalValue(GV);      // Externally visible & appending globals are needed, if they have an      // initializer. 
-    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { -      if (!I->isDiscardableIfUnused()) -        GlobalIsNeeded(I); -    } +    if (!GV.isDeclaration() && !GV.hasAvailableExternallyLinkage()) +      if (!GV.isDiscardableIfUnused()) +        GlobalIsNeeded(&GV);    } -  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); -       I != E; ++I) { -    Changed |= RemoveUnusedGlobalValue(*I); +  for (GlobalAlias &GA : M.aliases()) { +    Changed |= RemoveUnusedGlobalValue(GA);      // Externally visible aliases are needed. -    if (!I->isDiscardableIfUnused()) { -      GlobalIsNeeded(I); -    } +    if (!GA.isDiscardableIfUnused()) +      GlobalIsNeeded(&GA);    }    // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -126,52 +121,50 @@ bool GlobalDCE::runOnModule(Module &M) {    //    // The first pass is to drop initializers of global variables which are dead. -  std::vector<GlobalVariable*> DeadGlobalVars;   // Keep track of dead globals -  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); -       I != E; ++I) -    if (!AliveGlobals.count(I)) { -      DeadGlobalVars.push_back(I);         // Keep track of dead globals -      if (I->hasInitializer()) { -        Constant *Init = I->getInitializer(); -        I->setInitializer(nullptr); +  std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals +  for (GlobalVariable &GV : M.globals()) +    if (!AliveGlobals.count(&GV)) { +      DeadGlobalVars.push_back(&GV);         // Keep track of dead globals +      if (GV.hasInitializer()) { +        Constant *Init = GV.getInitializer(); +        GV.setInitializer(nullptr);          if (isSafeToDestroyConstant(Init))            Init->destroyConstant();        }      }    // The second pass drops the bodies of functions which are dead... -  std::vector<Function*> DeadFunctions; -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) -    if (!AliveGlobals.count(I)) { -      DeadFunctions.push_back(I);         // Keep track of dead globals -      if (!I->isDeclaration()) -        I->deleteBody(); +  std::vector<Function *> DeadFunctions; +  for (Function &F : M) +    if (!AliveGlobals.count(&F)) { +      DeadFunctions.push_back(&F);         // Keep track of dead globals +      if (!F.isDeclaration()) +        F.deleteBody();      }    // The third pass drops targets of aliases which are dead...    std::vector<GlobalAlias*> DeadAliases; -  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; -       ++I) -    if (!AliveGlobals.count(I)) { -      DeadAliases.push_back(I); -      I->setAliasee(nullptr); +  for (GlobalAlias &GA : M.aliases()) +    if (!AliveGlobals.count(&GA)) { +      DeadAliases.push_back(&GA); +      GA.setAliasee(nullptr);      }    if (!DeadFunctions.empty()) {      // Now that all interferences have been dropped, delete the actual objects      // themselves. 
-    for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) { -      RemoveUnusedGlobalValue(*DeadFunctions[i]); -      M.getFunctionList().erase(DeadFunctions[i]); +    for (Function *F : DeadFunctions) { +      RemoveUnusedGlobalValue(*F); +      M.getFunctionList().erase(F);      }      NumFunctions += DeadFunctions.size();      Changed = true;    }    if (!DeadGlobalVars.empty()) { -    for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) { -      RemoveUnusedGlobalValue(*DeadGlobalVars[i]); -      M.getGlobalList().erase(DeadGlobalVars[i]); +    for (GlobalVariable *GV : DeadGlobalVars) { +      RemoveUnusedGlobalValue(*GV); +      M.getGlobalList().erase(GV);      }      NumVariables += DeadGlobalVars.size();      Changed = true; @@ -179,9 +172,9 @@ bool GlobalDCE::runOnModule(Module &M) {    // Now delete any dead aliases.    if (!DeadAliases.empty()) { -    for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) { -      RemoveUnusedGlobalValue(*DeadAliases[i]); -      M.getAliasList().erase(DeadAliases[i]); +    for (GlobalAlias *GA : DeadAliases) { +      RemoveUnusedGlobalValue(*GA); +      M.getAliasList().erase(GA);      }      NumAliases += DeadAliases.size();      Changed = true; @@ -222,21 +215,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {      // any globals used will be marked as needed.      Function *F = cast<Function>(G); -    if (F->hasPrefixData()) -      MarkUsedGlobalsAsNeeded(F->getPrefixData()); - -    if (F->hasPrologueData()) -      MarkUsedGlobalsAsNeeded(F->getPrologueData()); +    for (Use &U : F->operands()) +      MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); -    if (F->hasPersonalityFn()) -      MarkUsedGlobalsAsNeeded(F->getPersonalityFn()); - -    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) -      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) -        for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) -          if (GlobalValue *GV = dyn_cast<GlobalValue>(*U)) +    for (BasicBlock &BB : *F) +      for (Instruction &I : BB) +        for (Use &U : I.operands()) +          if (GlobalValue *GV = dyn_cast<GlobalValue>(U))              GlobalIsNeeded(GV); -          else if (Constant *C = dyn_cast<Constant>(*U)) +          else if (Constant *C = dyn_cast<Constant>(U))              MarkUsedGlobalsAsNeeded(C);    }  } @@ -247,9 +234,9 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {    // Loop over all of the operands of the constant, adding any globals they    // use to the list of needed globals. -  for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) { +  for (Use &U : C->operands()) {      // If we've already processed this constant there's no need to do it again. -    Constant *Op = dyn_cast<Constant>(*I); +    Constant *Op = dyn_cast<Constant>(U);      if (Op && SeenConstants.insert(Op).second)        MarkUsedGlobalsAsNeeded(Op);    } @@ -262,7 +249,8 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {  // might make it deader.  
//  bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) { -  if (GV.use_empty()) return false; +  if (GV.use_empty()) +    return false;    GV.removeDeadConstantUsers();    return GV.use_empty();  } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 5ffe15dbd31d..fd7736905fe8 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -28,6 +28,7 @@  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h"  #include "llvm/IR/GetElementPtrTypeIterator.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" @@ -54,7 +55,6 @@ STATISTIC(NumSRA       , "Number of aggregate globals broken into scalars");  STATISTIC(NumHeapSRA   , "Number of heap objects SRA'd");  STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");  STATISTIC(NumDeleted   , "Number of globals deleted"); -STATISTIC(NumFnDeleted , "Number of functions deleted");  STATISTIC(NumGlobUses  , "Number of global uses devirtualized");  STATISTIC(NumLocalized , "Number of globals localized");  STATISTIC(NumShrunkToBool  , "Number of global vars shrunk to booleans"); @@ -69,6 +69,7 @@ namespace {    struct GlobalOpt : public ModulePass {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<TargetLibraryInfoWrapperPass>(); +      AU.addRequired<DominatorTreeWrapperPass>();      }      static char ID; // Pass identification, replacement for typeid      GlobalOpt() : ModulePass(ID) { @@ -81,11 +82,14 @@ namespace {      bool OptimizeFunctions(Module &M);      bool OptimizeGlobalVars(Module &M);      bool OptimizeGlobalAliases(Module &M); -    bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); -    bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, -                               const GlobalStatus &GS); +    bool deleteIfDead(GlobalValue &GV); +    bool processGlobal(GlobalValue &GV); +    bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS);      bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); +    bool isPointerValueDeadOnEntryToFunction(const Function *F, +                                             GlobalValue *GV); +      TargetLibraryInfo *TLI;      SmallSet<const Comdat *, 8> NotDiscardableComdats;    }; @@ -95,13 +99,14 @@ char GlobalOpt::ID = 0;  INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",                  "Global Variable Optimizer", false, false)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_END(GlobalOpt, "globalopt",                  "Global Variable Optimizer", false, false)  ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } -/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker -/// as a root?  If so, we might not really want to eliminate the stores to it. +/// Is this global variable possibly used by a leak checker as a root?  If so, +/// we might not really want to eliminate the stores to it.  static bool isLeakCheckerRoot(GlobalVariable *GV) {    // A global variable is a root if it is a pointer, or could plausibly contain    // a pointer.  
There are two challenges; one is that we could have a struct @@ -176,10 +181,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {    } while (1);  } -/// CleanupPointerRootUsers - This GV is a pointer root.  Loop over all users -/// of the global and clean up any that obviously don't assign the global a -/// value that isn't dynamically allocated. -/// +/// This GV is a pointer root.  Loop over all users of the global and clean up +/// any that obviously don't assign the global a value that isn't dynamically +/// allocated.  static bool CleanupPointerRootUsers(GlobalVariable *GV,                                      const TargetLibraryInfo *TLI) {    // A brief explanation of leak checkers.  The goal is to find bugs where @@ -263,10 +267,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,    return Changed;  } -/// CleanupConstantGlobalUsers - We just marked GV constant.  Loop over all -/// users of the global, cleaning up the obvious ones.  This is largely just a -/// quick scan over the use list to clean up the easy and obvious cruft.  This -/// returns true if it made a change. +/// We just marked GV constant.  Loop over all users of the global, cleaning up +/// the obvious ones.  This is largely just a quick scan over the use list to +/// clean up the easy and obvious cruft.  This returns true if it made a change.  static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,                                         const DataLayout &DL,                                         TargetLibraryInfo *TLI) { @@ -353,8 +356,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,    return Changed;  } -/// isSafeSROAElementUse - Return true if the specified instruction is a safe -/// user of a derived expression from a global that we want to SROA. +/// Return true if the specified instruction is a safe user of a derived +/// expression from a global that we want to SROA.  static bool isSafeSROAElementUse(Value *V) {    // We might have a dead and dangling constant hanging off of here.    if (Constant *C = dyn_cast<Constant>(V)) @@ -385,9 +388,8 @@ static bool isSafeSROAElementUse(Value *V) {  } -/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value. -/// Look at it and its uses and decide whether it is safe to SROA this global. -/// +/// U is a direct user of the specified global value.  Look at it and its uses +/// and decide whether it is safe to SROA this global.  static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {    // The user of the global must be a GEP Inst or a ConstantExpr GEP.    if (!isa<GetElementPtrInst>(U) && @@ -452,9 +454,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {    return true;  } -/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it -/// is safe for us to perform this transformation. -/// +/// Look at all uses of the global and decide whether it is safe for us to +/// perform this transformation.  static bool GlobalUsersSafeToSRA(GlobalValue *GV) {    for (User *U : GV->users())      if (!IsUserOfGlobalSafeForSRA(U, GV)) @@ -464,10 +465,10 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {  } -/// SRAGlobal - Perform scalar replacement of aggregates on the specified global -/// variable.  This opens the door for other optimizations by exposing the -/// behavior of the program in a more fine-grained way.  We have determined that -/// this transformation is safe already.  
We return the first global variable we +/// Perform scalar replacement of aggregates on the specified global variable. +/// This opens the door for other optimizations by exposing the behavior of the +/// program in a more fine-grained way.  We have determined that this +/// transformation is safe already.  We return the first global variable we  /// insert so that the caller can reprocess it.  static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {    // Make sure this global only has simple uses that we can SRA. @@ -497,7 +498,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {                                                 In, GV->getName()+"."+Twine(i),                                                 GV->getThreadLocalMode(),                                                GV->getType()->getAddressSpace()); -      Globals.insert(GV, NGV); +      NGV->setExternallyInitialized(GV->isExternallyInitialized()); +      Globals.push_back(NGV);        NewGlobals.push_back(NGV);        // Calculate the known alignment of the field.  If the original aggregate @@ -530,7 +532,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {                                                 In, GV->getName()+"."+Twine(i),                                                 GV->getThreadLocalMode(),                                                GV->getType()->getAddressSpace()); -      Globals.insert(GV, NGV); +      NGV->setExternallyInitialized(GV->isExternallyInitialized()); +      Globals.push_back(NGV);        NewGlobals.push_back(NGV);        // Calculate the known alignment of the field.  If the original aggregate @@ -545,7 +548,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {    if (NewGlobals.empty())      return nullptr; -  DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV); +  DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");    Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -610,9 +613,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {    return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr;  } -/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified -/// value will trap if the value is dynamically null.  PHIs keeps track of any -/// phi nodes we've seen to avoid reprocessing them. +/// Return true if all users of the specified value will trap if the value is +/// dynamically null.  PHIs keeps track of any phi nodes we've seen to avoid +/// reprocessing them.  static bool AllUsesOfValueWillTrapIfNull(const Value *V,                                          SmallPtrSetImpl<const PHINode*> &PHIs) {    for (const User *U : V->users()) @@ -653,9 +656,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,    return true;  } -/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads -/// from GV will trap if the loaded value is null.  Note that this also permits -/// comparisons of the loaded value against null, as a special case. +/// Return true if all uses of any loads from GV will trap if the loaded value +/// is null.  Note that this also permits comparisons of the loaded value +/// against null, as a special case.  
static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {    for (const User *U : GV->users())      if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { @@ -735,10 +738,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {  } -/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null -/// value stored into it.  If there are uses of the loaded value that would trap -/// if the loaded value is dynamically null, then we know that they cannot be -/// reachable with a null optimize away the load. +/// The specified global has only one non-null value stored into it.  If there +/// are uses of the loaded value that would trap if the loaded value is +/// dynamically null, then we know that they cannot be reachable with a null +/// optimize away the load.  static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,                                              const DataLayout &DL,                                              TargetLibraryInfo *TLI) { @@ -778,7 +781,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,    }    if (Changed) { -    DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV); +    DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n");      ++NumGlobUses;    } @@ -801,8 +804,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,    return Changed;  } -/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the -/// instructions that are foldable. +/// Walk the use list of V, constant folding all of the instructions that are +/// foldable.  static void ConstantPropUsersOf(Value *V, const DataLayout &DL,                                  TargetLibraryInfo *TLI) {    for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; ) @@ -818,11 +821,11 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL,        }  } -/// OptimizeGlobalAddressOfMalloc - This function takes the specified global -/// variable, and transforms the program as if it always contained the result of -/// the specified malloc.  Because it is always the result of the specified -/// malloc, there is no reason to actually DO the malloc.  Instead, turn the -/// malloc into a global, and any loads of GV as uses of the new global. +/// This function takes the specified global variable, and transforms the +/// program as if it always contained the result of the specified malloc. +/// Because it is always the result of the specified malloc, there is no reason +/// to actually DO the malloc.  Instead, turn the malloc into a global, and any +/// loads of GV as uses of the new global.  static GlobalVariable *  OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,                                ConstantInt *NElements, const DataLayout &DL, @@ -838,13 +841,10 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,    // Create the new global variable.  The contents of the malloc'd memory is    // undefined, so initialize with an undef value. 
-  GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(), -                                             GlobalType, false, -                                             GlobalValue::InternalLinkage, -                                             UndefValue::get(GlobalType), -                                             GV->getName()+".body", -                                             GV, -                                             GV->getThreadLocalMode()); +  GlobalVariable *NewGV = new GlobalVariable( +      *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage, +      UndefValue::get(GlobalType), GV->getName() + ".body", nullptr, +      GV->getThreadLocalMode());    // If there are bitcast users of the malloc (which is typical, usually we have    // a malloc + bitcast) then replace them with uses of the new global.  Update @@ -935,7 +935,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,        cast<StoreInst>(InitBool->user_back())->eraseFromParent();      delete InitBool;    } else -    GV->getParent()->getGlobalList().insert(GV, InitBool); +    GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);    // Now the GV is dead, nuke it and the malloc..    GV->eraseFromParent(); @@ -951,10 +951,9 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,    return NewGV;  } -/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking -/// to make sure that there are no complex uses of V.  We permit simple things -/// like dereferencing the pointer, but not storing through the address, unless -/// it is to the specified global. +/// Scan the use-list of V checking to make sure that there are no complex uses +/// of V.  We permit simple things like dereferencing the pointer, but not +/// storing through the address, unless it is to the specified global.  static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,                                                        const GlobalVariable *GV,                                          SmallPtrSetImpl<const PHINode*> &PHIs) { @@ -998,10 +997,9 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,    return true;  } -/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV -/// somewhere.  Transform all uses of the allocation into loads from the -/// global and uses of the resultant pointer.  Further, delete the store into -/// GV.  This assumes that these value pass the +/// The Alloc pointer is stored into GV somewhere.  Transform all uses of the +/// allocation into loads from the global and uses of the resultant pointer. +/// Further, delete the store into GV.  This assumes that these value pass the  /// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.  static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,                                            GlobalVariable *GV) { @@ -1043,9 +1041,9 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,    }  } -/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi -/// of a load) are simple enough to perform heap SRA on.  This permits GEP's -/// that index through the array and struct field, icmps of null, and PHIs. +/// Verify that all uses of V (a load, or a phi of a load) are simple enough to +/// perform heap SRA on.  This permits GEP's that index through the array and +/// struct field, icmps of null, and PHIs.  
static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,                          SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,                          SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) { @@ -1096,8 +1094,8 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,  } -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from -/// GV are simple enough to perform HeapSRA, return true. +/// If all users of values loaded from GV are simple enough to perform HeapSRA, +/// return true.  static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,                                                      Instruction *StoredVal) {    SmallPtrSet<const PHINode*, 32> LoadUsingPHIs; @@ -1186,8 +1184,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,    return FieldVals[FieldNo] = Result;  } -/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from -/// the load, rewrite the derived value to use the HeapSRoA'd load. +/// Given a load instruction and a value derived from the load, rewrite the +/// derived value to use the HeapSRoA'd load.  static void RewriteHeapSROALoadUser(Instruction *LoadUser,               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,                     std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1248,10 +1246,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,    }  } -/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global.  Ptr -/// is a value loaded from the global.  Eliminate all uses of Ptr, making them -/// use FieldGlobals instead.  All uses of loaded values satisfy -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA. +/// We are performing Heap SRoA on a global.  Ptr is a value loaded from the +/// global.  Eliminate all uses of Ptr, making them use FieldGlobals instead. +/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.  static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,                 DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,                     std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1266,8 +1263,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,    }  } -/// PerformHeapAllocSRoA - CI is an allocation of an array of structures.  Break -/// it up into multiple allocations of arrays of the fields. +/// CI is an allocation of an array of structures.  Break it up into multiple +/// allocations of arrays of the fields.  
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,                                              Value *NElems, const DataLayout &DL,                                              const TargetLibraryInfo *TLI) { @@ -1291,12 +1288,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,      Type *FieldTy = STy->getElementType(FieldNo);      PointerType *PFieldTy = PointerType::get(FieldTy, AS); -    GlobalVariable *NGV = -      new GlobalVariable(*GV->getParent(), -                         PFieldTy, false, GlobalValue::InternalLinkage, -                         Constant::getNullValue(PFieldTy), -                         GV->getName() + ".f" + Twine(FieldNo), GV, -                         GV->getThreadLocalMode()); +    GlobalVariable *NGV = new GlobalVariable( +        *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage, +        Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), +        nullptr, GV->getThreadLocalMode());      FieldGlobals.push_back(NGV);      unsigned TypeSize = DL.getTypeAllocSize(FieldTy); @@ -1336,7 +1331,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,    // Split the basic block at the old malloc.    BasicBlock *OrigBB = CI->getParent(); -  BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont"); +  BasicBlock *ContBB = +      OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");    // Create the block to check the first condition.  Put all these blocks at the    // end of the function as they are unlikely to be executed. @@ -1376,9 +1372,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,    // CI is no longer needed, remove it.    CI->eraseFromParent(); -  /// InsertedScalarizedLoads - As we process loads, if we can't immediately -  /// update all uses of the load, keep track of what scalarized loads are -  /// inserted for a given load. +  /// As we process loads, if we can't immediately update all uses of the load, +  /// keep track of what scalarized loads are inserted for a given load.    DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;    InsertedScalarizedValues[GV] = FieldGlobals; @@ -1454,13 +1449,11 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,    return cast<GlobalVariable>(FieldGlobals[0]);  } -/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a -/// pointer global variable with a single value stored it that is a malloc or -/// cast of malloc. -static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, +/// This function is called when we see a pointer global variable with a single +/// value stored it that is a malloc or cast of malloc. +static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,                                                 Type *AllocTy,                                                 AtomicOrdering Ordering, -                                               Module::global_iterator &GVI,                                                 const DataLayout &DL,                                                 TargetLibraryInfo *TLI) {    // If this is a malloc of an abstract type, don't touch it. @@ -1499,7 +1492,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,      // (2048 bytes currently), as we don't want to introduce a 16M global or      // something.      
if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { -      GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); +      OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);        return true;      } @@ -1544,19 +1537,18 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,          CI = cast<CallInst>(Malloc);      } -    GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), -                               DL, TLI); +    PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL, +                         TLI);      return true;    }    return false;  } -// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge -// that only one value (besides its initializer) is ever stored to the global. -static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, +// Try to optimize globals based on the knowledge that only one value (besides +// its initializer) is ever stored to the global. +static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,                                       AtomicOrdering Ordering, -                                     Module::global_iterator &GVI,                                       const DataLayout &DL,                                       TargetLibraryInfo *TLI) {    // Ignore no-op GEPs and bitcasts. @@ -1577,9 +1569,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,          return true;      } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) {        Type *MallocType = getMallocAllocatedType(CI, TLI); -      if (MallocType && -          TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, -                                             DL, TLI)) +      if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, +                                                           Ordering, DL, TLI))          return true;      }    } @@ -1587,10 +1578,10 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,    return false;  } -/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only -/// two values ever stored into GV are its initializer and OtherVal.  See if we -/// can shrink the global into a boolean and select between the two values -/// whenever it is used.  This exposes the values to other scalar optimizations. +/// At this point, we have learned that the only two values ever stored into GV +/// are its initializer and OtherVal.  See if we can shrink the global into a +/// boolean and select between the two values whenever it is used.  This exposes +/// the values to other scalar optimizations.  static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {    Type *GVElType = GV->getType()->getElementType(); @@ -1610,7 +1601,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {      if (!isa<LoadInst>(U) && !isa<StoreInst>(U))        return false; -  DEBUG(dbgs() << "   *** SHRINKING TO BOOL: " << *GV); +  DEBUG(dbgs() << "   *** SHRINKING TO BOOL: " << *GV << "\n");    // Create the new global, initializing it to false.    
GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()), @@ -1620,7 +1611,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {                                               GV->getName()+".b",                                               GV->getThreadLocalMode(),                                               GV->getType()->getAddressSpace()); -  GV->getParent()->getGlobalList().insert(GV, NewGV); +  GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);    Constant *InitVal = GV->getInitializer();    assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && @@ -1688,61 +1679,213 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {    return true;  } +bool GlobalOpt::deleteIfDead(GlobalValue &GV) { +  GV.removeDeadConstantUsers(); -/// ProcessGlobal - Analyze the specified global variable and optimize it if -/// possible.  If we make a change, return true. -bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, -                              Module::global_iterator &GVI) { -  // Do more involved optimizations if the global is internal. -  GV->removeDeadConstantUsers(); +  if (!GV.isDiscardableIfUnused()) +    return false; -  if (GV->use_empty()) { -    DEBUG(dbgs() << "GLOBAL DEAD: " << *GV); -    GV->eraseFromParent(); -    ++NumDeleted; -    return true; -  } +  if (const Comdat *C = GV.getComdat()) +    if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C)) +      return false; -  if (!GV->hasLocalLinkage()) +  bool Dead; +  if (auto *F = dyn_cast<Function>(&GV)) +    Dead = F->isDefTriviallyDead(); +  else +    Dead = GV.use_empty(); +  if (!Dead) +    return false; + +  DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n"); +  GV.eraseFromParent(); +  ++NumDeleted; +  return true; +} + +/// Analyze the specified global variable and optimize it if possible.  If we +/// make a change, return true. +bool GlobalOpt::processGlobal(GlobalValue &GV) { +  // Do more involved optimizations if the global is internal. +  if (!GV.hasLocalLinkage())      return false;    GlobalStatus GS; -  if (GlobalStatus::analyzeGlobal(GV, GS)) +  if (GlobalStatus::analyzeGlobal(&GV, GS))      return false; -  if (!GS.IsCompared && !GV->hasUnnamedAddr()) { -    GV->setUnnamedAddr(true); +  bool Changed = false; +  if (!GS.IsCompared && !GV.hasUnnamedAddr()) { +    GV.setUnnamedAddr(true);      NumUnnamed++; +    Changed = true;    } -  if (GV->isConstant() || !GV->hasInitializer()) +  auto *GVar = dyn_cast<GlobalVariable>(&GV); +  if (!GVar) +    return Changed; + +  if (GVar->isConstant() || !GVar->hasInitializer()) +    return Changed; + +  return processInternalGlobal(GVar, GS) || Changed; +} + +bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { +  // Find all uses of GV. We expect them all to be in F, and if we can't +  // identify any of the uses we bail out. +  // +  // On each of these uses, identify if the memory that GV points to is +  // used/required/live at the start of the function. If it is not, for example +  // if the first thing the function does is store to the GV, the GV can +  // possibly be demoted. +  // +  // We don't do an exhaustive search for memory operations - simply look +  // through bitcasts as they're quite common and benign. 
+  const DataLayout &DL = GV->getParent()->getDataLayout(); +  SmallVector<LoadInst *, 4> Loads; +  SmallVector<StoreInst *, 4> Stores; +  for (auto *U : GV->users()) { +    if (Operator::getOpcode(U) == Instruction::BitCast) { +      for (auto *UU : U->users()) { +        if (auto *LI = dyn_cast<LoadInst>(UU)) +          Loads.push_back(LI); +        else if (auto *SI = dyn_cast<StoreInst>(UU)) +          Stores.push_back(SI); +        else +          return false; +      } +      continue; +    } + +    Instruction *I = dyn_cast<Instruction>(U); +    if (!I) +      return false; +    assert(I->getParent()->getParent() == F); + +    if (auto *LI = dyn_cast<LoadInst>(I)) +      Loads.push_back(LI); +    else if (auto *SI = dyn_cast<StoreInst>(I)) +      Stores.push_back(SI); +    else +      return false; +  } + +  // We have identified all uses of GV into loads and stores. Now check if all +  // of them are known not to depend on the value of the global at the function +  // entry point. We do this by ensuring that every load is dominated by at +  // least one store. +  auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F)) +                 .getDomTree(); + +  // The below check is quadratic. Check we're not going to do too many tests. +  // FIXME: Even though this will always have worst-case quadratic time, we +  // could put effort into minimizing the average time by putting stores that +  // have been shown to dominate at least one load at the beginning of the +  // Stores array, making subsequent dominance checks more likely to succeed +  // early. +  // +  // The threshold here is fairly large because global->local demotion is a +  // very powerful optimization should it fire. +  const unsigned Threshold = 100; +  if (Loads.size() * Stores.size() > Threshold)      return false; -  return ProcessInternalGlobal(GV, GVI, GS); +  for (auto *L : Loads) { +    auto *LTy = L->getType(); +    if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { +          auto *STy = S->getValueOperand()->getType(); +          // The load is only dominated by the store if DomTree says so +          // and the number of bits loaded in L is less than or equal to +          // the number of bits stored in S. +          return DT.dominates(S, L) && +                 DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); +        })) +      return false; +  } +  // All loads have known dependences inside F, so the global can be localized. +  return true; +} + +/// C may have non-instruction users. Can all of those users be turned into +/// instructions? +static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) { +  // We don't do this exhaustively. The most common pattern that we really need +  // to care about is a constant GEP or constant bitcast - so just looking +  // through one single ConstantExpr. +  // +  // The set of constants that this function returns true for must be able to be +  // handled by makeAllConstantUsesInstructions. +  for (auto *U : C->users()) { +    if (isa<Instruction>(U)) +      continue; +    if (!isa<ConstantExpr>(U)) +      // Non instruction, non-constantexpr user; cannot convert this. +      return false; +    for (auto *UU : U->users()) +      if (!isa<Instruction>(UU)) +        // A constantexpr used by another constant. We don't try and recurse any +        // further but just bail out at this point. 
+        return false; +  } + +  return true; +} + +/// C may have non-instruction users, and +/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the +/// non-instruction users to instructions. +static void makeAllConstantUsesInstructions(Constant *C) { +  SmallVector<ConstantExpr*,4> Users; +  for (auto *U : C->users()) { +    if (isa<ConstantExpr>(U)) +      Users.push_back(cast<ConstantExpr>(U)); +    else +      // We should never get here; allNonInstructionUsersCanBeMadeInstructions +      // should not have returned true for C. +      assert( +          isa<Instruction>(U) && +          "Can't transform non-constantexpr non-instruction to instruction!"); +  } + +  SmallVector<Value*,4> UUsers; +  for (auto *U : Users) { +    UUsers.clear(); +    for (auto *UU : U->users()) +      UUsers.push_back(UU); +    for (auto *UU : UUsers) { +      Instruction *UI = cast<Instruction>(UU); +      Instruction *NewU = U->getAsInstruction(); +      NewU->insertBefore(UI); +      UI->replaceUsesOfWith(U, NewU); +    } +    U->dropAllReferences(); +  }  } -/// ProcessInternalGlobal - Analyze the specified global variable and optimize +/// Analyze the specified global variable and optimize  /// it if possible.  If we make a change, return true. -bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, -                                      Module::global_iterator &GVI, +bool GlobalOpt::processInternalGlobal(GlobalVariable *GV,                                        const GlobalStatus &GS) {    auto &DL = GV->getParent()->getDataLayout(); -  // If this is a first class global and has only one accessing function -  // and this function is main (which we know is not recursive), we replace -  // the global with a local alloca in this function. +  // If this is a first class global and has only one accessing function and +  // this function is non-recursive, we replace the global with a local alloca +  // in this function.    //    // NOTE: It doesn't make sense to promote non-single-value types since we    // are just replacing static memory to stack memory.    //    // If the global is in different address space, don't bring it to stack.    
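One step worth spelling out, since the patch relies on it implicitly: a ConstantExpr may only have constants as operands, so a constant GEP or bitcast over the global could never be rewritten to point at the new alloca, and replaceAllUsesWith() would not be able to update it. That is why allNonInstructionUsersCanBeMadeInstructions() gates the transform and makeAllConstantUsesInstructions() runs before the RAUW. A minimal sketch of the per-user conversion, using the same API calls the patch uses (the helper name rewriteConstantUser is invented for illustration, and it assumes the headers this file already includes):

static void rewriteConstantUser(ConstantExpr *CE, Instruction *UserInst) {
  Instruction *AsInst = CE->getAsInstruction(); // e.g. an equivalent GetElementPtrInst
  AsInst->insertBefore(UserInst);               // give the new instruction a location
  UserInst->replaceUsesOfWith(CE, AsInst);      // UserInst now refers to a real instruction
}

The localization condition that follows only fires once these prerequisites, among others, are met.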
if (!GS.HasMultipleAccessingFunctions && -      GS.AccessingFunction && !GS.HasNonInstructionUser && +      GS.AccessingFunction &&        GV->getType()->getElementType()->isSingleValueType() && -      GS.AccessingFunction->getName() == "main" && -      GS.AccessingFunction->hasExternalLinkage() && -      GV->getType()->getAddressSpace() == 0) { -    DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV); +      GV->getType()->getAddressSpace() == 0 && +      !GV->isExternallyInitialized() && +      allNonInstructionUsersCanBeMadeInstructions(GV) && +      GS.AccessingFunction->doesNotRecurse() && +      isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { +    DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");      Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction                                                     ->getEntryBlock().begin());      Type *ElemTy = GV->getType()->getElementType(); @@ -1752,6 +1895,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,      if (!isa<UndefValue>(GV->getInitializer()))        new StoreInst(GV->getInitializer(), Alloca, &FirstI); +    makeAllConstantUsesInstructions(GV); +          GV->replaceAllUsesWith(Alloca);      GV->eraseFromParent();      ++NumLocalized; @@ -1761,7 +1906,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,    // If the global is never loaded (but may be stored to), it is dead.    // Delete it now.    if (!GS.IsLoaded) { -    DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); +    DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");      bool Changed;      if (isLeakCheckerRoot(GV)) { @@ -1800,11 +1945,9 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,      return true;    } else if (!GV->getInitializer()->getType()->isSingleValueType()) {      const DataLayout &DL = GV->getParent()->getDataLayout(); -    if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { -      GVI = FirstNewGV; // Don't skip the newly produced globals! +    if (SRAGlobal(GV, DL))        return true; -    } -  } else if (GS.StoredType == GlobalStatus::StoredOnce) { +  } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {      // If the initial value for the global was an undef value, and if only      // one other value was stored into it, we can just change the      // initializer to be the stored value, then delete all stores to the @@ -1822,8 +1965,6 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,                         << "simplify all users and delete global!\n");            GV->eraseFromParent();            ++NumDeleted; -        } else { -          GVI = GV;          }          ++NumSubstitute;          return true; @@ -1831,8 +1972,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,      // Try to optimize globals based on the knowledge that only one value      // (besides its initializer) is ever stored to the global. -    if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI, -                                 DL, TLI)) +    if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI))        return true;      // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -1850,8 +1990,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,    return false;  } -/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified -/// function, changing them to FastCC. +/// Walk all of the direct calls of the specified function, changing them to +/// FastCC.  
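ChangeCalleesToFastCall(), defined next, is the helper behind the fastcc promotion performed in OptimizeFunctions() further below. The promotion is gated on the function having local linkage, not being varargs, and never having its address taken, because a changed calling convention has to be matched at every call site. A purely illustrative C++ pair (names invented) of what does and does not qualify:

static int onlyCalledDirectly(int X) { return X + 1; } // local, fixed arity, address never escapes
int useIt(int X) { return onlyCalledDirectly(X); }     //   -> eligible for the fastcc promotion

static int escapes(int X) { return X - 1; }
int (*Hook)(int) = escapes;                            // address stored in a function pointer
                                                       //   -> hasAddressTaken(), never promoted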
static void ChangeCalleesToFastCall(Function *F) {    for (User *U : F->users()) {      if (isa<BlockAddress>(U)) @@ -1898,38 +2038,38 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {    bool Changed = false;    // Optimize functions.    for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { -    Function *F = FI++; +    Function *F = &*FI++;      // Functions without names cannot be referenced outside this module.      if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())        F->setLinkage(GlobalValue::InternalLinkage); -    const Comdat *C = F->getComdat(); -    bool inComdat = C && NotDiscardableComdats.count(C); -    F->removeDeadConstantUsers(); -    if ((!inComdat || F->hasLocalLinkage()) && F->isDefTriviallyDead()) { -      F->eraseFromParent(); +    if (deleteIfDead(*F)) {        Changed = true; -      ++NumFnDeleted; -    } else if (F->hasLocalLinkage()) { -      if (isProfitableToMakeFastCC(F) && !F->isVarArg() && -          !F->hasAddressTaken()) { -        // If this function has a calling convention worth changing, is not a -        // varargs function, and is only called directly, promote it to use the -        // Fast calling convention. -        F->setCallingConv(CallingConv::Fast); -        ChangeCalleesToFastCall(F); -        ++NumFastCallFns; -        Changed = true; -      } +      continue; +    } -      if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && -          !F->hasAddressTaken()) { -        // The function is not used by a trampoline intrinsic, so it is safe -        // to remove the 'nest' attribute. -        RemoveNestAttribute(F); -        ++NumNestRemoved; -        Changed = true; -      } +    Changed |= processGlobal(*F); + +    if (!F->hasLocalLinkage()) +      continue; +    if (isProfitableToMakeFastCC(F) && !F->isVarArg() && +        !F->hasAddressTaken()) { +      // If this function has a calling convention worth changing, is not a +      // varargs function, and is only called directly, promote it to use the +      // Fast calling convention. +      F->setCallingConv(CallingConv::Fast); +      ChangeCalleesToFastCall(F); +      ++NumFastCallFns; +      Changed = true; +    } + +    if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && +        !F->hasAddressTaken()) { +      // The function is not used by a trampoline intrinsic, so it is safe +      // to remove the 'nest' attribute. +      RemoveNestAttribute(F); +      ++NumNestRemoved; +      Changed = true;      }    }    return Changed; @@ -1940,7 +2080,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();         GVI != E; ) { -    GlobalVariable *GV = GVI++; +    GlobalVariable *GV = &*GVI++;      // Global variables without names cannot be referenced outside this module.      
if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())        GV->setLinkage(GlobalValue::InternalLinkage); @@ -1953,12 +2093,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {            GV->setInitializer(New);        } -    if (GV->isDiscardableIfUnused()) { -      if (const Comdat *C = GV->getComdat()) -        if (NotDiscardableComdats.count(C) && !GV->hasLocalLinkage()) -          continue; -      Changed |= ProcessGlobal(GV, GVI); +    if (deleteIfDead(*GV)) { +      Changed = true; +      continue;      } + +    Changed |= processGlobal(*GV);    }    return Changed;  } @@ -1968,8 +2108,8 @@ isSimpleEnoughValueToCommit(Constant *C,                              SmallPtrSetImpl<Constant *> &SimpleConstants,                              const DataLayout &DL); -/// isSimpleEnoughValueToCommit - Return true if the specified constant can be -/// handled by the code generator.  We don't want to generate something like: +/// Return true if the specified constant can be handled by the code generator. +/// We don't want to generate something like:  ///   void *X = &X/42;  /// because the code generator doesn't have a relocation that can handle that.  /// @@ -2044,11 +2184,11 @@ isSimpleEnoughValueToCommit(Constant *C,  } -/// isSimpleEnoughPointerToCommit - Return true if this constant is simple -/// enough for us to understand.  In particular, if it is a cast to anything -/// other than from one pointer type to another pointer type, we punt. -/// We basically just support direct accesses to globals and GEP's of -/// globals.  This should be kept up to date with CommitValueTo. +/// Return true if this constant is simple enough for us to understand.  In +/// particular, if it is a cast to anything other than from one pointer type to +/// another pointer type, we punt.  We basically just support direct accesses to +/// globals and GEP's of globals.  This should be kept up to date with +/// CommitValueTo.  static bool isSimpleEnoughPointerToCommit(Constant *C) {    // Conservatively, avoid aggregate types. This is because we don't    // want to worry about them partially overlapping other stores. @@ -2095,9 +2235,9 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {    return false;  } -/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global -/// initializer.  This returns 'Init' modified to reflect 'Val' stored into it. -/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into. +/// Evaluate a piece of a constantexpr store into a global initializer.  This +/// returns 'Init' modified to reflect 'Val' stored into it.  At this point, the +/// GEP operands of Addr [0, OpNo) have been stepped into.  static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,                                     ConstantExpr *Addr, unsigned OpNo) {    // Base case of the recursion. @@ -2144,7 +2284,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,    return ConstantVector::get(Elts);  } -/// CommitValueTo - We have decided that Addr (which satisfies the predicate +/// We have decided that Addr (which satisfies the predicate  /// isSimpleEnoughPointerToCommit) should get Val as its value.  Make it happen.  
static void CommitValueTo(Constant *Val, Constant *Addr) {    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { @@ -2160,10 +2300,10 @@ static void CommitValueTo(Constant *Val, Constant *Addr) {  namespace { -/// Evaluator - This class evaluates LLVM IR, producing the Constant -/// representing each SSA instruction.  Changes to global variables are stored -/// in a mapping that can be iterated over after the evaluation is complete. -/// Once an evaluation call fails, the evaluation object should not be reused. +/// This class evaluates LLVM IR, producing the Constant representing each SSA +/// instruction.  Changes to global variables are stored in a mapping that can +/// be iterated over after the evaluation is complete.  Once an evaluation call +/// fails, the evaluation object should not be reused.  class Evaluator {  public:    Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) @@ -2180,15 +2320,15 @@ public:          Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));    } -  /// EvaluateFunction - Evaluate a call to function F, returning true if -  /// successful, false if we can't evaluate it.  ActualArgs contains the formal -  /// arguments for the function. +  /// Evaluate a call to function F, returning true if successful, false if we +  /// can't evaluate it.  ActualArgs contains the formal arguments for the +  /// function.    bool EvaluateFunction(Function *F, Constant *&RetVal,                          const SmallVectorImpl<Constant*> &ActualArgs); -  /// EvaluateBlock - Evaluate all instructions in block BB, returning true if -  /// successful, false if we can't evaluate it.  NewBB returns the next BB that -  /// control flows into, or null upon return. +  /// Evaluate all instructions in block BB, returning true if successful, false +  /// if we can't evaluate it.  NewBB returns the next BB that control flows +  /// into, or null upon return.    bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB);    Constant *getVal(Value *V) { @@ -2213,32 +2353,31 @@ public:  private:    Constant *ComputeLoadResult(Constant *P); -  /// ValueStack - As we compute SSA register values, we store their contents -  /// here. The back of the deque contains the current function and the stack -  /// contains the values in the calling frames. +  /// As we compute SSA register values, we store their contents here. The back +  /// of the deque contains the current function and the stack contains the +  /// values in the calling frames.    std::deque<DenseMap<Value*, Constant*>> ValueStack; -  /// CallStack - This is used to detect recursion.  In pathological situations -  /// we could hit exponential behavior, but at least there is nothing -  /// unbounded. +  /// This is used to detect recursion.  In pathological situations we could hit +  /// exponential behavior, but at least there is nothing unbounded.    SmallVector<Function*, 4> CallStack; -  /// MutatedMemory - For each store we execute, we update this map.  Loads -  /// check this to get the most up-to-date value.  If evaluation is successful, -  /// this state is committed to the process. +  /// For each store we execute, we update this map.  Loads check this to get +  /// the most up-to-date value.  If evaluation is successful, this state is +  /// committed to the process.    DenseMap<Constant*, Constant*> MutatedMemory; -  /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable -  /// to represent its body.  
This vector is needed so we can delete the -  /// temporary globals when we are done. +  /// To 'execute' an alloca, we create a temporary global variable to represent +  /// its body.  This vector is needed so we can delete the temporary globals +  /// when we are done.    SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps; -  /// Invariants - These global variables have been marked invariant by the -  /// static constructor. +  /// These global variables have been marked invariant by the static +  /// constructor.    SmallPtrSet<GlobalVariable*, 8> Invariants; -  /// SimpleConstants - These are constants we have checked and know to be -  /// simple enough to live in a static initializer of a global. +  /// These are constants we have checked and know to be simple enough to live +  /// in a static initializer of a global.    SmallPtrSet<Constant*, 8> SimpleConstants;    const DataLayout &DL; @@ -2247,9 +2386,8 @@ private:  }  // anonymous namespace -/// ComputeLoadResult - Return the value that would be computed by a load from -/// P after the stores reflected by 'memory' have been performed.  If we can't -/// decide, return null. +/// Return the value that would be computed by a load from P after the stores +/// reflected by 'memory' have been performed.  If we can't decide, return null.  Constant *Evaluator::ComputeLoadResult(Constant *P) {    // If this memory location has been recently stored, use the stored value: it    // is the most up-to-date. @@ -2275,9 +2413,9 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) {    return nullptr;  // don't know how to evaluate.  } -/// EvaluateBlock - Evaluate all instructions in block BB, returning true if -/// successful, false if we can't evaluate it.  NewBB returns the next BB that -/// control flows into, or null upon return. +/// Evaluate all instructions in block BB, returning true if successful, false +/// if we can't evaluate it.  NewBB returns the next BB that control flows into, +/// or null upon return.  bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,                                BasicBlock *&NextBB) {    // This is the main evaluation loop. @@ -2438,7 +2576,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,        InstResult = AllocaTmps.back().get();        DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");      } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { -      CallSite CS(CurInst); +      CallSite CS(&*CurInst);        // Debug info can safely be ignored here.        if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { @@ -2504,6 +2642,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,            // Continue even if we do nothing.            ++CurInst;            continue; +        } else if (II->getIntrinsicID() == Intrinsic::assume) { +          DEBUG(dbgs() << "Skipping assume intrinsic.\n"); +          ++CurInst; +          continue;          }          DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); @@ -2600,7 +2742,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))          InstResult = ConstantFoldConstantExpression(CE, DL, TLI); -      setVal(CurInst, InstResult); +      setVal(&*CurInst, InstResult);      }      // If we just processed an invoke, we finished evaluating the block. 
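A mechanical pattern recurs throughout this commit: CallSite CS(&*CurInst) and setVal(&*CurInst, ...) here, Function *F = &*FI++ and GlobalVariable *GV = &*GVI++ in the module-level loops above, and GV->getIterator() earlier. The implicit ilist iterator-to-pointer conversion these call sites previously leaned on is being retired upstream, so each one now dereferences the iterator explicitly. A reduced illustration of the loop form (simplified, not from the patch, assuming a Module &M in scope):

for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
  Function *F = &*FI++;   // was "Function *F = FI++;" via the old implicit conversion
  // ... the body may now erase F without invalidating the loop ...
}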
@@ -2615,9 +2757,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,    }  } -/// EvaluateFunction - Evaluate a call to function F, returning true if -/// successful, false if we can't evaluate it.  ActualArgs contains the formal -/// arguments for the function. +/// Evaluate a call to function F, returning true if successful, false if we +/// can't evaluate it.  ActualArgs contains the formal arguments for the +/// function.  bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,                                   const SmallVectorImpl<Constant*> &ActualArgs) {    // Check to see if this function is already executing (recursion).  If so, @@ -2631,7 +2773,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,    unsigned ArgNo = 0;    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;         ++AI, ++ArgNo) -    setVal(AI, ActualArgs[ArgNo]); +    setVal(&*AI, ActualArgs[ArgNo]);    // ExecutedBlocks - We only handle non-looping, non-recursive code.  As such,    // we can only evaluate any one basic block at most once.  This set keeps @@ -2639,7 +2781,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,    SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;    // CurBB - The current basic block we're evaluating. -  BasicBlock *CurBB = F->begin(); +  BasicBlock *CurBB = &F->front();    BasicBlock::iterator CurInst = CurBB->begin(); @@ -2679,8 +2821,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,    }  } -/// EvaluateStaticConstructor - Evaluate static constructors in the function, if -/// we can.  Return true if we can, false otherwise. +/// Evaluate static constructors in the function, if we can.  Return true if we +/// can, false otherwise.  static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,                                        const TargetLibraryInfo *TLI) {    // Call the function. @@ -2708,7 +2850,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,  }  static int compareNames(Constant *const *A, Constant *const *B) { -  return (*A)->getName().compare((*B)->getName()); +  return (*A)->stripPointerCasts()->getName().compare( +      (*B)->stripPointerCasts()->getName());  }  static void setUsedInitializer(GlobalVariable &V, @@ -2742,7 +2885,7 @@ static void setUsedInitializer(GlobalVariable &V,  }  namespace { -/// \brief An easy to access representation of llvm.used and llvm.compiler.used. +/// An easy to access representation of llvm.used and llvm.compiler.used.  class LLVMUsed {    SmallPtrSet<GlobalValue *, 8> Used;    SmallPtrSet<GlobalValue *, 8> CompilerUsed; @@ -2861,10 +3004,17 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();         I != E;) { -    Module::alias_iterator J = I++; +    GlobalAlias *J = &*I++; +      // Aliases without names cannot be referenced outside this module.      if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())        J->setLinkage(GlobalValue::InternalLinkage); + +    if (deleteIfDead(*J)) { +      Changed = true; +      continue; +    } +      // If the aliasee may change at link time, nothing can be done - bail out.      if (J->mayBeOverridden())        continue; @@ -2889,15 +3039,15 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {      if (RenameTarget) {        // Give the aliasee the name, linkage and other attributes of the alias. 
-      Target->takeName(J); +      Target->takeName(&*J);        Target->setLinkage(J->getLinkage());        Target->setVisibility(J->getVisibility());        Target->setDLLStorageClass(J->getDLLStorageClass()); -      if (Used.usedErase(J)) +      if (Used.usedErase(&*J))          Used.usedInsert(Target); -      if (Used.compilerUsedErase(J)) +      if (Used.compilerUsedErase(&*J))          Used.compilerUsedInsert(Target);      } else if (mayHaveOtherReferences(*J, Used))        continue; @@ -2936,8 +3086,8 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {    return Fn;  } -/// cxxDtorIsEmpty - Returns whether the given function is an empty C++ -/// destructor and can therefore be eliminated. +/// Returns whether the given function is an empty C++ destructor and can +/// therefore be eliminated.  /// Note that we assume that other optimization passes have already simplified  /// the code so we only look for a function with a single basic block, where  /// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and @@ -3081,3 +3231,4 @@ bool GlobalOpt::runOnModule(Module &M) {    return Changed;  } + diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 50f56b0f2afe..7ea6c08b2e66 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -7,8 +7,8 @@  //  //===----------------------------------------------------------------------===//  // -// This file implements the common infrastructure (including C bindings) for  -// libLLVMIPO.a, which implements several transformations over the LLVM  +// This file implements the common infrastructure (including C bindings) for +// libLLVMIPO.a, which implements several transformations over the LLVM  // intermediate representation.  
//  //===----------------------------------------------------------------------===// @@ -24,14 +24,17 @@ using namespace llvm;  void llvm::initializeIPO(PassRegistry &Registry) {    initializeArgPromotionPass(Registry);    initializeConstantMergePass(Registry); +  initializeCrossDSOCFIPass(Registry);    initializeDAEPass(Registry);    initializeDAHPass(Registry); +  initializeForceFunctionAttrsLegacyPassPass(Registry);    initializeFunctionAttrsPass(Registry);    initializeGlobalDCEPass(Registry);    initializeGlobalOptPass(Registry);    initializeIPCPPass(Registry);    initializeAlwaysInlinerPass(Registry);    initializeSimpleInlinerPass(Registry); +  initializeInferFunctionAttrsLegacyPassPass(Registry);    initializeInternalizePassPass(Registry);    initializeLoopExtractorPass(Registry);    initializeBlockExtractorPassPass(Registry); @@ -40,13 +43,15 @@ void llvm::initializeIPO(PassRegistry &Registry) {    initializeMergeFunctionsPass(Registry);    initializePartialInlinerPass(Registry);    initializePruneEHPass(Registry); -  initializeStripDeadPrototypesPassPass(Registry); +  initializeStripDeadPrototypesLegacyPassPass(Registry);    initializeStripSymbolsPass(Registry);    initializeStripDebugDeclarePass(Registry);    initializeStripDeadDebugInfoPass(Registry);    initializeStripNonDebugSymbolsPass(Registry);    initializeBarrierNoopPass(Registry);    initializeEliminateAvailableExternallyPass(Registry); +  initializeSampleProfileLoaderPass(Registry); +  initializeFunctionImportPassPass(Registry);  }  void LLVMInitializeIPO(LLVMPassRegistryRef R) { diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp new file mode 100644 index 000000000000..d02c861a2948 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -0,0 +1,937 @@ +//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "inferattrs" + +STATISTIC(NumReadNone, "Number of functions inferred as readnone"); +STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); +STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); +STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); +STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); + +static bool setDoesNotAccessMemory(Function &F) { +  if (F.doesNotAccessMemory()) +    return false; +  F.setDoesNotAccessMemory(); +  ++NumReadNone; +  return true; +} + +static bool setOnlyReadsMemory(Function &F) { +  if (F.onlyReadsMemory()) +    return false; +  F.setOnlyReadsMemory(); +  ++NumReadOnly; +  return true; +} + +static bool setDoesNotThrow(Function &F) { +  if (F.doesNotThrow()) +    return false; +  F.setDoesNotThrow(); +  ++NumNoUnwind; +  return true; +} + +static bool setDoesNotCapture(Function &F, unsigned n) { +  if (F.doesNotCapture(n)) +    return false; +  F.setDoesNotCapture(n); +  ++NumNoCapture; +  return true; +} + +static bool setOnlyReadsMemory(Function &F, unsigned n) { +  if (F.onlyReadsMemory(n)) +    return false; +  F.setOnlyReadsMemory(n); +  ++NumReadOnlyArg; +  return true; +} + +static bool setDoesNotAlias(Function &F, unsigned n) { +  if (F.doesNotAlias(n)) +    return false; +  F.setDoesNotAlias(n); +  ++NumNoAlias; +  return true; +} + +/// Analyze the name and prototype of the given function and set any applicable +/// attributes. +/// +/// Returns true if any attributes were set and false otherwise. 
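To make the effect of these setters concrete: the first case handled by inferPrototypeAttributes(), defined next, is strlen, whose declaration ends up marked readonly and nounwind with its single pointer argument nocapture. The check below is illustrative only and not part of the patch; the helper name strlenLooksInferred is invented, and it assumes the llvm/IR headers this file already includes plus a Module that declares strlen.

static bool strlenLooksInferred(Module &M) {
  Function *F = M.getFunction("strlen");
  if (!F)
    return false;                 // no such declaration in this module
  return F->onlyReadsMemory() &&  // set by setOnlyReadsMemory(F)
         F->doesNotThrow() &&     // set by setDoesNotThrow(F)
         F->doesNotCapture(1);    // set by setDoesNotCapture(F, 1); argument 1 is the string
}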
+static bool inferPrototypeAttributes(Function &F, +                                     const TargetLibraryInfo &TLI) { +  if (F.hasFnAttribute(Attribute::OptimizeNone)) +    return false; + +  FunctionType *FTy = F.getFunctionType(); +  LibFunc::Func TheLibFunc; +  if (!(TLI.getLibFunc(F.getName(), TheLibFunc) && TLI.has(TheLibFunc))) +    return false; + +  bool Changed = false; + +  switch (TheLibFunc) { +  case LibFunc::strlen: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::strchr: +  case LibFunc::strrchr: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isIntegerTy()) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    return Changed; +  case LibFunc::strtol: +  case LibFunc::strtod: +  case LibFunc::strtof: +  case LibFunc::strtoul: +  case LibFunc::strtoll: +  case LibFunc::strtold: +  case LibFunc::strtoull: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::strcpy: +  case LibFunc::stpcpy: +  case LibFunc::strcat: +  case LibFunc::strncat: +  case LibFunc::strncpy: +  case LibFunc::stpncpy: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::strxfrm: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::strcmp:      // 0,1 +  case LibFunc::strspn:      // 0,1 +  case LibFunc::strncmp:     // 0,1 +  case LibFunc::strcspn:     // 0,1 +  case LibFunc::strcoll:     // 0,1 +  case LibFunc::strcasecmp:  // 0,1 +  case LibFunc::strncasecmp: // +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::strstr: +  case LibFunc::strpbrk: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::strtok: +  case LibFunc::strtok_r: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::scanf: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::setbuf: +  
case LibFunc::setvbuf: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::strdup: +  case LibFunc::strndup: +    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::stat: +  case LibFunc::statvfs: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::sscanf: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::sprintf: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::snprintf: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 3); +    Changed |= setOnlyReadsMemory(F, 3); +    return Changed; +  case LibFunc::setitimer: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setDoesNotCapture(F, 3); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::system: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    // May throw; "system" is a valid pthread cancellation point. 
+    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::malloc: +    if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::memcmp: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::memchr: +  case LibFunc::memrchr: +    if (FTy->getNumParams() != 3) +      return false; +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotThrow(F); +    return Changed; +  case LibFunc::modf: +  case LibFunc::modff: +  case LibFunc::modfl: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::memcpy: +  case LibFunc::memccpy: +  case LibFunc::memmove: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::memalign: +    if (!FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::mkdir: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::mktime: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::realloc: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::read: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    // May throw; "read" is a valid pthread cancellation point. 
+    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::rewind: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::rmdir: +  case LibFunc::remove: +  case LibFunc::realpath: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::rename: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::readlink: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::write: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    // May throw; "write" is a valid pthread cancellation point. +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::bcopy: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::bcmp: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::bzero: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::calloc: +    if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::chmod: +  case LibFunc::chown: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::ctermid: +  case LibFunc::clearerr: +  case LibFunc::closedir: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::atoi: +  case LibFunc::atol: +  case LibFunc::atof: +  case LibFunc::atoll: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= 
setOnlyReadsMemory(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::access: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::fopen: +    if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::fdopen: +    if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::feof: +  case LibFunc::free: +  case LibFunc::fseek: +  case LibFunc::ftell: +  case LibFunc::fgetc: +  case LibFunc::fseeko: +  case LibFunc::ftello: +  case LibFunc::fileno: +  case LibFunc::fflush: +  case LibFunc::fclose: +  case LibFunc::fsetpos: +  case LibFunc::flockfile: +  case LibFunc::funlockfile: +  case LibFunc::ftrylockfile: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::ferror: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F); +    return Changed; +  case LibFunc::fputc: +  case LibFunc::fstat: +  case LibFunc::frexp: +  case LibFunc::frexpf: +  case LibFunc::frexpl: +  case LibFunc::fstatvfs: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::fgets: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 3); +    return Changed; +  case LibFunc::fread: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(3)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 4); +    return Changed; +  case LibFunc::fwrite: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(3)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 4); +    return Changed; +  case LibFunc::fputs: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return 
Changed; +  case LibFunc::fscanf: +  case LibFunc::fprintf: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::fgetpos: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::getc: +  case LibFunc::getlogin_r: +  case LibFunc::getc_unlocked: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::getenv: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setOnlyReadsMemory(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::gets: +  case LibFunc::getchar: +    Changed |= setDoesNotThrow(F); +    return Changed; +  case LibFunc::getitimer: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::getpwnam: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::ungetc: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::uname: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::unlink: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::unsetenv: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::utime: +  case LibFunc::utimes: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::putc: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::puts: +  case LibFunc::printf: +  case LibFunc::perror: +    if (FTy->getNumParams() != 1 || 
!FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::pread: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    // May throw; "pread" is a valid pthread cancellation point. +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::pwrite: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    // May throw; "pwrite" is a valid pthread cancellation point. +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::putchar: +    Changed |= setDoesNotThrow(F); +    return Changed; +  case LibFunc::popen: +    if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::pclose: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::vscanf: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::vsscanf: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::vfscanf: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::valloc: +    if (!FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::vprintf: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::vfprintf: +  case LibFunc::vsprintf: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::vsnprintf: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(2)->isPointerTy()) +      return false; +    Changed |= 
setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 3); +    Changed |= setOnlyReadsMemory(F, 3); +    return Changed; +  case LibFunc::open: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    // May throw; "open" is a valid pthread cancellation point. +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::opendir: +    if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::tmpfile: +    if (!FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::times: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::htonl: +  case LibFunc::htons: +  case LibFunc::ntohl: +  case LibFunc::ntohs: +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAccessMemory(F); +    return Changed; +  case LibFunc::lstat: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::lchown: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::qsort: +    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) +      return false; +    // May throw; places call through function pointer. 
+    Changed |= setDoesNotCapture(F, 4); +    return Changed; +  case LibFunc::dunder_strdup: +  case LibFunc::dunder_strndup: +    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::dunder_strtok_r: +    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::under_IO_getc: +    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::under_IO_putc: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::dunder_isoc99_scanf: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::stat64: +  case LibFunc::lstat64: +  case LibFunc::statvfs64: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::dunder_isoc99_sscanf: +    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::fopen64: +    if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || +        !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    Changed |= setOnlyReadsMemory(F, 1); +    Changed |= setOnlyReadsMemory(F, 2); +    return Changed; +  case LibFunc::fseeko64: +  case LibFunc::ftello64: +    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    return Changed; +  case LibFunc::tmpfile64: +    if (!FTy->getReturnType()->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotAlias(F, 0); +    return Changed; +  case LibFunc::fstat64: +  case LibFunc::fstatvfs64: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) +      return false; +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; +  case LibFunc::open64: +    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) +      return false; +    // 
May throw; "open" is a valid pthread cancellation point. +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setOnlyReadsMemory(F, 1); +    return Changed; +  case LibFunc::gettimeofday: +    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || +        !FTy->getParamType(1)->isPointerTy()) +      return false; +    // Currently some platforms have the restrict keyword on the arguments to +    // gettimeofday. To be conservative, do not add noalias to gettimeofday's +    // arguments. +    Changed |= setDoesNotThrow(F); +    Changed |= setDoesNotCapture(F, 1); +    Changed |= setDoesNotCapture(F, 2); +    return Changed; + +  default: +    // FIXME: It'd be really nice to cover all the library functions we're +    // aware of here. +    return false; +  } +} + +static bool inferAllPrototypeAttributes(Module &M, +                                        const TargetLibraryInfo &TLI) { +  bool Changed = false; + +  for (Function &F : M.functions()) +    // We only infer things using the prototype if the definition isn't around +    // to analyze directly. +    if (F.isDeclaration()) +      Changed |= inferPrototypeAttributes(F, TLI); + +  return Changed; +} + +PreservedAnalyses InferFunctionAttrsPass::run(Module &M, +                                              AnalysisManager<Module> *AM) { +  auto &TLI = AM->getResult<TargetLibraryAnalysis>(M); + +  if (!inferAllPrototypeAttributes(M, TLI)) +    // If we didn't infer anything, preserve all analyses. +    return PreservedAnalyses::all(); + +  // Otherwise, we may have changed fundamental function attributes, so clear +  // out all the passes. +  return PreservedAnalyses::none(); +} + +namespace { +struct InferFunctionAttrsLegacyPass : public ModulePass { +  static char ID; // Pass identification, replacement for typeid +  InferFunctionAttrsLegacyPass() : ModulePass(ID) { +    initializeInferFunctionAttrsLegacyPassPass( +        *PassRegistry::getPassRegistry()); +  } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<TargetLibraryInfoWrapperPass>(); +  } + +  bool runOnModule(Module &M) override { +    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +    return inferAllPrototypeAttributes(M, TLI); +  } +}; +} + +char InferFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs", +                      "Infer set function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs", +                    "Infer set function attributes", false, false) + +Pass *llvm::createInferFunctionAttrsLegacyPass() { +  return new InferFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index dc56a02e7b7d..1704bfea0b86 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -14,10 +14,10 @@  #include "llvm/Transforms/IPO.h"  #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/CallingConv.h"  #include "llvm/IR/DataLayout.h" @@ -35,17 +35,15 @@ namespace {  /// \brief Inliner pass which only handles "always inline" functions.  
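The AlwaysInliner class that follows no longer carries an InlineCostAnalysis member at all: viability is checked through the free isInlineViable() helper and everything else reduces to a constant answer, as its getInlineCost() override further below shows. Condensed into a standalone sketch (illustrative; the function name alwaysInlineDecision is invented, the calls mirror that override):

InlineCost alwaysInlineDecision(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  // Only direct calls to definitions carrying always_inline, and only when the
  // body contains nothing that makes inlining impossible.
  if (Callee && !Callee->isDeclaration() &&
      CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
    return InlineCost::getAlways();
  return InlineCost::getNever();
}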
class AlwaysInliner : public Inliner { -  InlineCostAnalysis *ICA;  public:    // Use extremely low threshold. -  AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), -                    ICA(nullptr) { +  AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true) {      initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());    }    AlwaysInliner(bool InsertLifetime) -      : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { +      : Inliner(ID, -2000000000, InsertLifetime) {      initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());    } @@ -53,9 +51,6 @@ public:    InlineCost getInlineCost(CallSite CS) override; -  void getAnalysisUsage(AnalysisUsage &AU) const override; -  bool runOnSCC(CallGraphSCC &SCC) override; -    using llvm::Pass::doFinalization;    bool doFinalization(CallGraph &CG) override {      return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); @@ -67,10 +62,9 @@ public:  char AlwaysInliner::ID = 0;  INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",                  "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(AlwaysInliner, "always-inline",                  "Inliner for always_inline functions", false, false) @@ -99,19 +93,8 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) {    // that are viable for inlining. FIXME: We shouldn't even get here for    // declarations.    if (Callee && !Callee->isDeclaration() && -      CS.hasFnAttr(Attribute::AlwaysInline) && -      ICA->isInlineViable(*Callee)) +      CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))      return InlineCost::getAlways();    return InlineCost::getNever();  } - -bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { -  ICA = &getAnalysis<InlineCostAnalysis>(); -  return Inliner::runOnSCC(SCC); -} - -void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<InlineCostAnalysis>(); -  Inliner::getAnalysisUsage(AU); -} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 9b01d81b3c7c..45609f891ed8 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -11,11 +11,11 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" -#include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/CallingConv.h"  #include "llvm/IR/DataLayout.h" @@ -23,6 +23,7 @@  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/Type.h" +#include "llvm/Transforms/IPO.h"  #include "llvm/Transforms/IPO/InlinerPass.h"  using namespace llvm; @@ -37,26 +38,30 @@ namespace {  /// inliner pass and the always inliner pass. The two passes use different cost  /// analyses to determine when to inline.  
class SimpleInliner : public Inliner { -  InlineCostAnalysis *ICA;  public: -  SimpleInliner() : Inliner(ID), ICA(nullptr) { +  SimpleInliner() : Inliner(ID) {      initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());    }    SimpleInliner(int Threshold) -      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { +      : Inliner(ID, Threshold, /*InsertLifetime*/ true) {      initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());    }    static char ID; // Pass identification, replacement for typeid    InlineCost getInlineCost(CallSite CS) override { -    return ICA->getInlineCost(CS, getInlineThreshold(CS)); +    Function *Callee = CS.getCalledFunction(); +    TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); +    return llvm::getInlineCost(CS, getInlineThreshold(CS), TTI, ACT);    }    bool runOnSCC(CallGraphSCC &SCC) override;    void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: +  TargetTransformInfoWrapperPass *TTIWP;  };  static int computeThresholdFromOptLevels(unsigned OptLevel, @@ -75,10 +80,10 @@ static int computeThresholdFromOptLevels(unsigned OptLevel,  char SimpleInliner::ID = 0;  INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",                  "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(SimpleInliner, "inline",                  "Function Integration/Inlining", false, false) @@ -95,11 +100,11 @@ Pass *llvm::createFunctionInliningPass(unsigned OptLevel,  }  bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { -  ICA = &getAnalysis<InlineCostAnalysis>(); +  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();    return Inliner::runOnSCC(SCC);  }  void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<InlineCostAnalysis>(); +  AU.addRequired<TargetTransformInfoWrapperPass>();    Inliner::getAnalysisUsage(AU);  } diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 5273c3dc3ca2..bbe5f8761d5f 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -18,6 +18,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/InlineCost.h"  #include "llvm/Analysis/TargetLibraryInfo.h" @@ -64,20 +65,22 @@ ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225),  // Threshold to use when optsize is specified (and there is no -inline-limit).  const int OptSizeThreshold = 75; -Inliner::Inliner(char &ID)  -  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {} +Inliner::Inliner(char &ID) +    : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) { +}  Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) -  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? -                                          InlineLimit : Threshold), -    InsertLifetime(InsertLifetime) {} +    : CallGraphSCCPass(ID), +      InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? 
InlineLimit +                                                          : Threshold), +      InsertLifetime(InsertLifetime) {}  /// For this class, we declare that we require and preserve the call graph.  /// If the derived class implements this method, it should  /// always explicitly call the implementation here.  void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<AliasAnalysis>();    AU.addRequired<AssumptionCacheTracker>(); +  AU.addRequired<TargetLibraryInfoWrapperPass>();    CallGraphSCCPass::getAnalysisUsage(AU);  } @@ -85,39 +88,6 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {  typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >  InlinedArrayAllocasTy; -/// \brief If the inlined function had a higher stack protection level than the -/// calling function, then bump up the caller's stack protection level. -static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { -  // If upgrading the SSP attribute, clear out the old SSP Attributes first. -  // Having multiple SSP attributes doesn't actually hurt, but it adds useless -  // clutter to the IR. -  AttrBuilder B; -  B.addAttribute(Attribute::StackProtect) -    .addAttribute(Attribute::StackProtectStrong) -    .addAttribute(Attribute::StackProtectReq); -  AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), -                                              AttributeSet::FunctionIndex, -                                              B); - -  if (Callee->hasFnAttribute(Attribute::SafeStack)) { -    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); -    Caller->addFnAttr(Attribute::SafeStack); -  } else if (Callee->hasFnAttribute(Attribute::StackProtectReq) && -             !Caller->hasFnAttribute(Attribute::SafeStack)) { -    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); -    Caller->addFnAttr(Attribute::StackProtectReq); -  } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && -             !Caller->hasFnAttribute(Attribute::SafeStack) && -             !Caller->hasFnAttribute(Attribute::StackProtectReq)) { -    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); -    Caller->addFnAttr(Attribute::StackProtectStrong); -  } else if (Callee->hasFnAttribute(Attribute::StackProtect) && -             !Caller->hasFnAttribute(Attribute::SafeStack) && -             !Caller->hasFnAttribute(Attribute::StackProtectReq) && -             !Caller->hasFnAttribute(Attribute::StackProtectStrong)) -    Caller->addFnAttr(Attribute::StackProtect); -} -  /// If it is possible to inline the specified call site,  /// do so and update the CallGraph for this operation.  /// @@ -126,18 +96,26 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {  /// available from other functions inlined into the caller.  If we are able to  /// inline this call site we attempt to reuse already available allocas or add  /// any new allocas to the set if not possible. -static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, +static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI,                                   InlinedArrayAllocasTy &InlinedArrayAllocas,                                   int InlineHistory, bool InsertLifetime) {    Function *Callee = CS.getCalledFunction();    Function *Caller = CS.getCaller(); +  // We need to manually construct BasicAA directly in order to disable +  // its use of other function analyses. 
+  BasicAAResult BAR(createLegacyPMBasicAAResult(P, *Callee)); + +  // Construct our own AA results for this function. We do this manually to +  // work around the limitations of the legacy pass manager. +  AAResults AAR(createLegacyPMAAResults(P, *Callee, BAR)); +    // Try to inline the function.  Get the list of static allocas that were    // inlined. -  if (!InlineFunction(CS, IFI, InsertLifetime)) +  if (!InlineFunction(CS, IFI, &AAR, InsertLifetime))      return false; -  AdjustCallerSSPLevel(Caller, Callee); +  AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);    // Look at all of the allocas that we inlined through this call site.  If we    // have already inlined other allocas through other calls into this function, @@ -219,6 +197,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,        DEBUG(dbgs() << "    ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "                     << *AvailableAlloca << '\n'); +      // Move affected dbg.declare calls immediately after the new alloca to +      // avoid the situation when a dbg.declare preceeds its alloca. +      if (auto *L = LocalAsMetadata::getIfExists(AI)) +        if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L)) +          for (User *U : MDV->users()) +            if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) +              DDI->moveBefore(AvailableAlloca->getNextNode()); +        AI->replaceAllUsesWith(AvailableAlloca);        if (Align1 != Align2) { @@ -258,39 +244,64 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,  }  unsigned Inliner::getInlineThreshold(CallSite CS) const { -  int thres = InlineThreshold; // -inline-threshold or else selected by -                               // overall opt level +  int Threshold = InlineThreshold; // -inline-threshold or else selected by +                                   // overall opt level    // If -inline-threshold is not given, listen to the optsize attribute when it    // would decrease the threshold.    Function *Caller = CS.getCaller();    bool OptSize = Caller && !Caller->isDeclaration() && +                 // FIXME: Use Function::optForSize().                   Caller->hasFnAttribute(Attribute::OptimizeForSize);    if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && -      OptSizeThreshold < thres) -    thres = OptSizeThreshold; +      OptSizeThreshold < Threshold) +    Threshold = OptSizeThreshold; -  // Listen to the inlinehint attribute when it would increase the threshold -  // and the caller does not need to minimize its size.    Function *Callee = CS.getCalledFunction(); -  bool InlineHint = Callee && !Callee->isDeclaration() && -                    Callee->hasFnAttribute(Attribute::InlineHint); -  if (InlineHint && HintThreshold > thres && -      !Caller->hasFnAttribute(Attribute::MinSize)) -    thres = HintThreshold; +  if (!Callee || Callee->isDeclaration()) +    return Threshold; + +  // If profile information is available, use that to adjust threshold of hot +  // and cold functions. +  // FIXME: The heuristic used below for determining hotness and coldness are +  // based on preliminary SPEC tuning and may not be optimal. Replace this with +  // a well-tuned heuristic based on *callsite* hotness and not callee hotness. 
+  uint64_t FunctionCount = 0, MaxFunctionCount = 0; +  bool HasPGOCounts = false; +  if (Callee->getEntryCount() && +      Callee->getParent()->getMaximumFunctionCount()) { +    HasPGOCounts = true; +    FunctionCount = Callee->getEntryCount().getValue(); +    MaxFunctionCount = +        Callee->getParent()->getMaximumFunctionCount().getValue(); +  } -  // Listen to the cold attribute when it would decrease the threshold. -  bool ColdCallee = Callee && !Callee->isDeclaration() && -                    Callee->hasFnAttribute(Attribute::Cold); +  // Listen to the inlinehint attribute or profile based hotness information +  // when it would increase the threshold and the caller does not need to +  // minimize its size. +  bool InlineHint = +      Callee->hasFnAttribute(Attribute::InlineHint) || +      (HasPGOCounts && +       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount)); +  if (InlineHint && HintThreshold > Threshold && +      !Caller->hasFnAttribute(Attribute::MinSize)) +    Threshold = HintThreshold; + +  // Listen to the cold attribute or profile based coldness information +  // when it would decrease the threshold. +  bool ColdCallee = +      Callee->hasFnAttribute(Attribute::Cold) || +      (HasPGOCounts && +       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));    // Command line argument for InlineLimit will override the default    // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,    // do not use the default cold threshold even if it is smaller.    if ((InlineLimit.getNumOccurrences() == 0 ||         ColdThreshold.getNumOccurrences() > 0) && ColdCallee && -      ColdThreshold < thres) -    thres = ColdThreshold; +      ColdThreshold < Threshold) +    Threshold = ColdThreshold; -  return thres; +  return Threshold;  }  static void emitAnalysis(CallSite CS, const Twine &Msg) { @@ -430,10 +441,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,  bool Inliner::runOnSCC(CallGraphSCC &SCC) {    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); -  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); -  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); -  const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; -  AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); +  ACT = &getAnalysis<AssumptionCacheTracker>(); +  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    SmallPtrSet<Function*, 8> SCCFunctions;    DEBUG(dbgs() << "Inliner visiting SCC:"); @@ -469,8 +478,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {          // If this is a direct call to an external function, we can never inline          // it.  If it is an indirect call, inlining may resolve it to be a          // direct call, so we keep it. -        if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) -          continue; +        if (Function *Callee = CS.getCalledFunction()) +          if (Callee->isDeclaration()) +            continue;          CallSites.push_back(std::make_pair(CS, -1));        } @@ -492,7 +502,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {    InlinedArrayAllocasTy InlinedArrayAllocas; -  InlineFunctionInfo InlineInfo(&CG, AA, ACT); +  InlineFunctionInfo InlineInfo(&CG, ACT);    // Now that we have all of the call sites, loop over them and inline them if    // it looks profitable to do so. 
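The hunk above replaces the purely attribute-driven threshold selection with one that also consults profile data: a callee whose entry count is at least 30% of the module's maximum function count is treated like an inlinehint callee, and one at or below 1% is treated like a cold callee. A minimal standalone sketch of that selection logic, using hypothetical counts and parameterized thresholds (the real pass additionally honors inlinehint, cold, optsize and minsize attributes and the command-line overrides shown in this file):

  #include <cstdint>

  // Sketch only: picks a threshold from profile counts the way the new
  // getInlineThreshold() hunk does, with all thresholds passed in.
  static int adjustThresholdForProfile(uint64_t FunctionCount,
                                       uint64_t MaxFunctionCount,
                                       int Threshold, int HintThreshold,
                                       int ColdThreshold) {
    // "Hot": entry count is at least 30% of the hottest function's count.
    bool Hot = FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount);
    // "Cold": entry count is at most 1% of the hottest function's count.
    bool Cold = FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount);
    if (Hot && HintThreshold > Threshold)
      return HintThreshold;   // inline more aggressively
    if (Cold && ColdThreshold < Threshold)
      return ColdThreshold;   // inline more conservatively
    return Threshold;
  }

  // Example: with MaxFunctionCount = 100000, a callee executed 45000 times
  // (45%) takes the hint threshold, one executed 500 times (0.5%) takes the
  // cold threshold, and one executed 10000 times (10%) is left unchanged.
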
@@ -513,7 +523,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {        // just delete the call instead of trying to inline it, regardless of        // size.  This happens because IPSCCP propagates the result out of the        // call and then we're left with the dead call. -      if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) { +      if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {          DEBUG(dbgs() << "    -> Deleting dead call: "                       << *CS.getInstruction() << "\n");          // Update the call graph by deleting the edge from Callee to Caller. @@ -550,7 +560,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {          }          // Attempt to inline the function. -        if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, +        if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas,                                    InlineHistoryID, InsertLifetime)) {            emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,                                         Twine(Callee->getName() + @@ -647,8 +657,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {    // Scan for all of the functions, looking for ones that should now be removed    // from the program.  Insert the dead ones in the FunctionsToRemove set. -  for (auto I : CG) { -    CallGraphNode *CGN = I.second; +  for (const auto &I : CG) { +    CallGraphNode *CGN = I.second.get();      Function *F = CGN->getFunction();      if (!F || F->isDeclaration())        continue; diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp index 7950163f757d..21bb5d000bc7 100644 --- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp @@ -60,6 +60,10 @@ namespace {      explicit InternalizePass();      explicit InternalizePass(ArrayRef<const char *> ExportList);      void LoadFile(const char *Filename); +    bool maybeInternalize(GlobalValue &GV, +                          const std::set<const Comdat *> &ExternalComdats); +    void checkComdatVisibility(GlobalValue &GV, +                               std::set<const Comdat *> &ExternalComdats);      bool runOnModule(Module &M) override;      void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -105,40 +109,85 @@ void InternalizePass::LoadFile(const char *Filename) {    }  } -static bool shouldInternalize(const GlobalValue &GV, -                              const std::set<std::string> &ExternalNames) { +static bool isExternallyVisible(const GlobalValue &GV, +                                const std::set<std::string> &ExternalNames) {    // Function must be defined here    if (GV.isDeclaration()) -    return false; +    return true;    // Available externally is really just a "declaration with a body".    if (GV.hasAvailableExternallyLinkage()) -    return false; +    return true;    // Assume that dllexported symbols are referenced elsewhere    if (GV.hasDLLExportStorageClass()) -    return false; - -  // Already has internal linkage -  if (GV.hasLocalLinkage()) -    return false; +    return true;    // Marked to keep external? -  if (ExternalNames.count(GV.getName())) -    return false; +  if (!GV.hasLocalLinkage() && ExternalNames.count(GV.getName())) +    return true; + +  return false; +} +// Internalize GV if it is possible to do so, i.e. it is not externally visible +// and is not a member of an externally visible comdat. 
+bool InternalizePass::maybeInternalize( +    GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) { +  if (Comdat *C = GV.getComdat()) { +    if (ExternalComdats.count(C)) +      return false; + +    // If a comdat is not externally visible we can drop it. +    if (auto GO = dyn_cast<GlobalObject>(&GV)) +      GO->setComdat(nullptr); + +    if (GV.hasLocalLinkage()) +      return false; +  } else { +    if (GV.hasLocalLinkage()) +      return false; + +    if (isExternallyVisible(GV, ExternalNames)) +      return false; +  } + +  GV.setVisibility(GlobalValue::DefaultVisibility); +  GV.setLinkage(GlobalValue::InternalLinkage);    return true;  } +// If GV is part of a comdat and is externally visible, keep track of its +// comdat so that we don't internalize any of its members. +void InternalizePass::checkComdatVisibility( +    GlobalValue &GV, std::set<const Comdat *> &ExternalComdats) { +  Comdat *C = GV.getComdat(); +  if (!C) +    return; + +  if (isExternallyVisible(GV, ExternalNames)) +    ExternalComdats.insert(C); +} +  bool InternalizePass::runOnModule(Module &M) {    CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>();    CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;    CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr; -  bool Changed = false;    SmallPtrSet<GlobalValue *, 8> Used;    collectUsedGlobalVariables(M, Used, false); +  // Collect comdat visiblity information for the module. +  std::set<const Comdat *> ExternalComdats; +  if (!M.getComdatSymbolTable().empty()) { +    for (Function &F : M) +      checkComdatVisibility(F, ExternalComdats); +    for (GlobalVariable &GV : M.globals()) +      checkComdatVisibility(GV, ExternalComdats); +    for (GlobalAlias &GA : M.aliases()) +      checkComdatVisibility(GA, ExternalComdats); +  } +    // We must assume that globals in llvm.used have a reference that not even    // the linker can see, so we don't internalize them.    // For llvm.compiler.used the situation is a bit fuzzy. The assembler and @@ -153,20 +202,16 @@ bool InternalizePass::runOnModule(Module &M) {    }    // Mark all functions not in the api as internal. -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    if (!shouldInternalize(*I, ExternalNames)) +  for (Function &I : M) { +    if (!maybeInternalize(I, ExternalComdats))        continue; -    I->setVisibility(GlobalValue::DefaultVisibility); -    I->setLinkage(GlobalValue::InternalLinkage); -      if (ExternalNode)        // Remove a callgraph edge from the external node to this function. -      ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); +      ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); -    Changed = true;      ++NumFunctions; -    DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); +    DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");    }    // Never internalize the llvm.used symbol.  It is used to implement @@ -191,12 +236,9 @@ bool InternalizePass::runOnModule(Module &M) {    // internal as well.    
for (Module::global_iterator I = M.global_begin(), E = M.global_end();         I != E; ++I) { -    if (!shouldInternalize(*I, ExternalNames)) +    if (!maybeInternalize(*I, ExternalComdats))        continue; -    I->setVisibility(GlobalValue::DefaultVisibility); -    I->setLinkage(GlobalValue::InternalLinkage); -    Changed = true;      ++NumGlobals;      DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");    } @@ -204,17 +246,20 @@ bool InternalizePass::runOnModule(Module &M) {    // Mark all aliases that are not in the api as internal as well.    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();         I != E; ++I) { -    if (!shouldInternalize(*I, ExternalNames)) +    if (!maybeInternalize(*I, ExternalComdats))        continue; -    I->setVisibility(GlobalValue::DefaultVisibility); -    I->setLinkage(GlobalValue::InternalLinkage); -    Changed = true;      ++NumAliases;      DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");    } -  return Changed; +  // We do not keep track of whether this pass changed the module because +  // it adds unnecessary complexity: +  // 1) This pass will generally be near the start of the pass pipeline, so +  //    there will be no analyses to invalidate. +  // 2) This pass will most likely end up changing the module and it isn't worth +  //    worrying about optimizing the case where the module is unchanged. +  return true;  }  ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 41334ca5b429..8e4ad642ddd5 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -43,12 +43,13 @@ namespace {          initializeLoopExtractorPass(*PassRegistry::getPassRegistry());        } -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; +    bool runOnLoop(Loop *L, LPPassManager &) override;      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequiredID(BreakCriticalEdgesID);        AU.addRequiredID(LoopSimplifyID);        AU.addRequired<DominatorTreeWrapperPass>(); +      AU.addRequired<LoopInfoWrapperPass>();      }    };  } @@ -79,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",  //  Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } -bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) {    if (skipOptnoneFunction(L))      return false; @@ -92,6 +93,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {      return false;    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();    bool Changed = false;    // If there is more than one top-level loop in this function, extract all of @@ -120,14 +122,14 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {    }    if (ShouldExtractLoop) { -    // We must omit landing pads. Landing pads must accompany the invoke +    // We must omit EH pads. EH pads must accompany the invoke      // instruction. But this would result in a loop in the extracted      // function. An infinite cycle occurs when it tries to extract that loop as      // well.      
SmallVector<BasicBlock*, 8> ExitBlocks;      L->getExitBlocks(ExitBlocks);      for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) -      if (ExitBlocks[i]->isLandingPad()) { +      if (ExitBlocks[i]->isEHPad()) {          ShouldExtractLoop = false;          break;        } @@ -141,7 +143,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {        Changed = true;        // After extraction, the loop is replaced by a function call, so        // we shouldn't try to run any more loop passes on it. -      LPM.deleteLoopFromQueue(L); +      LI.updateUnloop(L);      }      ++NumExtracted;    } @@ -259,7 +261,7 @@ bool BlockExtractorPass::runOnModule(Module &M) {      // Figure out which index the basic block is in its function.      Function::iterator BBI = MF->begin();      std::advance(BBI, std::distance(F->begin(), Function::iterator(BB))); -    TranslatedBlocksToNotExtract.insert(BBI); +    TranslatedBlocksToNotExtract.insert(&*BBI);    }    while (!BlocksToNotExtractByName.empty()) { @@ -278,7 +280,7 @@ bool BlockExtractorPass::runOnModule(Module &M) {          BasicBlock &BB = *BI;          if (BB.getName() != BlockName) continue; -        TranslatedBlocksToNotExtract.insert(BI); +        TranslatedBlocksToNotExtract.insert(&*BI);        }      } @@ -291,8 +293,8 @@ bool BlockExtractorPass::runOnModule(Module &M) {    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {      SplitLandingPadPreds(&*F);      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) -      if (!TranslatedBlocksToNotExtract.count(BB)) -        BlocksToExtract.push_back(BB); +      if (!TranslatedBlocksToNotExtract.count(&*BB)) +        BlocksToExtract.push_back(&*BB);    }    for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) { diff --git a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp index c6795c623eff..7b515745c312 100644 --- a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp @@ -19,6 +19,8 @@  #include "llvm/ADT/Triple.h"  #include "llvm/IR/Constant.h"  #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/IRBuilder.h"  #include "llvm/IR/Instructions.h" @@ -26,6 +28,8 @@  #include "llvm/IR/Module.h"  #include "llvm/IR/Operator.h"  #include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  using namespace llvm; @@ -59,9 +63,9 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {  bool BitSetInfo::containsValue(      const DataLayout &DL, -    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, +    const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V,      uint64_t COffset) const { -  if (auto GV = dyn_cast<GlobalVariable>(V)) { +  if (auto GV = dyn_cast<GlobalObject>(V)) {      auto I = GlobalLayout.find(GV);      if (I == GlobalLayout.end())        return false; @@ -90,6 +94,21 @@ bool BitSetInfo::containsValue(    return false;  } +void BitSetInfo::print(raw_ostream &OS) const { +  OS << "offset " << ByteOffset << " size " << BitSize << " align " +     << (1 << AlignLog2); + +  if (isAllOnes()) { +    OS << " all-ones\n"; +    return; +  } + +  OS << " { "; +  for (uint64_t B : Bits) +    OS << B << ' '; +  OS << "}\n"; +} +  BitSetInfo BitSetBuilder::build() {    if (Min > Max)      Min = 0; @@ -193,34 +212,48 @@ struct 
LowerBitSets : public ModulePass {    Module *M;    bool LinkerSubsectionsViaSymbols; +  Triple::ArchType Arch; +  Triple::ObjectFormatType ObjectFormat;    IntegerType *Int1Ty;    IntegerType *Int8Ty;    IntegerType *Int32Ty;    Type *Int32PtrTy;    IntegerType *Int64Ty; -  Type *IntPtrTy; +  IntegerType *IntPtrTy;    // The llvm.bitsets named metadata.    NamedMDNode *BitSetNM; -  // Mapping from bitset mdstrings to the call sites that test them. -  DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; +  // Mapping from bitset identifiers to the call sites that test them. +  DenseMap<Metadata *, std::vector<CallInst *>> BitSetTestCallSites;    std::vector<ByteArrayInfo> ByteArrayInfos;    BitSetInfo -  buildBitSet(MDString *BitSet, -              const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); +  buildBitSet(Metadata *BitSet, +              const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);    ByteArrayInfo *createByteArray(BitSetInfo &BSI);    void allocateByteArrays();    Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI,                            Value *BitOffset); +  void lowerBitSetCalls(ArrayRef<Metadata *> BitSets, +                        Constant *CombinedGlobalAddr, +                        const DenseMap<GlobalObject *, uint64_t> &GlobalLayout);    Value *    lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, -                  GlobalVariable *CombinedGlobal, -                  const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); -  void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets, -                               const std::vector<GlobalVariable *> &Globals); +                  Constant *CombinedGlobal, +                  const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); +  void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> BitSets, +                                       ArrayRef<GlobalVariable *> Globals); +  unsigned getJumpTableEntrySize(); +  Type *getJumpTableEntryType(); +  Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest, +                                 unsigned Distance); +  void verifyBitSetMDNode(MDNode *Op); +  void buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, +                                 ArrayRef<Function *> Functions); +  void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> BitSets, +                                   ArrayRef<GlobalObject *> Globals);    bool buildBitSets();    bool eraseBitSetMetadata(); @@ -228,7 +261,7 @@ struct LowerBitSets : public ModulePass {    bool runOnModule(Module &M) override;  }; -} // namespace +} // anonymous namespace  INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets",                  "Lower bitset metadata", false, false) @@ -244,6 +277,8 @@ bool LowerBitSets::doInitialization(Module &Mod) {    Triple TargetTriple(M->getTargetTriple());    LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); +  Arch = TargetTriple.getArch(); +  ObjectFormat = TargetTriple.getObjectFormat();    Int1Ty = Type::getInt1Ty(M->getContext());    Int8Ty = Type::getInt8Ty(M->getContext()); @@ -262,8 +297,8 @@ bool LowerBitSets::doInitialization(Module &Mod) {  /// Build a bit set for BitSet using the object layouts in  /// GlobalLayout.  
BitSetInfo LowerBitSets::buildBitSet( -    MDString *BitSet, -    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { +    Metadata *BitSet, +    const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {    BitSetBuilder BSB;    // Compute the byte offset of each element of this bitset. @@ -271,8 +306,11 @@ BitSetInfo LowerBitSets::buildBitSet(      for (MDNode *Op : BitSetNM->operands()) {        if (Op->getOperand(0) != BitSet || !Op->getOperand(1))          continue; -      auto OpGlobal = dyn_cast<GlobalVariable>( -          cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); +      Constant *OpConst = +          cast<ConstantAsMetadata>(Op->getOperand(1))->getValue(); +      if (auto GA = dyn_cast<GlobalAlias>(OpConst)) +        OpConst = GA->getAliasee(); +      auto OpGlobal = dyn_cast<GlobalObject>(OpConst);        if (!OpGlobal)          continue;        uint64_t Offset = @@ -360,9 +398,8 @@ void LowerBitSets::allocateByteArrays() {      if (LinkerSubsectionsViaSymbols) {        BAI->ByteArray->replaceAllUsesWith(GEP);      } else { -      GlobalAlias *Alias = -          GlobalAlias::create(PointerType::getUnqual(Int8Ty), -                              GlobalValue::PrivateLinkage, "bits", GEP, M); +      GlobalAlias *Alias = GlobalAlias::create( +          Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M);        BAI->ByteArray->replaceAllUsesWith(Alias);      }      BAI->ByteArray->eraseFromParent(); @@ -404,7 +441,7 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,        // Each use of the byte array uses a different alias. This makes the        // backend less likely to reuse previously computed byte array addresses,        // improving the security of the CFI mechanism based on this pass. -      ByteArray = GlobalAlias::create(BAI->ByteArray->getType(), +      ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0,                                        GlobalValue::PrivateLinkage, "bits_use",                                        ByteArray, M);      } @@ -421,17 +458,16 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,  /// replace the call with.  Value *LowerBitSets::lowerBitSetCall(      CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, -    GlobalVariable *CombinedGlobal, -    const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { +    Constant *CombinedGlobalIntAddr, +    const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) {    Value *Ptr = CI->getArgOperand(0);    const DataLayout &DL = M->getDataLayout();    if (BSI.containsValue(DL, GlobalLayout, Ptr)) -    return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext()); +    return ConstantInt::getTrue(M->getContext()); -  Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy);    Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( -      GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); +      CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset));    BasicBlock *InitialBB = CI->getParent(); @@ -490,18 +526,19 @@ Value *LowerBitSets::lowerBitSetCall(  /// Given a disjoint set of bitsets and globals, layout the globals, build the  /// bit sets and lower the llvm.bitset.test calls. 
-void LowerBitSets::buildBitSetsFromGlobals( -    const std::vector<MDString *> &BitSets, -    const std::vector<GlobalVariable *> &Globals) { +void LowerBitSets::buildBitSetsFromGlobalVariables( +    ArrayRef<Metadata *> BitSets, ArrayRef<GlobalVariable *> Globals) {    // Build a new global with the combined contents of the referenced globals. +  // This global is a struct whose even-indexed elements contain the original +  // contents of the referenced globals and whose odd-indexed elements contain +  // any padding required to align the next element to the next power of 2.    std::vector<Constant *> GlobalInits;    const DataLayout &DL = M->getDataLayout();    for (GlobalVariable *G : Globals) {      GlobalInits.push_back(G->getInitializer()); -    uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType()); +    uint64_t InitSize = DL.getTypeAllocSize(G->getValueType()); -    // Compute the amount of padding required to align the next element to the -    // next power of 2. +    // Compute the amount of padding required.      uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;      // Cap at 128 was found experimentally to have a good data/instruction @@ -515,34 +552,20 @@ void LowerBitSets::buildBitSetsFromGlobals(    if (!GlobalInits.empty())      GlobalInits.pop_back();    Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits); -  auto CombinedGlobal = +  auto *CombinedGlobal =        new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true,                           GlobalValue::PrivateLinkage, NewInit); -  const StructLayout *CombinedGlobalLayout = -      DL.getStructLayout(cast<StructType>(NewInit->getType())); +  StructType *NewTy = cast<StructType>(NewInit->getType()); +  const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy);    // Compute the offsets of the original globals within the new global. -  DenseMap<GlobalVariable *, uint64_t> GlobalLayout; +  DenseMap<GlobalObject *, uint64_t> GlobalLayout;    for (unsigned I = 0; I != Globals.size(); ++I)      // Multiply by 2 to account for padding elements.      GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); -  // For each bitset in this disjoint set... -  for (MDString *BS : BitSets) { -    // Build the bitset. -    BitSetInfo BSI = buildBitSet(BS, GlobalLayout); - -    ByteArrayInfo *BAI = 0; - -    // Lower each call to llvm.bitset.test for this bitset. 
-    for (CallInst *CI : BitSetTestCallSites[BS]) { -      ++NumBitSetCallsLowered; -      Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout); -      CI->replaceAllUsesWith(Lowered); -      CI->eraseFromParent(); -    } -  } +  lowerBitSetCalls(BitSets, CombinedGlobal, GlobalLayout);    // Build aliases pointing to offsets into the combined global for each    // global from which we built the combined global, and replace references @@ -556,9 +579,11 @@ void LowerBitSets::buildBitSetsFromGlobals(      if (LinkerSubsectionsViaSymbols) {        Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr);      } else { -      GlobalAlias *GAlias = -          GlobalAlias::create(Globals[I]->getType(), Globals[I]->getLinkage(), -                              "", CombinedGlobalElemPtr, M); +      assert(Globals[I]->getType()->getAddressSpace() == 0); +      GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0, +                                                Globals[I]->getLinkage(), "", +                                                CombinedGlobalElemPtr, M); +      GAlias->setVisibility(Globals[I]->getVisibility());        GAlias->takeName(Globals[I]);        Globals[I]->replaceAllUsesWith(GAlias);      } @@ -566,6 +591,331 @@ void LowerBitSets::buildBitSetsFromGlobals(    }  } +void LowerBitSets::lowerBitSetCalls( +    ArrayRef<Metadata *> BitSets, Constant *CombinedGlobalAddr, +    const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { +  Constant *CombinedGlobalIntAddr = +      ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); + +  // For each bitset in this disjoint set... +  for (Metadata *BS : BitSets) { +    // Build the bitset. +    BitSetInfo BSI = buildBitSet(BS, GlobalLayout); +    DEBUG({ +      if (auto BSS = dyn_cast<MDString>(BS)) +        dbgs() << BSS->getString() << ": "; +      else +        dbgs() << "<unnamed>: "; +      BSI.print(dbgs()); +    }); + +    ByteArrayInfo *BAI = nullptr; + +    // Lower each call to llvm.bitset.test for this bitset. 
+    for (CallInst *CI : BitSetTestCallSites[BS]) { +      ++NumBitSetCallsLowered; +      Value *Lowered = +          lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout); +      CI->replaceAllUsesWith(Lowered); +      CI->eraseFromParent(); +    } +  } +} + +void LowerBitSets::verifyBitSetMDNode(MDNode *Op) { +  if (Op->getNumOperands() != 3) +    report_fatal_error( +        "All operands of llvm.bitsets metadata must have 3 elements"); +  if (!Op->getOperand(1)) +    return; + +  auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); +  if (!OpConstMD) +    report_fatal_error("Bit set element must be a constant"); +  auto OpGlobal = dyn_cast<GlobalObject>(OpConstMD->getValue()); +  if (!OpGlobal) +    return; + +  if (OpGlobal->isThreadLocal()) +    report_fatal_error("Bit set element may not be thread-local"); +  if (OpGlobal->hasSection()) +    report_fatal_error("Bit set element may not have an explicit section"); + +  if (isa<GlobalVariable>(OpGlobal) && OpGlobal->isDeclarationForLinker()) +    report_fatal_error("Bit set global var element must be a definition"); + +  auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); +  if (!OffsetConstMD) +    report_fatal_error("Bit set element offset must be a constant"); +  auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); +  if (!OffsetInt) +    report_fatal_error("Bit set element offset must be an integer constant"); +} + +static const unsigned kX86JumpTableEntrySize = 8; + +unsigned LowerBitSets::getJumpTableEntrySize() { +  if (Arch != Triple::x86 && Arch != Triple::x86_64) +    report_fatal_error("Unsupported architecture for jump tables"); + +  return kX86JumpTableEntrySize; +} + +// Create a constant representing a jump table entry for the target. This +// consists of an instruction sequence containing a relative branch to Dest. The +// constant will be laid out at address Src+(Len*Distance) where Len is the +// target-specific jump table entry size. +Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest, +                                             unsigned Distance) { +  if (Arch != Triple::x86 && Arch != Triple::x86_64) +    report_fatal_error("Unsupported architecture for jump tables"); + +  const unsigned kJmpPCRel32Code = 0xe9; +  const unsigned kInt3Code = 0xcc; + +  ConstantInt *Jmp = ConstantInt::get(Int8Ty, kJmpPCRel32Code); + +  // Build a constant representing the displacement between the constant's +  // address and Dest. This will resolve to a PC32 relocation referring to Dest. 
+  Constant *DestInt = ConstantExpr::getPtrToInt(Dest, IntPtrTy); +  Constant *SrcInt = ConstantExpr::getPtrToInt(Src, IntPtrTy); +  Constant *Disp = ConstantExpr::getSub(DestInt, SrcInt); +  ConstantInt *DispOffset = +      ConstantInt::get(IntPtrTy, Distance * kX86JumpTableEntrySize + 5); +  Constant *OffsetedDisp = ConstantExpr::getSub(Disp, DispOffset); +  OffsetedDisp = ConstantExpr::getTruncOrBitCast(OffsetedDisp, Int32Ty); + +  ConstantInt *Int3 = ConstantInt::get(Int8Ty, kInt3Code); + +  Constant *Fields[] = { +      Jmp, OffsetedDisp, Int3, Int3, Int3, +  }; +  return ConstantStruct::getAnon(Fields, /*Packed=*/true); +} + +Type *LowerBitSets::getJumpTableEntryType() { +  if (Arch != Triple::x86 && Arch != Triple::x86_64) +    report_fatal_error("Unsupported architecture for jump tables"); + +  return StructType::get(M->getContext(), +                         {Int8Ty, Int32Ty, Int8Ty, Int8Ty, Int8Ty}, +                         /*Packed=*/true); +} + +/// Given a disjoint set of bitsets and functions, build a jump table for the +/// functions, build the bit sets and lower the llvm.bitset.test calls. +void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, +                                             ArrayRef<Function *> Functions) { +  // Unlike the global bitset builder, the function bitset builder cannot +  // re-arrange functions in a particular order and base its calculations on the +  // layout of the functions' entry points, as we have no idea how large a +  // particular function will end up being (the size could even depend on what +  // this pass does!) Instead, we build a jump table, which is a block of code +  // consisting of one branch instruction for each of the functions in the bit +  // set that branches to the target function, and redirect any taken function +  // addresses to the corresponding jump table entry. In the object file's +  // symbol table, the symbols for the target functions also refer to the jump +  // table entries, so that addresses taken outside the module will pass any +  // verification done inside the module. +  // +  // In more concrete terms, suppose we have three functions f, g, h which are +  // members of a single bitset, and a function foo that returns their +  // addresses: +  // +  // f: +  // mov 0, %eax +  // ret +  // +  // g: +  // mov 1, %eax +  // ret +  // +  // h: +  // mov 2, %eax +  // ret +  // +  // foo: +  // mov f, %eax +  // mov g, %edx +  // mov h, %ecx +  // ret +  // +  // To create a jump table for these functions, we instruct the LLVM code +  // generator to output a jump table in the .text section. This is done by +  // representing the instructions in the jump table as an LLVM constant and +  // placing them in a global variable in the .text section. 
The end result will +  // (conceptually) look like this: +  // +  // f: +  // jmp .Ltmp0 ; 5 bytes +  // int3       ; 1 byte +  // int3       ; 1 byte +  // int3       ; 1 byte +  // +  // g: +  // jmp .Ltmp1 ; 5 bytes +  // int3       ; 1 byte +  // int3       ; 1 byte +  // int3       ; 1 byte +  // +  // h: +  // jmp .Ltmp2 ; 5 bytes +  // int3       ; 1 byte +  // int3       ; 1 byte +  // int3       ; 1 byte +  // +  // .Ltmp0: +  // mov 0, %eax +  // ret +  // +  // .Ltmp1: +  // mov 1, %eax +  // ret +  // +  // .Ltmp2: +  // mov 2, %eax +  // ret +  // +  // foo: +  // mov f, %eax +  // mov g, %edx +  // mov h, %ecx +  // ret +  // +  // Because the addresses of f, g, h are evenly spaced at a power of 2, in the +  // normal case the check can be carried out using the same kind of simple +  // arithmetic that we normally use for globals. + +  assert(!Functions.empty()); + +  // Build a simple layout based on the regular layout of jump tables. +  DenseMap<GlobalObject *, uint64_t> GlobalLayout; +  unsigned EntrySize = getJumpTableEntrySize(); +  for (unsigned I = 0; I != Functions.size(); ++I) +    GlobalLayout[Functions[I]] = I * EntrySize; + +  // Create a constant to hold the jump table. +  ArrayType *JumpTableType = +      ArrayType::get(getJumpTableEntryType(), Functions.size()); +  auto JumpTable = new GlobalVariable(*M, JumpTableType, +                                      /*isConstant=*/true, +                                      GlobalValue::PrivateLinkage, nullptr); +  JumpTable->setSection(ObjectFormat == Triple::MachO +                            ? "__TEXT,__text,regular,pure_instructions" +                            : ".text"); +  lowerBitSetCalls(BitSets, JumpTable, GlobalLayout); + +  // Build aliases pointing to offsets into the jump table, and replace +  // references to the original functions with references to the aliases. +  for (unsigned I = 0; I != Functions.size(); ++I) { +    Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( +        ConstantExpr::getGetElementPtr( +            JumpTableType, JumpTable, +            ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), +                                 ConstantInt::get(IntPtrTy, I)}), +        Functions[I]->getType()); +    if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) { +      Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr); +    } else { +      assert(Functions[I]->getType()->getAddressSpace() == 0); +      GlobalAlias *GAlias = GlobalAlias::create(Functions[I]->getValueType(), 0, +                                                Functions[I]->getLinkage(), "", +                                                CombinedGlobalElemPtr, M); +      GAlias->setVisibility(Functions[I]->getVisibility()); +      GAlias->takeName(Functions[I]); +      Functions[I]->replaceAllUsesWith(GAlias); +    } +    if (!Functions[I]->isDeclarationForLinker()) +      Functions[I]->setLinkage(GlobalValue::PrivateLinkage); +  } + +  // Build and set the jump table's initializer. 
+  std::vector<Constant *> JumpTableEntries; +  for (unsigned I = 0; I != Functions.size(); ++I) +    JumpTableEntries.push_back( +        createJumpTableEntry(JumpTable, Functions[I], I)); +  JumpTable->setInitializer( +      ConstantArray::get(JumpTableType, JumpTableEntries)); +} + +void LowerBitSets::buildBitSetsFromDisjointSet( +    ArrayRef<Metadata *> BitSets, ArrayRef<GlobalObject *> Globals) { +  llvm::DenseMap<Metadata *, uint64_t> BitSetIndices; +  llvm::DenseMap<GlobalObject *, uint64_t> GlobalIndices; +  for (unsigned I = 0; I != BitSets.size(); ++I) +    BitSetIndices[BitSets[I]] = I; +  for (unsigned I = 0; I != Globals.size(); ++I) +    GlobalIndices[Globals[I]] = I; + +  // For each bitset, build a set of indices that refer to globals referenced by +  // the bitset. +  std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); +  if (BitSetNM) { +    for (MDNode *Op : BitSetNM->operands()) { +      // Op = { bitset name, global, offset } +      if (!Op->getOperand(1)) +        continue; +      auto I = BitSetIndices.find(Op->getOperand(0)); +      if (I == BitSetIndices.end()) +        continue; + +      auto OpGlobal = dyn_cast<GlobalObject>( +          cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); +      if (!OpGlobal) +        continue; +      BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); +    } +  } + +  // Order the sets of indices by size. The GlobalLayoutBuilder works best +  // when given small index sets first. +  std::stable_sort( +      BitSetMembers.begin(), BitSetMembers.end(), +      [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { +        return O1.size() < O2.size(); +      }); + +  // Create a GlobalLayoutBuilder and provide it with index sets as layout +  // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as +  // close together as possible. +  GlobalLayoutBuilder GLB(Globals.size()); +  for (auto &&MemSet : BitSetMembers) +    GLB.addFragment(MemSet); + +  // Build the bitsets from this disjoint set. +  if (Globals.empty() || isa<GlobalVariable>(Globals[0])) { +    // Build a vector of global variables with the computed layout. +    std::vector<GlobalVariable *> OrderedGVs(Globals.size()); +    auto OGI = OrderedGVs.begin(); +    for (auto &&F : GLB.Fragments) { +      for (auto &&Offset : F) { +        auto GV = dyn_cast<GlobalVariable>(Globals[Offset]); +        if (!GV) +          report_fatal_error( +              "Bit set may not contain both global variables and functions"); +        *OGI++ = GV; +      } +    } + +    buildBitSetsFromGlobalVariables(BitSets, OrderedGVs); +  } else { +    // Build a vector of functions with the computed layout. +    std::vector<Function *> OrderedFns(Globals.size()); +    auto OFI = OrderedFns.begin(); +    for (auto &&F : GLB.Fragments) { +      for (auto &&Offset : F) { +        auto Fn = dyn_cast<Function>(Globals[Offset]); +        if (!Fn) +          report_fatal_error( +              "Bit set may not contain both global variables and functions"); +        *OFI++ = Fn; +      } +    } + +    buildBitSetsFromFunctions(BitSets, OrderedFns); +  } +} +  /// Lower all bit sets in this module.  bool LowerBitSets::buildBitSets() {    Function *BitSetTestFunc = @@ -576,24 +926,36 @@ bool LowerBitSets::buildBitSets() {    // Equivalence class set containing bitsets and the globals they reference.    // This is used to partition the set of bitsets in the module into disjoint    // sets. 
-  typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>> +  typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>>        GlobalClassesTy;    GlobalClassesTy GlobalClasses; +  // Verify the bitset metadata and build a mapping from bitset identifiers to +  // their last observed index in BitSetNM. This will used later to +  // deterministically order the list of bitset identifiers. +  llvm::DenseMap<Metadata *, unsigned> BitSetIdIndices; +  if (BitSetNM) { +    for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) { +      MDNode *Op = BitSetNM->getOperand(I); +      verifyBitSetMDNode(Op); +      BitSetIdIndices[Op->getOperand(0)] = I; +    } +  } +    for (const Use &U : BitSetTestFunc->uses()) {      auto CI = cast<CallInst>(U.getUser());      auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); -    if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata())) +    if (!BitSetMDVal)        report_fatal_error( -          "Second argument of llvm.bitset.test must be metadata string"); -    auto BitSet = cast<MDString>(BitSetMDVal->getMetadata()); +          "Second argument of llvm.bitset.test must be metadata"); +    auto BitSet = BitSetMDVal->getMetadata();      // Add the call site to the list of call sites for this bit set. We also use      // BitSetTestCallSites to keep track of whether we have seen this bit set      // before. If we have, we don't need to re-add the referenced globals to the      // equivalence class. -    std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator, +    std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator,                bool> Ins =          BitSetTestCallSites.insert(              std::make_pair(BitSet, std::vector<CallInst *>())); @@ -608,31 +970,16 @@ bool LowerBitSets::buildBitSets() {      if (!BitSetNM)        continue; -    // Verify the bitset metadata and add the referenced globals to the bitset's -    // equivalence class. +    // Add the referenced globals to the bitset's equivalence class.      for (MDNode *Op : BitSetNM->operands()) { -      if (Op->getNumOperands() != 3) -        report_fatal_error( -            "All operands of llvm.bitsets metadata must have 3 elements"); -        if (Op->getOperand(0) != BitSet || !Op->getOperand(1))          continue; -      auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); -      if (!OpConstMD) -        report_fatal_error("Bit set element must be a constant"); -      auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); +      auto OpGlobal = dyn_cast<GlobalObject>( +          cast<ConstantAsMetadata>(Op->getOperand(1))->getValue());        if (!OpGlobal)          continue; -      auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); -      if (!OffsetConstMD) -        report_fatal_error("Bit set element offset must be a constant"); -      auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); -      if (!OffsetInt) -        report_fatal_error( -            "Bit set element offset must be an integer constant"); -        CurSet = GlobalClasses.unionSets(            CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal)));      } @@ -641,79 +988,51 @@ bool LowerBitSets::buildBitSets() {    if (GlobalClasses.empty())      return false; -  // For each disjoint set we found... +  // Build a list of disjoint sets ordered by their maximum BitSetNM index +  // for determinism. 
+  std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;    for (GlobalClassesTy::iterator I = GlobalClasses.begin(),                                   E = GlobalClasses.end();         I != E; ++I) {      if (!I->isLeader()) continue; -      ++NumBitSetDisjointSets; -    // Build the list of bitsets and referenced globals in this disjoint set. -    std::vector<MDString *> BitSets; -    std::vector<GlobalVariable *> Globals; -    llvm::DenseMap<MDString *, uint64_t> BitSetIndices; -    llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; +    unsigned MaxIndex = 0;      for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);           MI != GlobalClasses.member_end(); ++MI) { -      if ((*MI).is<MDString *>()) { -        BitSetIndices[MI->get<MDString *>()] = BitSets.size(); -        BitSets.push_back(MI->get<MDString *>()); -      } else { -        GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); -        Globals.push_back(MI->get<GlobalVariable *>()); -      } +      if ((*MI).is<Metadata *>()) +        MaxIndex = std::max(MaxIndex, BitSetIdIndices[MI->get<Metadata *>()]);      } +    Sets.emplace_back(I, MaxIndex); +  } +  std::sort(Sets.begin(), Sets.end(), +            [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1, +               const std::pair<GlobalClassesTy::iterator, unsigned> &S2) { +              return S1.second < S2.second; +            }); -    // For each bitset, build a set of indices that refer to globals referenced -    // by the bitset. -    std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); -    if (BitSetNM) { -      for (MDNode *Op : BitSetNM->operands()) { -        // Op = { bitset name, global, offset } -        if (!Op->getOperand(1)) -          continue; -        auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); -        if (I == BitSetIndices.end()) -          continue; - -        auto OpGlobal = dyn_cast<GlobalVariable>( -            cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); -        if (!OpGlobal) -          continue; -        BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); -      } +  // For each disjoint set we found... +  for (const auto &S : Sets) { +    // Build the list of bitsets in this disjoint set. +    std::vector<Metadata *> BitSets; +    std::vector<GlobalObject *> Globals; +    for (GlobalClassesTy::member_iterator MI = +             GlobalClasses.member_begin(S.first); +         MI != GlobalClasses.member_end(); ++MI) { +      if ((*MI).is<Metadata *>()) +        BitSets.push_back(MI->get<Metadata *>()); +      else +        Globals.push_back(MI->get<GlobalObject *>());      } -    // Order the sets of indices by size. The GlobalLayoutBuilder works best -    // when given small index sets first. -    std::stable_sort( -        BitSetMembers.begin(), BitSetMembers.end(), -        [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { -          return O1.size() < O2.size(); -        }); - -    // Create a GlobalLayoutBuilder and provide it with index sets as layout -    // fragments. The GlobalLayoutBuilder tries to lay out members of fragments -    // as close together as possible. -    GlobalLayoutBuilder GLB(Globals.size()); -    for (auto &&MemSet : BitSetMembers) -      GLB.addFragment(MemSet); - -    // Build a vector of globals with the computed layout. 
-    std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); -    auto OGI = OrderedGlobals.begin(); -    for (auto &&F : GLB.Fragments) -      for (auto &&Offset : F) -        *OGI++ = Globals[Offset]; - -    // Order bitsets by name for determinism. -    std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { -      return S1->getString() < S2->getString(); +    // Order bitsets by BitSetNM index for determinism. This ordering is stable +    // as there is a one-to-one mapping between metadata and indices. +    std::sort(BitSets.begin(), BitSets.end(), [&](Metadata *M1, Metadata *M2) { +      return BitSetIdIndices[M1] < BitSetIdIndices[M2];      }); -    // Build the bitsets from this disjoint set. -    buildBitSetsFromGlobals(BitSets, OrderedGlobals); +    // Lower the bitsets in this disjoint set. +    buildBitSetsFromDisjointSet(BitSets, Globals);    }    allocateByteArrays(); diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 2e3519eac6a5..8a209a18c540 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -27,6 +27,14 @@  // -- We define Function* container class with custom "operator<" (FunctionPtr).  // -- "FunctionPtr" instances are stored in std::set collection, so every  //    std::set::insert operation will give you result in log(N) time. +//  +// As an optimization, a hash of the function structure is calculated first, and +// two functions are only compared if they have the same hash. This hash is +// cheap to compute, and has the property that if function F == G according to +// the comparison function, then hash(F) == hash(G). This consistency property +// is critical to ensuring all possible merging opportunities are exploited. +// Collisions in the hash affect the speed of the pass but not the correctness +// or determinism of the resulting transformation.  //  // When a match is found the functions are folded. If both functions are  // overridable, we move the functionality into a new internal function and @@ -87,6 +95,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Hashing.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" @@ -97,12 +106,14 @@  #include "llvm/IR/Module.h"  #include "llvm/IR/Operator.h"  #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h"  #include "llvm/Pass.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/raw_ostream.h"  #include <vector> +  using namespace llvm;  #define DEBUG_TYPE "mergefunc" @@ -121,21 +132,64 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck(  namespace { +/// GlobalNumberState assigns an integer to each global value in the program, +/// which is used by the comparison routine to order references to globals. This +/// state must be preserved throughout the pass, because Functions and other +/// globals need to maintain their relative order. Globals are assigned a number +/// when they are first visited. This order is deterministic, and so the +/// assigned numbers are as well. When two functions are merged, neither number +/// is updated. If the symbols are weak, this would be incorrect. 
If they are +/// strong, then one will be replaced at all references to the other, and so +/// direct callsites will now see one or the other symbol, and no update is +/// necessary. Note that if we were guaranteed unique names, we could just +/// compare those, but this would not work for stripped bitcodes or for those +/// few symbols without a name. +class GlobalNumberState { +  struct Config : ValueMapConfig<GlobalValue*> { +    enum { FollowRAUW = false }; +  }; +  // Each GlobalValue is mapped to an identifier. The Config ensures when RAUW +  // occurs, the mapping does not change. Tracking changes is unnecessary, and +  // also problematic for weak symbols (which may be overwritten). +  typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap; +  ValueNumberMap GlobalNumbers; +  // The next unused serial number to assign to a global. +  uint64_t NextNumber; +  public: +    GlobalNumberState() : GlobalNumbers(), NextNumber(0) {} +    uint64_t getNumber(GlobalValue* Global) { +      ValueNumberMap::iterator MapIter; +      bool Inserted; +      std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber}); +      if (Inserted) +        NextNumber++; +      return MapIter->second; +    } +    void clear() { +      GlobalNumbers.clear(); +    } +}; +  /// FunctionComparator - Compares two functions to determine whether or not  /// they will generate machine code with the same behaviour. DataLayout is  /// used if available. The comparator always fails conservatively (erring on the  /// side of claiming that two functions are different).  class FunctionComparator {  public: -  FunctionComparator(const Function *F1, const Function *F2) -      : FnL(F1), FnR(F2) {} +  FunctionComparator(const Function *F1, const Function *F2, +                     GlobalNumberState* GN) +      : FnL(F1), FnR(F2), GlobalNumbers(GN) {}    /// Test whether the two functions have equivalent behaviour.    int compare(); +  /// Hash a function. Equivalent functions will have the same hash, and unequal +  /// functions will have different hashes with high probability. +  typedef uint64_t FunctionHash; +  static FunctionHash functionHash(Function &);  private:    /// Test whether two basic blocks have equivalent behaviour. -  int compare(const BasicBlock *BBL, const BasicBlock *BBR); +  int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR);    /// Constants comparison.    /// Its analog to lexicographical comparison between hypothetical numbers @@ -241,6 +295,10 @@ private:    /// If these properties are equal - compare their contents.    int cmpConstants(const Constant *L, const Constant *R); +  /// Compares two global values by number. Uses the GlobalNumbersState to +  /// identify the same gobals across function calls. +  int cmpGlobalValues(GlobalValue *L, GlobalValue *R); +    /// Assign or look up previously assigned numbers for the two values, and    /// return whether the numbers are equal. Numbers are assigned in the order    /// visited. @@ -320,8 +378,9 @@ private:    ///    /// 1. If types are of different kind (different type IDs).    ///    Return result of type IDs comparison, treating them as numbers. -  /// 2. If types are vectors or integers, compare Type* values as numbers. -  /// 3. Types has same ID, so check whether they belongs to the next group: +  /// 2. If types are integers, check that they have the same width. If they +  /// are vectors, check that they have the same count and subtype. +  /// 3. 
Types have the same ID, so check whether they are one of:    /// * Void    /// * Float    /// * Double @@ -330,8 +389,7 @@ private:    /// * PPC_FP128    /// * Label    /// * Metadata -  /// If so - return 0, yes - we can treat these types as equal only because -  /// their IDs are same. +  /// We can treat these types as equal whenever their IDs are same.    /// 4. If Left and Right are pointers, return result of address space    /// comparison (numbers comparison). We can treat pointer types of same    /// address space as equal. @@ -343,11 +401,13 @@ private:    int cmpTypes(Type *TyL, Type *TyR) const;    int cmpNumbers(uint64_t L, uint64_t R) const; -    int cmpAPInts(const APInt &L, const APInt &R) const;    int cmpAPFloats(const APFloat &L, const APFloat &R) const; -  int cmpStrings(StringRef L, StringRef R) const; +  int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const; +  int cmpMem(StringRef L, StringRef R) const;    int cmpAttrs(const AttributeSet L, const AttributeSet R) const; +  int cmpRangeMetadata(const MDNode* L, const MDNode* R) const; +  int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const;    // The two functions undergoing comparison.    const Function *FnL, *FnR; @@ -386,30 +446,30 @@ private:    /// could be operands from further BBs we didn't scan yet.    /// So it's impossible to use dominance properties in general.    DenseMap<const Value*, int> sn_mapL, sn_mapR; + +  // The global state we will use +  GlobalNumberState* GlobalNumbers;  };  class FunctionNode {    mutable AssertingVH<Function> F; - +  FunctionComparator::FunctionHash Hash;  public: -  FunctionNode(Function *F) : F(F) {} +  // Note the hash is recalculated potentially multiple times, but it is cheap. +  FunctionNode(Function *F) +    : F(F), Hash(FunctionComparator::functionHash(*F))  {}    Function *getFunc() const { return F; } +  FunctionComparator::FunctionHash getHash() const { return Hash; }    /// Replace the reference to the function F by the function G, assuming their    /// implementations are equal.    void replaceBy(Function *G) const { -    assert(!(*this < FunctionNode(G)) && !(FunctionNode(G) < *this) && -           "The two functions must be equal"); -      F = G;    } -  void release() { F = 0; } -  bool operator<(const FunctionNode &RHS) const { -    return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; -  } +  void release() { F = nullptr; }  }; -} +} // end anonymous namespace  int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {    if (L < R) return -1; @@ -426,13 +486,25 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {  }  int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const { -  if (int Res = cmpNumbers((uint64_t)&L.getSemantics(), -                           (uint64_t)&R.getSemantics())) +  // Floats are ordered first by semantics (i.e. float, double, half, etc.), +  // then by value interpreted as a bitstring (aka APInt). 
+  const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics(); +  if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL), +                           APFloat::semanticsPrecision(SR))) +    return Res; +  if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL), +                           APFloat::semanticsMaxExponent(SR))) +    return Res; +  if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL), +                           APFloat::semanticsMinExponent(SR))) +    return Res; +  if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL), +                           APFloat::semanticsSizeInBits(SR)))      return Res;    return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());  } -int FunctionComparator::cmpStrings(StringRef L, StringRef R) const { +int FunctionComparator::cmpMem(StringRef L, StringRef R) const {    // Prevent heavy comparison, compare sizes first.    if (int Res = cmpNumbers(L.size(), R.size()))      return Res; @@ -466,6 +538,59 @@ int FunctionComparator::cmpAttrs(const AttributeSet L,    return 0;  } +int FunctionComparator::cmpRangeMetadata(const MDNode* L, +                                         const MDNode* R) const { +  if (L == R) +    return 0; +  if (!L) +    return -1; +  if (!R) +    return 1; +  // Range metadata is a sequence of numbers. Make sure they are the same +  // sequence.  +  // TODO: Note that as this is metadata, it is possible to drop and/or merge +  // this data when considering functions to merge. Thus this comparison would +  // return 0 (i.e. equivalent), but merging would become more complicated +  // because the ranges would need to be unioned. It is not likely that +  // functions differ ONLY in this metadata if they are actually the same +  // function semantically. +  if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) +    return Res; +  for (size_t I = 0; I < L->getNumOperands(); ++I) { +    ConstantInt* LLow = mdconst::extract<ConstantInt>(L->getOperand(I)); +    ConstantInt* RLow = mdconst::extract<ConstantInt>(R->getOperand(I)); +    if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue())) +      return Res; +  } +  return 0; +} + +int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L, +                                                const Instruction *R) const { +  ImmutableCallSite LCS(L); +  ImmutableCallSite RCS(R); + +  assert(LCS && RCS && "Must be calls or invokes!"); +  assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!"); + +  if (int Res = +          cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles())) +    return Res; + +  for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) { +    auto OBL = LCS.getOperandBundleAt(i); +    auto OBR = RCS.getOperandBundleAt(i); + +    if (int Res = OBL.getTagName().compare(OBR.getTagName())) +      return Res; + +    if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size())) +      return Res; +  } + +  return 0; +} +  /// Constants comparison:  /// 1. Check whether type of L constant could be losslessly bitcasted to R  /// type. 
@@ -500,9 +625,9 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {      unsigned TyLWidth = 0;      unsigned TyRWidth = 0; -    if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL)) +    if (auto *VecTyL = dyn_cast<VectorType>(TyL))        TyLWidth = VecTyL->getBitWidth(); -    if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR)) +    if (auto *VecTyR = dyn_cast<VectorType>(TyR))        TyRWidth = VecTyR->getBitWidth();      if (TyLWidth != TyRWidth) @@ -538,11 +663,29 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {    if (!L->isNullValue() && R->isNullValue())      return -1; +  auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L)); +  auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R)); +  if (GlobalValueL && GlobalValueR) { +    return cmpGlobalValues(GlobalValueL, GlobalValueR); +  } +    if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))      return Res; +  if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) { +    const auto *SeqR = cast<ConstantDataSequential>(R); +    // This handles ConstantDataArray and ConstantDataVector. Note that we +    // compare the two raw data arrays, which might differ depending on the host +    // endianness. This isn't a problem though, because the endiness of a module +    // will affect the order of the constants, but this order is the same +    // for a given input module and host platform. +    return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues()); +  } +    switch (L->getValueID()) { -  case Value::UndefValueVal: return TypesRes; +  case Value::UndefValueVal: +  case Value::ConstantTokenNoneVal: +    return TypesRes;    case Value::ConstantIntVal: {      const APInt &LInt = cast<ConstantInt>(L)->getValue();      const APInt &RInt = cast<ConstantInt>(R)->getValue(); @@ -609,19 +752,55 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) {      }      return 0;    } -  case Value::FunctionVal: -  case Value::GlobalVariableVal: -  case Value::GlobalAliasVal: -  default: // Unknown constant, cast L and R pointers to numbers and compare. -    return cmpNumbers((uint64_t)L, (uint64_t)R); +  case Value::BlockAddressVal: { +    const BlockAddress *LBA = cast<BlockAddress>(L); +    const BlockAddress *RBA = cast<BlockAddress>(R); +    if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction())) +      return Res; +    if (LBA->getFunction() == RBA->getFunction()) { +      // They are BBs in the same function. Order by which comes first in the +      // BB order of the function. This order is deterministic. +      Function* F = LBA->getFunction(); +      BasicBlock *LBB = LBA->getBasicBlock(); +      BasicBlock *RBB = RBA->getBasicBlock(); +      if (LBB == RBB) +        return 0; +      for(BasicBlock &BB : F->getBasicBlockList()) { +        if (&BB == LBB) { +          assert(&BB != RBB); +          return -1; +        } +        if (&BB == RBB) +          return 1; +      } +      llvm_unreachable("Basic Block Address does not point to a basic block in " +                       "its function."); +      return -1; +    } else { +      // cmpValues said the functions are the same. So because they aren't +      // literally the same pointer, they must respectively be the left and +      // right functions. 
+      assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR); +      // cmpValues will tell us if these are equivalent BasicBlocks, in the +      // context of their respective functions. +      return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock()); +    }    } +  default: // Unknown constant, abort. +    DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n"); +    llvm_unreachable("Constant ValueID not recognized."); +    return -1; +  } +} + +int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue* R) { +  return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R));  }  /// cmpType - compares two types,  /// defines total ordering among the types set.  /// See method declaration comments for more details.  int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { -    PointerType *PTyL = dyn_cast<PointerType>(TyL);    PointerType *PTyR = dyn_cast<PointerType>(TyR); @@ -642,10 +821,15 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {      llvm_unreachable("Unknown type!");      // Fall through in Release mode.    case Type::IntegerTyID: -  case Type::VectorTyID: -    // TyL == TyR would have returned true earlier. -    return cmpNumbers((uint64_t)TyL, (uint64_t)TyR); - +    return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(), +                      cast<IntegerType>(TyR)->getBitWidth()); +  case Type::VectorTyID: { +    VectorType *VTyL = cast<VectorType>(TyL), *VTyR = cast<VectorType>(TyR); +    if (int Res = cmpNumbers(VTyL->getNumElements(), VTyR->getNumElements())) +      return Res; +    return cmpTypes(VTyL->getElementType(), VTyR->getElementType()); +  } +  // TyL == TyR would have returned true earlier, because types are uniqued.    case Type::VoidTyID:    case Type::FloatTyID:    case Type::DoubleTyID: @@ -654,6 +838,7 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {    case Type::PPC_FP128TyID:    case Type::LabelTyID:    case Type::MetadataTyID: +  case Type::TokenTyID:      return 0;    case Type::PointerTyID: { @@ -759,8 +944,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,      if (int Res =              cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))        return Res; -    return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), -                      (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); +    return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range), +        cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));    }    if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {      if (int Res = @@ -783,20 +968,24 @@ int FunctionComparator::cmpOperations(const Instruction *L,      if (int Res =              cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes()))        return Res; -    return cmpNumbers( -        (uint64_t)CI->getMetadata(LLVMContext::MD_range), -        (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); +    if (int Res = cmpOperandBundlesSchema(CI, R)) +      return Res; +    return cmpRangeMetadata( +        CI->getMetadata(LLVMContext::MD_range), +        cast<CallInst>(R)->getMetadata(LLVMContext::MD_range));    } -  if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) { -    if (int Res = cmpNumbers(CI->getCallingConv(), +  if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) { +    if (int Res = cmpNumbers(II->getCallingConv(),                               cast<InvokeInst>(R)->getCallingConv()))        return Res;      if (int Res = -            
cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes())) +            cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes())) +      return Res; +    if (int Res = cmpOperandBundlesSchema(II, R))        return Res; -    return cmpNumbers( -        (uint64_t)CI->getMetadata(LLVMContext::MD_range), -        (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); +    return cmpRangeMetadata( +        II->getMetadata(LLVMContext::MD_range), +        cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range));    }    if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {      ArrayRef<unsigned> LIndices = IVI->getIndices(); @@ -876,9 +1065,8 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,    if (GEPL->accumulateConstantOffset(DL, OffsetL) &&        GEPR->accumulateConstantOffset(DL, OffsetR))      return cmpAPInts(OffsetL, OffsetR); - -  if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), -                           (uint64_t)GEPR->getPointerOperand()->getType())) +  if (int Res = cmpTypes(GEPL->getSourceElementType(), +                         GEPR->getSourceElementType()))      return Res;    if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) @@ -892,6 +1080,28 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,    return 0;  } +int FunctionComparator::cmpInlineAsm(const InlineAsm *L, +                                     const InlineAsm *R) const { +  // InlineAsm's are uniqued. If they are the same pointer, obviously they are +  // the same, otherwise compare the fields. +  if (L == R) +    return 0; +  if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType())) +    return Res; +  if (int Res = cmpMem(L->getAsmString(), R->getAsmString())) +    return Res; +  if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString())) +    return Res; +  if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects())) +    return Res; +  if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack())) +    return Res; +  if (int Res = cmpNumbers(L->getDialect(), R->getDialect())) +    return Res; +  llvm_unreachable("InlineAsm blocks were not uniqued."); +  return 0; +} +  /// Compare two values used by the two functions under pair-wise comparison. If  /// this is the first time the values are seen, they're added to the mapping so  /// that we will detect mismatches on next use. @@ -926,7 +1136,7 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) {    const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);    if (InlineAsmL && InlineAsmR) -    return cmpNumbers((uint64_t)L, (uint64_t)R); +    return cmpInlineAsm(InlineAsmL, InlineAsmR);    if (InlineAsmL)      return 1;    if (InlineAsmR) @@ -938,12 +1148,13 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) {    return cmpNumbers(LeftSN.first->second, RightSN.first->second);  }  // Test whether two basic blocks have equivalent behaviour. 
-int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { +int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL, +                                       const BasicBlock *BBR) {    BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();    BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();    do { -    if (int Res = cmpValues(InstL, InstR)) +    if (int Res = cmpValues(&*InstL, &*InstR))        return Res;      const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL); @@ -961,7 +1172,7 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) {        if (int Res = cmpGEPs(GEPL, GEPR))          return Res;      } else { -      if (int Res = cmpOperations(InstL, InstR)) +      if (int Res = cmpOperations(&*InstL, &*InstR))          return Res;        assert(InstL->getNumOperands() == InstR->getNumOperands()); @@ -970,11 +1181,8 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) {          Value *OpR = InstR->getOperand(i);          if (int Res = cmpValues(OpL, OpR))            return Res; -        if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) -          return Res; -        // TODO: Already checked in cmpOperation -        if (int Res = cmpTypes(OpL->getType(), OpR->getType())) -          return Res; +        // cmpValues should ensure this is true. +        assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);        }      } @@ -990,7 +1198,6 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) {  // Test whether the two functions have equivalent behaviour.  int FunctionComparator::compare() { -    sn_mapL.clear();    sn_mapR.clear(); @@ -1001,7 +1208,7 @@ int FunctionComparator::compare() {      return Res;    if (FnL->hasGC()) { -    if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) +    if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))        return Res;    } @@ -1009,7 +1216,7 @@ int FunctionComparator::compare() {      return Res;    if (FnL->hasSection()) { -    if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) +    if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))        return Res;    } @@ -1033,7 +1240,7 @@ int FunctionComparator::compare() {                                      ArgRI = FnR->arg_begin(),                                      ArgLE = FnL->arg_end();         ArgLI != ArgLE; ++ArgLI, ++ArgRI) { -    if (cmpValues(ArgLI, ArgRI) != 0) +    if (cmpValues(&*ArgLI, &*ArgRI) != 0)        llvm_unreachable("Arguments repeat!");    } @@ -1055,7 +1262,7 @@ int FunctionComparator::compare() {      if (int Res = cmpValues(BBL, BBR))        return Res; -    if (int Res = compare(BBL, BBR)) +    if (int Res = cmpBasicBlocks(BBL, BBR))        return Res;      const TerminatorInst *TermL = BBL->getTerminator(); @@ -1074,6 +1281,68 @@ int FunctionComparator::compare() {  }  namespace { +// Accumulate the hash of a sequence of 64-bit integers. This is similar to a +// hash of a sequence of 64bit ints, but the entire input does not need to be +// available at once. This interface is necessary for functionHash because it +// needs to accumulate the hash as the structure of the function is traversed +// without saving these values to an intermediate buffer. This form of hashing +// is not often needed, as usually the object to hash is just read from a +// buffer. 
+class HashAccumulator64 { +  uint64_t Hash; +public: +  // Initialize to random constant, so the state isn't zero. +  HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; } +  void add(uint64_t V) { +     Hash = llvm::hashing::detail::hash_16_bytes(Hash, V); +  } +  // No finishing is required, because the entire hash value is used. +  uint64_t getHash() { return Hash; } +}; +} // end anonymous namespace + +// A function hash is calculated by considering only the number of arguments and +// whether a function is varargs, the order of basic blocks (given by the +// successors of each basic block in depth first order), and the order of +// opcodes of each instruction within each of these basic blocks. This mirrors +// the strategy compare() uses to compare functions by walking the BBs in depth +// first order and comparing each instruction in sequence. Because this hash +// does not look at the operands, it is insensitive to things such as the +// target of calls and the constants used in the function, which makes it useful +// when possibly merging functions which are the same modulo constants and call +// targets. +FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) { +  HashAccumulator64 H; +  H.add(F.isVarArg()); +  H.add(F.arg_size()); +   +  SmallVector<const BasicBlock *, 8> BBs; +  SmallSet<const BasicBlock *, 16> VisitedBBs; + +  // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(), +  // accumulating the hash of the function "structure." (BB and opcode sequence) +  BBs.push_back(&F.getEntryBlock()); +  VisitedBBs.insert(BBs[0]); +  while (!BBs.empty()) { +    const BasicBlock *BB = BBs.pop_back_val(); +    // This random value acts as a block header, as otherwise the partition of +    // opcodes into BBs wouldn't affect the hash, only the order of the opcodes +    H.add(45798);  +    for (auto &Inst : *BB) { +      H.add(Inst.getOpcode()); +    } +    const TerminatorInst *Term = BB->getTerminator(); +    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { +      if (!VisitedBBs.insert(Term->getSuccessor(i)).second) +        continue; +      BBs.push_back(Term->getSuccessor(i)); +    } +  } +  return H.getHash(); +} + + +namespace {  /// MergeFunctions finds functions which will generate identical machine code,  /// by considering all pointer types to be equivalent. Once identified, @@ -1084,14 +1353,31 @@ class MergeFunctions : public ModulePass {  public:    static char ID;    MergeFunctions() -    : ModulePass(ID), HasGlobalAliases(false) { +    : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(), +      HasGlobalAliases(false) {      initializeMergeFunctionsPass(*PassRegistry::getPassRegistry());    }    bool runOnModule(Module &M) override;  private: -  typedef std::set<FunctionNode> FnTreeType; +  // The function comparison operator is provided here so that FunctionNodes do +  // not need to become larger with another pointer. +  class FunctionNodeCmp { +    GlobalNumberState* GlobalNumbers; +  public: +    FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {} +    bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const { +      // Order first by hashes, then full function comparison. 
+      if (LHS.getHash() != RHS.getHash()) +        return LHS.getHash() < RHS.getHash(); +      FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers); +      return FCmp.compare() == -1; +    } +  }; +  typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType; + +  GlobalNumberState GlobalNumbers;    /// A work queue of functions that may have been modified and should be    /// analyzed again. @@ -1133,17 +1419,23 @@ private:    void writeAlias(Function *F, Function *G);    /// Replace function F with function G in the function tree. -  void replaceFunctionInTree(FnTreeType::iterator &IterToF, Function *G); +  void replaceFunctionInTree(const FunctionNode &FN, Function *G);    /// The set of all distinct functions. Use the insert() and remove() methods -  /// to modify it. +  /// to modify it. The map allows efficient lookup and deferring of Functions.    FnTreeType FnTree; +  // Map functions to the iterators of the FunctionNode which contains them +  // in the FnTree. This must be updated carefully whenever the FnTree is +  // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid +  // dangling iterators into FnTree. The invariant that preserves this is that +  // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree. +  ValueMap<Function*, FnTreeType::iterator> FNodesInTree;    /// Whether or not the target supports global aliases.    bool HasGlobalAliases;  }; -}  // end anonymous namespace +} // end anonymous namespace  char MergeFunctions::ID = 0;  INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) @@ -1166,8 +1458,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {        for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) {          Function *F1 = cast<Function>(*I);          Function *F2 = cast<Function>(*J); -        int Res1 = FunctionComparator(F1, F2).compare(); -        int Res2 = FunctionComparator(F2, F1).compare(); +        int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare(); +        int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();          // If F1 <= F2, then F2 >= F1, otherwise report failure.          if (Res1 != -Res2) { @@ -1188,8 +1480,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {              continue;            Function *F3 = cast<Function>(*K); -          int Res3 = FunctionComparator(F1, F3).compare(); -          int Res4 = FunctionComparator(F2, F3).compare(); +          int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare(); +          int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();            bool Transitive = true; @@ -1227,11 +1519,33 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {  bool MergeFunctions::runOnModule(Module &M) {    bool Changed = false; -  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) -      Deferred.push_back(WeakVH(I)); +  // All functions in the module, ordered by hash. Functions with a unique +  // hash value are easily eliminated. 
+  std::vector<std::pair<FunctionComparator::FunctionHash, Function *>> +    HashedFuncs; +  for (Function &Func : M) { +    if (!Func.isDeclaration() && !Func.hasAvailableExternallyLinkage()) { +      HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func}); +    }     } +  std::stable_sort( +      HashedFuncs.begin(), HashedFuncs.end(), +      [](const std::pair<FunctionComparator::FunctionHash, Function *> &a, +         const std::pair<FunctionComparator::FunctionHash, Function *> &b) { +        return a.first < b.first; +      }); + +  auto S = HashedFuncs.begin(); +  for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) { +    // If the hash value matches the previous value or the next one, we must +    // consider merging it. Otherwise it is dropped and never considered again. +    if ((I != S && std::prev(I)->first == I->first) || +        (std::next(I) != IE && std::next(I)->first == I->first) ) { +      Deferred.push_back(WeakVH(I->second)); +    } +  } +      do {      std::vector<WeakVH> Worklist;      Deferred.swap(Worklist); @@ -1270,6 +1584,7 @@ bool MergeFunctions::runOnModule(Module &M) {    } while (!Deferred.empty());    FnTree.clear(); +  GlobalNumbers.clear();    return Changed;  } @@ -1282,6 +1597,32 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {      ++UI;      CallSite CS(U->getUser());      if (CS && CS.isCallee(U)) { +      // Transfer the called function's attributes to the call site. Due to the +      // bitcast we will 'lose' ABI changing attributes because the 'called +      // function' is no longer a Function* but the bitcast. Code that looks up +      // the attributes from the called function will fail. + +      // FIXME: This is not actually true, at least not anymore. The callsite +      // will always have the same ABI affecting attributes as the callee, +      // because otherwise the original input has UB. Note that Old and New +      // always have matching ABI, so no attributes need to be changed. +      // Transferring other attributes may help other optimizations, but that +      // should be done uniformly and not in this ad-hoc way. 
+      auto &Context = New->getContext(); +      auto NewFuncAttrs = New->getAttributes(); +      auto CallSiteAttrs = CS.getAttributes(); + +      CallSiteAttrs = CallSiteAttrs.addAttributes( +          Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); + +      for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { +        AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); +        if (Attrs.getNumSlots()) +          CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); +      } + +      CS.setAttributes(CallSiteAttrs); +        remove(CS.getInstruction()->getParent()->getParent());        U->set(BitcastNew);      } @@ -1352,15 +1693,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {    SmallVector<Value *, 16> Args;    unsigned i = 0;    FunctionType *FFTy = F->getFunctionType(); -  for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); -       AI != AE; ++AI) { -    Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); +  for (Argument & AI : NewG->args()) { +    Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));      ++i;    }    CallInst *CI = Builder.CreateCall(F, Args);    CI->setTailCall();    CI->setCallingConv(F->getCallingConv()); +  CI->setAttributes(F->getAttributes());    if (NewG->getReturnType()->isVoidTy()) {      Builder.CreateRetVoid();    } else { @@ -1379,8 +1720,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {  // Replace G with an alias to F and delete G.  void MergeFunctions::writeAlias(Function *F, Function *G) { -  PointerType *PTy = G->getType(); -  auto *GA = GlobalAlias::create(PTy, G->getLinkage(), "", F); +  auto *GA = GlobalAlias::create(G->getLinkage(), "", F);    F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));    GA->takeName(G);    GA->setVisibility(G->getVisibility()); @@ -1425,19 +1765,24 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {    ++NumFunctionsMerged;  } -/// Replace function F for function G in the map. -void MergeFunctions::replaceFunctionInTree(FnTreeType::iterator &IterToF, +/// Replace function F by function G. +void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,                                             Function *G) { -  Function *F = IterToF->getFunc(); - -  // A total order is already guaranteed otherwise because we process strong -  // functions before weak functions. -  assert(((F->mayBeOverridden() && G->mayBeOverridden()) || -          (!F->mayBeOverridden() && !G->mayBeOverridden())) && -         "Only change functions if both are strong or both are weak"); -  (void)F; - -  IterToF->replaceBy(G); +  Function *F = FN.getFunc(); +  assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 && +         "The two functions must be equal"); +   +  auto I = FNodesInTree.find(F); +  assert(I != FNodesInTree.end() && "F should be in FNodesInTree"); +  assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G"); +   +  FnTreeType::iterator IterToFNInFnTree = I->second; +  assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree."); +  // Remove F -> FN and insert G -> FN +  FNodesInTree.erase(I); +  FNodesInTree.insert({G, IterToFNInFnTree}); +  // Replace F with G in FN, which is stored inside the FnTree. 
+  FN.replaceBy(G);  }  // Insert a ComparableFunction into the FnTree, or merge it away if equal to one @@ -1447,6 +1792,8 @@ bool MergeFunctions::insert(Function *NewFunction) {        FnTree.insert(FunctionNode(NewFunction));    if (Result.second) { +    assert(FNodesInTree.count(NewFunction) == 0); +    FNodesInTree.insert({NewFunction, Result.first});      DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');      return false;    } @@ -1476,7 +1823,7 @@ bool MergeFunctions::insert(Function *NewFunction) {      if (OldF.getFunc()->getName() > NewFunction->getName()) {        // Swap the two functions.        Function *F = OldF.getFunc(); -      replaceFunctionInTree(Result.first, NewFunction); +      replaceFunctionInTree(*Result.first, NewFunction);        NewFunction = F;        assert(OldF.getFunc() != F && "Must have swapped the functions.");      } @@ -1495,18 +1842,13 @@ bool MergeFunctions::insert(Function *NewFunction) {  // Remove a function from FnTree. If it was already in FnTree, add  // it to Deferred so that we'll look at it in the next round.  void MergeFunctions::remove(Function *F) { -  // We need to make sure we remove F, not a function "equal" to F per the -  // function equality comparator. -  FnTreeType::iterator found = FnTree.find(FunctionNode(F)); -  size_t Erased = 0; -  if (found != FnTree.end() && found->getFunc() == F) { -    Erased = 1; -    FnTree.erase(found); -  } - -  if (Erased) { -    DEBUG(dbgs() << "Removed " << F->getName() -                 << " from set and deferred it.\n"); +  auto I = FNodesInTree.find(F); +  if (I != FNodesInTree.end()) { +    DEBUG(dbgs() << "Deferred " << F->getName()<< ".\n"); +    FnTree.erase(I->second); +    // I->second has been invalidated, remove it from the FNodesInTree map to +    // preserve the invariant. +    FNodesInTree.erase(I);      Deferred.emplace_back(F);    }  } @@ -1516,6 +1858,8 @@ void MergeFunctions::remove(Function *F) {  void MergeFunctions::removeUsers(Value *V) {    std::vector<Value *> Worklist;    Worklist.push_back(V); +  SmallSet<Value*, 8> Visited; +  Visited.insert(V);    while (!Worklist.empty()) {      Value *V = Worklist.back();      Worklist.pop_back(); @@ -1526,8 +1870,10 @@ void MergeFunctions::removeUsers(Value *V) {        } else if (isa<GlobalValue>(U)) {          // do nothing        } else if (Constant *C = dyn_cast<Constant>(U)) { -        for (User *UU : C->users()) -          Worklist.push_back(UU); +        for (User *UU : C->users()) { +          if (!Visited.insert(UU).second) +            Worklist.push_back(UU); +        }        }      }    } diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index 4a7cb7ba7d12..0c5c84bbccab 100644 --- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -50,7 +50,7 @@ ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); }  Function* PartialInliner::unswitchFunction(Function* F) {    // First, verify that this function is an unswitching candidate... -  BasicBlock* entryBlock = F->begin(); +  BasicBlock *entryBlock = &F->front();    BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator());    if (!BR || BR->isUnconditional())      return nullptr; @@ -89,18 +89,18 @@ Function* PartialInliner::unswitchFunction(Function* F) {    // of which will go outside.    
BasicBlock* preReturn = newReturnBlock;    newReturnBlock = newReturnBlock->splitBasicBlock( -                                              newReturnBlock->getFirstNonPHI()); +      newReturnBlock->getFirstNonPHI()->getIterator());    BasicBlock::iterator I = preReturn->begin(); -  BasicBlock::iterator Ins = newReturnBlock->begin(); +  Instruction *Ins = &newReturnBlock->front();    while (I != preReturn->end()) {      PHINode* OldPhi = dyn_cast<PHINode>(I);      if (!OldPhi) break; -     -    PHINode* retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); + +    PHINode *retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins);      OldPhi->replaceAllUsesWith(retPhi);      Ins = newReturnBlock->getFirstNonPHI(); -     -    retPhi->addIncoming(I, preReturn); + +    retPhi->addIncoming(&*I, preReturn);      retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock),                          newEntryBlock);      OldPhi->removeIncomingValue(newEntryBlock); @@ -116,8 +116,8 @@ Function* PartialInliner::unswitchFunction(Function* F) {         FE = duplicateFunction->end(); FI != FE; ++FI)      if (&*FI != newEntryBlock && &*FI != newReturnBlock &&          &*FI != newNonReturnBlock) -      toExtract.push_back(FI); -       +      toExtract.push_back(&*FI); +    // The CodeExtractor needs a dominator tree.    DominatorTree DT;    DT.recalculate(*duplicateFunction); diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 909baae92548..9876efa7b235 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -12,19 +12,26 @@  //  //===----------------------------------------------------------------------===// -  #include "llvm/Transforms/IPO/PassManagerBuilder.h"  #include "llvm-c/Transforms/PassManagerBuilder.h"  #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h"  #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/FunctionInfo.h"  #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/ManagedStatic.h" -#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Target/TargetMachine.h"  #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h"  #include "llvm/Transforms/Scalar.h"  #include "llvm/Transforms/Vectorize.h" @@ -89,11 +96,21 @@ static cl::opt<bool> EnableLoopDistribute(      "enable-loop-distribute", cl::init(false), cl::Hidden,      cl::desc("Enable the new, experimental LoopDistribution Pass")); +static cl::opt<bool> EnableNonLTOGlobalsModRef( +    "enable-non-lto-gmr", cl::init(true), cl::Hidden, +    cl::desc( +        "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); + +static cl::opt<bool> EnableLoopLoadElim( +    "enable-loop-load-elim", cl::init(false), cl::Hidden, +    cl::desc("Enable the new, experimental LoopLoadElimination Pass")); +  PassManagerBuilder::PassManagerBuilder() {      OptLevel = 2;      SizeLevel = 0;      LibraryInfo = nullptr;      Inliner = nullptr; +    FunctionIndex = nullptr;      DisableUnitAtATime = 
false;      DisableUnrollLoops = false;      BBVectorize = RunBBVectorization; @@ -143,10 +160,9 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses(    // BasicAliasAnalysis wins if they disagree. This is intended to help    // support "obvious" type-punning idioms.    if (UseCFLAA) -    PM.add(createCFLAliasAnalysisPass()); -  PM.add(createTypeBasedAliasAnalysisPass()); -  PM.add(createScopedNoAliasAAPass()); -  PM.add(createBasicAliasAnalysisPass()); +    PM.add(createCFLAAWrapperPass()); +  PM.add(createTypeBasedAAWrapperPass()); +  PM.add(createScopedNoAliasAAWrapperPass());  }  void PassManagerBuilder::populateFunctionPassManager( @@ -172,6 +188,9 @@ void PassManagerBuilder::populateFunctionPassManager(  void PassManagerBuilder::populateModulePassManager(      legacy::PassManagerBase &MPM) { +  // Allow forcing function attributes as a debugging and tuning aid. +  MPM.add(createForceFunctionAttrsLegacyPass()); +    // If all optimizations are disabled, just run the always-inline pass and,    // if enabled, the function merging pass.    if (OptLevel == 0) { @@ -201,10 +220,15 @@ void PassManagerBuilder::populateModulePassManager(    addInitialAliasAnalysisPasses(MPM);    if (!DisableUnitAtATime) { +    // Infer attributes about declarations if possible. +    MPM.add(createInferFunctionAttrsLegacyPass()); +      addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);      MPM.add(createIPSCCPPass());              // IP SCCP      MPM.add(createGlobalOptimizerPass());     // Optimize out global vars +    // Promote any localized global vars +    MPM.add(createPromoteMemoryToRegisterPass());      MPM.add(createDeadArgEliminationPass());  // Dead argument elimination @@ -213,6 +237,12 @@ void PassManagerBuilder::populateModulePassManager(      MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE    } +  if (EnableNonLTOGlobalsModRef) +    // We add a module alias analysis pass here. In part due to bugs in the +    // analysis infrastructure this "works" in that the analysis stays alive +    // for the entire SCC pass run below. +    MPM.add(createGlobalsAAWrapperPass()); +    // Start of CallGraph SCC passes.    if (!DisableUnitAtATime)      MPM.add(createPruneEHPass());             // Remove dead EH info @@ -245,6 +275,7 @@ void PassManagerBuilder::populateModulePassManager(    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));    MPM.add(createLICMPass());                  // Hoist loop invariants    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); +  MPM.add(createCFGSimplificationPass());    MPM.add(createInstructionCombiningPass());    MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars    MPM.add(createLoopIdiomPass());             // Recognize idioms like memset. @@ -315,9 +346,42 @@ void PassManagerBuilder::populateModulePassManager(    // we must insert a no-op module pass to reset the pass manager.    MPM.add(createBarrierNoopPass()); +  if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO) { +    // Remove avail extern fns and globals definitions if we aren't +    // compiling an object file for later LTO. For LTO we want to preserve +    // these so they are eligible for inlining at link-time. Note if they +    // are unreferenced they will be removed by GlobalDCE later, so +    // this only impacts referenced available externally globals. 
+    // Eventually they will be suppressed during codegen, but eliminating +    // here enables more opportunity for GlobalDCE as it may make +    // globals referenced by available external functions dead +    // and saves running remaining passes on the eliminated functions. +    MPM.add(createEliminateAvailableExternallyPass()); +  } + +  if (EnableNonLTOGlobalsModRef) +    // We add a fresh GlobalsModRef run at this point. This is particularly +    // useful as the above will have inlined, DCE'ed, and function-attr +    // propagated everything. We should at this point have a reasonably minimal +    // and richly annotated call graph. By computing aliasing and mod/ref +    // information for all local globals here, the late loop passes and notably +    // the vectorizer will be able to use them to help recognize vectorizable +    // memory operations. +    // +    // Note that this relies on a bug in the pass manager which preserves +    // a module analysis into a function pass pipeline (and throughout it) so +    // long as the first function pass doesn't invalidate the module analysis. +    // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for +    // this to work. Fortunately, it is trivial to preserve AliasAnalysis +    // (doing nothing preserves it as it is required to be conservatively +    // correct in the face of IR changes). +    MPM.add(createGlobalsAAWrapperPass()); +    if (RunFloat2Int)      MPM.add(createFloat2IntPass()); +  addExtensionsToPM(EP_VectorizerStart, MPM); +    // Re-rotate loops in all our loop nests. These may have fallout out of    // rotated form due to GVN or other transformations, and the vectorizer relies    // on the rotated form. Disable header duplication at -Oz. @@ -329,6 +393,12 @@ void PassManagerBuilder::populateModulePassManager(      MPM.add(createLoopDistributePass());    MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + +  // Eliminate loads by forwarding stores from the previous iteration to loads +  // of the current iteration. +  if (EnableLoopLoadElim) +    MPM.add(createLoopLoadEliminationPass()); +    // FIXME: Because of #pragma vectorize enable, the passes below are always    // inserted in the pipeline, even when the vectorizer doesn't run (ex. when    // on -O1 and no #pragma is found). Would be good to have these two passes @@ -402,17 +472,6 @@ void PassManagerBuilder::populateModulePassManager(      // GlobalOpt already deletes dead functions and globals, at -O2 try a      // late pass of GlobalDCE.  It is capable of deleting dead cycles.      if (OptLevel > 1) { -      if (!PrepareForLTO) { -        // Remove avail extern fns and globals definitions if we aren't -        // compiling an object file for later LTO. For LTO we want to preserve -        // these so they are eligible for inlining at link-time. Note if they -        // are unreferenced they will be removed by GlobalDCE below, so -        // this only impacts referenced available externally globals. -        // Eventually they will be suppressed during codegen, but eliminating -        // here enables more opportunity for GlobalDCE as it may make -        // globals referenced by available external functions dead. -        MPM.add(createEliminateAvailableExternallyPass()); -      }        MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.        
MPM.add(createConstantMergePass());     // Merge dup global constants      } @@ -428,13 +487,25 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {    // Provide AliasAnalysis services for optimizations.    addInitialAliasAnalysisPasses(PM); +  if (FunctionIndex) +    PM.add(createFunctionImportPass(FunctionIndex)); + +  // Allow forcing function attributes as a debugging and tuning aid. +  PM.add(createForceFunctionAttrsLegacyPass()); + +  // Infer attributes about declarations if possible. +  PM.add(createInferFunctionAttrsLegacyPass()); +    // Propagate constants at call sites into the functions they call.  This    // opens opportunities for globalopt (and inlining) by substituting function    // pointers passed as arguments to direct uses of functions.    PM.add(createIPSCCPPass());    // Now that we internalized some globals, see if we can hack on them! +  PM.add(createFunctionAttrsPass()); // Add norecurse if possible.    PM.add(createGlobalOptimizerPass()); +  // Promote any localized global vars. +  PM.add(createPromoteMemoryToRegisterPass());    // Linking modules together can lead to duplicated global constants, only    // keep one copy of each constant. @@ -481,7 +552,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {    // Run a few AA driven optimizations here and now, to cleanup the code.    PM.add(createFunctionAttrsPass()); // Add nocapture. -  PM.add(createGlobalsModRefPass()); // IP alias analysis. +  PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.    PM.add(createLICMPass());                 // Hoist loop invariants.    if (EnableMLSM) @@ -500,6 +571,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {    PM.add(createLoopVectorizePass(true, LoopVectorize)); +  // Now that we've optimized loops (in particular loop induction variables), +  // we may have exposed more scalar opportunities. Run parts of the scalar +  // optimizer again at this point. +  PM.add(createInstructionCombiningPass()); // Initial cleanup +  PM.add(createCFGSimplificationPass()); // if-convert +  PM.add(createSCCPPass()); // Propagate exposed constants +  PM.add(createInstructionCombiningPass()); // Clean up again +  PM.add(createBitTrackingDCEPass()); +    // More scalar chains could be vectorized due to more alias information    if (RunSLPAfterLoopVectorization)      if (SLPVectorize) @@ -524,6 +604,9 @@ void PassManagerBuilder::addLateLTOOptimizationPasses(    // Delete basic blocks, which optimization passes may have killed.    PM.add(createCFGSimplificationPass()); +  // Drop bodies of available externally objects to improve GlobalDCE. +  PM.add(createEliminateAvailableExternallyPass()); +    // Now that we have optimized the program, discard unreachable functions.    PM.add(createGlobalDCEPass()); @@ -543,6 +626,10 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {    if (OptLevel > 1)      addLTOOptimizationPasses(PM); +  // Create a function that performs CFI checks for cross-DSO calls with targets +  // in the current module. +  PM.add(createCrossDSOCFIPass()); +    // Lower bit sets to globals. This pass supports Clang's control flow    // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI    // is enabled. The pass does nothing if CFI is disabled. 
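The PassManagerBuilder hunks above reorder the -O2 module pipeline (early GlobalsAA, ForceFunctionAttrs/InferFunctionAttrs, the relocated EliminateAvailableExternallyPass) and extend the LTO pipeline with function importing and the CFI-related passes. As a point of reference, a minimal sketch of how a client drives this builder through the legacy pass manager follows; the wrapper name runMiddleEndPipeline is invented for illustration and is not part of this commit.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    // Illustrative only: build and run the standard -O2 middle-end pipeline
    // on a module. When PrepareForLTO is true, populateModulePassManager()
    // skips EliminateAvailableExternallyPass so available_externally bodies
    // survive for link-time inlining (see the hunk above).
    static void runMiddleEndPipeline(llvm::Module &M, bool PrepareForLTO) {
      llvm::PassManagerBuilder PMB;
      PMB.OptLevel = 2;
      PMB.SizeLevel = 0;
      PMB.PrepareForLTO = PrepareForLTO;
      PMB.Inliner =
          llvm::createFunctionInliningPass(PMB.OptLevel, PMB.SizeLevel);

      llvm::legacy::FunctionPassManager FPM(&M);
      llvm::legacy::PassManager MPM;
      PMB.populateFunctionPassManager(FPM);
      PMB.populateModulePassManager(MPM);

      FPM.doInitialization();
      for (llvm::Function &F : M)
        FPM.run(F);
      FPM.doFinalization();
      MPM.run(M);
    }

Note that createCrossDSOCFIPass() and the bit-set lowering are added only in populateLTOPassManager(), so they take effect at link time, consistent with the comment that the pass does nothing unless CFI (-fsanitize=cfi*) is enabled.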
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index b2f1010c9a07..3af4afb903fe 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@  #include "llvm/Support/raw_ostream.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/Function.h" @@ -153,21 +153,16 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {    // If the SCC doesn't unwind or doesn't throw, note this fact.    if (!SCCMightUnwind || !SCCMightReturn)      for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { -      AttrBuilder NewAttributes; - -      if (!SCCMightUnwind) -        NewAttributes.addAttribute(Attribute::NoUnwind); -      if (!SCCMightReturn) -        NewAttributes.addAttribute(Attribute::NoReturn); -        Function *F = (*I)->getFunction(); -      const AttributeSet &PAL = F->getAttributes().getFnAttributes(); -      const AttributeSet &NPAL = AttributeSet::get( -          F->getContext(), AttributeSet::FunctionIndex, NewAttributes); -      if (PAL != NPAL) { +      if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) { +        F->addFnAttr(Attribute::NoUnwind); +        MadeChange = true; +      } + +      if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) { +        F->addFnAttr(Attribute::NoReturn);          MadeChange = true; -        F->addAttributes(AttributeSet::FunctionIndex, NPAL);        }      } @@ -191,9 +186,13 @@ bool PruneEH::SimplifyFunction(Function *F) {    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {      if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))        if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) { -        SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); +        SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); +        SmallVector<OperandBundleDef, 1> OpBundles; +        II->getOperandBundlesAsDefs(OpBundles); +          // Insert a call instruction before the invoke. -        CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); +        CallInst *Call = CallInst::Create(II->getCalledValue(), Args, OpBundles, +                                          "", II);          Call->takeName(II);          Call->setCallingConv(II->getCallingConv());          Call->setAttributes(II->getAttributes()); @@ -233,7 +232,7 @@ bool PruneEH::SimplifyFunction(Function *F) {            // Remove the uncond branch and add an unreachable.            BB->getInstList().pop_back(); -          new UnreachableInst(BB->getContext(), BB); +          new UnreachableInst(BB->getContext(), &*BB);            DeleteBasicBlock(New);  // Delete the new BB.            
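[Editorial aside, not part of the diff.] The SimplifyFunction change above is the usual invoke-to-call lowering, now carrying operand bundles across. A standalone sketch of that conversion, using the same calls as the hunk:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

// Replace an invoke that cannot unwind with an equivalent call, preserving
// arguments, operand bundles, calling convention and attributes.
static llvm::CallInst *lowerInvokeToCall(llvm::InvokeInst *II) {
  llvm::SmallVector<llvm::Value *, 8> Args(II->arg_begin(), II->arg_end());
  llvm::SmallVector<llvm::OperandBundleDef, 1> OpBundles;
  II->getOperandBundlesAsDefs(OpBundles);

  llvm::CallInst *Call =
      llvm::CallInst::Create(II->getCalledValue(), Args, OpBundles, "", II);
  Call->takeName(II);
  Call->setCallingConv(II->getCallingConv());
  Call->setAttributes(II->getAttributes());
  return Call;
}
```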
MadeChange = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp index c8dfa54a4aa0..928d92ef9d12 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -22,7 +22,6 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallSet.h" @@ -44,7 +43,11 @@  #include "llvm/ProfileData/SampleProfReader.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Format.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h"  #include <cctype>  using namespace llvm; @@ -61,27 +64,51 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations(      "sample-profile-max-propagate-iterations", cl::init(100),      cl::desc("Maximum number of iterations to go through when propagating "               "sample block/edge weights through the CFG.")); +static cl::opt<unsigned> SampleProfileRecordCoverage( +    "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), +    cl::desc("Emit a warning if less than N% of records in the input profile " +             "are matched to the IR.")); +static cl::opt<unsigned> SampleProfileSampleCoverage( +    "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), +    cl::desc("Emit a warning if less than N% of samples in the input profile " +             "are matched to the IR.")); +static cl::opt<double> SampleProfileHotThreshold( +    "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), +    cl::desc("Inlined functions that account for more than N% of all samples " +             "collected in the parent function, will be inlined again.")); +static cl::opt<double> SampleProfileGlobalHotThreshold( +    "sample-profile-global-hot-threshold", cl::init(30), cl::value_desc("N"), +    cl::desc("Top-level functions that account for more than N% of all samples " +             "collected in the profile, will be marked as hot for the inliner " +             "to consider.")); +static cl::opt<double> SampleProfileGlobalColdThreshold( +    "sample-profile-global-cold-threshold", cl::init(0.5), cl::value_desc("N"), +    cl::desc("Top-level functions that account for less than N% of all samples " +             "collected in the profile, will be marked as cold for the inliner " +             "to consider."));  namespace { -typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; -typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; -typedef std::pair<BasicBlock *, BasicBlock *> Edge; -typedef DenseMap<Edge, unsigned> EdgeWeightMap; -typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; +typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; +typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; +typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; +typedef DenseMap<Edge, uint64_t> EdgeWeightMap; +typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> +    BlockEdgeMap;  /// \brief Sample profile pass.  ///  /// This pass reads profile data from the file specified by  /// -sample-profile-file and annotates every affected function with the  /// profile information found in that file. 
-class SampleProfileLoader : public FunctionPass { +class SampleProfileLoader : public ModulePass {  public:    // Class identification, replacement for typeinfo    static char ID;    SampleProfileLoader(StringRef Name = SampleProfileFile) -      : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), -        Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { +      : ModulePass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Reader(), +        Samples(nullptr), Filename(Name), ProfileIsValid(false), +        TotalCollectedSamples(0) {      initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());    } @@ -91,36 +118,37 @@ public:    const char *getPassName() const override { return "Sample profile pass"; } -  bool runOnFunction(Function &F) override; +  bool runOnModule(Module &M) override;    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.setPreservesCFG(); -    AU.addRequired<LoopInfoWrapperPass>(); -    AU.addRequired<DominatorTreeWrapperPass>(); -    AU.addRequired<PostDominatorTree>();    }  protected: +  bool runOnFunction(Function &F);    unsigned getFunctionLoc(Function &F);    bool emitAnnotations(Function &F); -  unsigned getInstWeight(Instruction &I); -  unsigned getBlockWeight(BasicBlock *BB); +  ErrorOr<uint64_t> getInstWeight(const Instruction &I) const; +  ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB) const; +  const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const; +  const FunctionSamples *findFunctionSamples(const Instruction &I) const; +  bool inlineHotFunctions(Function &F); +  bool emitInlineHints(Function &F);    void printEdgeWeight(raw_ostream &OS, Edge E); -  void printBlockWeight(raw_ostream &OS, BasicBlock *BB); -  void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); +  void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; +  void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);    bool computeBlockWeights(Function &F);    void findEquivalenceClasses(Function &F);    void findEquivalencesFor(BasicBlock *BB1,                             SmallVector<BasicBlock *, 8> Descendants,                             DominatorTreeBase<BasicBlock> *DomTree);    void propagateWeights(Function &F); -  unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); +  uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);    void buildEdges(Function &F);    bool propagateThroughEdges(Function &F); - -  /// \brief Line number for the function header. Used to compute absolute -  /// line numbers from the relative line numbers found in the profile. -  unsigned HeaderLineno; +  void computeDominanceAndLoopInfo(Function &F); +  unsigned getOffset(unsigned L, unsigned H) const; +  void clearFunctionData();    /// \brief Map basic blocks to their computed weights.    /// @@ -135,7 +163,7 @@ protected:    EdgeWeightMap EdgeWeights;    /// \brief Set of visited blocks during propagation. -  SmallPtrSet<BasicBlock *, 128> VisitedBlocks; +  SmallPtrSet<const BasicBlock *, 128> VisitedBlocks;    /// \brief Set of visited edges during propagation.    SmallSet<Edge, 128> VisitedEdges; @@ -149,9 +177,9 @@ protected:    EquivalenceClassMap EquivalenceClass;    /// \brief Dominance, post-dominance and loop information. 
-  DominatorTree *DT; -  PostDominatorTree *PDT; -  LoopInfo *LI; +  std::unique_ptr<DominatorTree> DT; +  std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT; +  std::unique_ptr<LoopInfo> LI;    /// \brief Predecessors for each basic block in the CFG.    BlockEdgeMap Predecessors; @@ -159,9 +187,6 @@ protected:    /// \brief Successors for each basic block in the CFG.    BlockEdgeMap Successors; -  /// \brief LLVM context holding the debug data we need. -  LLVMContext *Ctx; -    /// \brief Profile reader object.    std::unique_ptr<SampleProfileReader> Reader; @@ -173,7 +198,207 @@ protected:    /// \brief Flag indicating whether the profile input loaded successfully.    bool ProfileIsValid; + +  /// \brief Total number of samples collected in this profile. +  /// +  /// This is the sum of all the samples collected in all the functions executed +  /// at runtime. +  uint64_t TotalCollectedSamples;  }; + +class SampleCoverageTracker { +public: +  SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {} + +  bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, +                       uint32_t Discriminator, uint64_t Samples); +  unsigned computeCoverage(unsigned Used, unsigned Total) const; +  unsigned countUsedRecords(const FunctionSamples *FS) const; +  unsigned countBodyRecords(const FunctionSamples *FS) const; +  uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } +  uint64_t countBodySamples(const FunctionSamples *FS) const; +  void clear() { +    SampleCoverage.clear(); +    TotalUsedSamples = 0; +  } + +private: +  typedef std::map<LineLocation, unsigned> BodySampleCoverageMap; +  typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap> +      FunctionSamplesCoverageMap; + +  /// Coverage map for sampling records. +  /// +  /// This map keeps a record of sampling records that have been matched to +  /// an IR instruction. This is used to detect some form of staleness in +  /// profiles (see flag -sample-profile-check-coverage). +  /// +  /// Each entry in the map corresponds to a FunctionSamples instance.  This is +  /// another map that counts how many times the sample record at the +  /// given location has been used. +  FunctionSamplesCoverageMap SampleCoverage; + +  /// Number of samples used from the profile. +  /// +  /// When a sampling record is used for the first time, the samples from +  /// that record are added to this accumulator.  Coverage is later computed +  /// based on the total number of samples available in this function and +  /// its callsites. +  /// +  /// Note that this accumulator tracks samples used from a single function +  /// and all the inlined callsites. Strictly, we should have a map of counters +  /// keyed by FunctionSamples pointers, but these stats are cleared after +  /// every function, so we just need to keep a single counter. +  uint64_t TotalUsedSamples; +}; + +SampleCoverageTracker CoverageTracker; + +/// Return true if the given callsite is hot wrt to its caller. +/// +/// Functions that were inlined in the original binary will be represented +/// in the inline stack in the sample profile. If the profile shows that +/// the original inline decision was "good" (i.e., the callsite is executed +/// frequently), then we will recreate the inline decision and apply the +/// profile from the inlined callsite. +/// +/// To decide whether an inlined callsite is hot, we compute the fraction +/// of samples used by the callsite with respect to the total number of samples +/// collected in the caller. 
+/// +/// If that fraction is larger than the default given by +/// SampleProfileHotThreshold, the callsite will be inlined again. +bool callsiteIsHot(const FunctionSamples *CallerFS, +                   const FunctionSamples *CallsiteFS) { +  if (!CallsiteFS) +    return false; // The callsite was not inlined in the original binary. + +  uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); +  if (ParentTotalSamples == 0) +    return false; // Avoid division by zero. + +  uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); +  if (CallsiteTotalSamples == 0) +    return false; // Callsite is trivially cold. + +  double PercentSamples = +      (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; +  return PercentSamples >= SampleProfileHotThreshold; +} + +} + +/// Mark as used the sample record for the given function samples at +/// (LineOffset, Discriminator). +/// +/// \returns true if this is the first time we mark the given record. +bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, +                                            uint32_t LineOffset, +                                            uint32_t Discriminator, +                                            uint64_t Samples) { +  LineLocation Loc(LineOffset, Discriminator); +  unsigned &Count = SampleCoverage[FS][Loc]; +  bool FirstTime = (++Count == 1); +  if (FirstTime) +    TotalUsedSamples += Samples; +  return FirstTime; +} + +/// Return the number of sample records that were applied from this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { +  auto I = SampleCoverage.find(FS); + +  // The size of the coverage map for FS represents the number of records +  // that were marked used at least once. +  unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0; + +  // If there are inlined callsites in this function, count the samples found +  // in the respective bodies. However, do not bother counting callees with 0 +  // total samples, these are callees that were never invoked at runtime. +  for (const auto &I : FS->getCallsiteSamples()) { +    const FunctionSamples *CalleeSamples = &I.second; +    if (callsiteIsHot(FS, CalleeSamples)) +      Count += countUsedRecords(CalleeSamples); +  } + +  return Count; +} + +/// Return the number of sample records in the body of this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { +  unsigned Count = FS->getBodySamples().size(); + +  // Only count records in hot callsites. +  for (const auto &I : FS->getCallsiteSamples()) { +    const FunctionSamples *CalleeSamples = &I.second; +    if (callsiteIsHot(FS, CalleeSamples)) +      Count += countBodyRecords(CalleeSamples); +  } + +  return Count; +} + +/// Return the number of samples collected in the body of this profile. +/// +/// This count does not include samples from cold inlined callsites. +uint64_t +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { +  uint64_t Total = 0; +  for (const auto &I : FS->getBodySamples()) +    Total += I.second.getSamples(); + +  // Only count samples in hot callsites. 
+  for (const auto &I : FS->getCallsiteSamples()) { +    const FunctionSamples *CalleeSamples = &I.second; +    if (callsiteIsHot(FS, CalleeSamples)) +      Total += countBodySamples(CalleeSamples); +  } + +  return Total; +} + +/// Return the fraction of sample records used in this profile. +/// +/// The returned value is an unsigned integer in the range 0-100 indicating +/// the percentage of sample records that were used while applying this +/// profile to the associated function. +unsigned SampleCoverageTracker::computeCoverage(unsigned Used, +                                                unsigned Total) const { +  assert(Used <= Total && +         "number of used records cannot exceed the total number of records"); +  return Total > 0 ? Used * 100 / Total : 100; +} + +/// Clear all the per-function data used to load samples and propagate weights. +void SampleProfileLoader::clearFunctionData() { +  BlockWeights.clear(); +  EdgeWeights.clear(); +  VisitedBlocks.clear(); +  VisitedEdges.clear(); +  EquivalenceClass.clear(); +  DT = nullptr; +  PDT = nullptr; +  LI = nullptr; +  Predecessors.clear(); +  Successors.clear(); +  CoverageTracker.clear(); +} + +/// \brief Returns the offset of lineno \p L to head_lineno \p H +/// +/// \param L  Lineno +/// \param H  Header lineno of the function +/// +/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// We assume that a single function will not exceed 65535 LOC. +unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { +  return (L - H) & 0xffff;  }  /// \brief Print the weight of edge \p E on stream \p OS. @@ -190,8 +415,8 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {  /// \param OS  Stream to emit the output to.  /// \param BB  Block to print.  void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, -                                                BasicBlock *BB) { -  BasicBlock *Equiv = EquivalenceClass[BB]; +                                                const BasicBlock *BB) { +  const BasicBlock *Equiv = EquivalenceClass[BB];    OS << "equivalence[" << BB->getName()       << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";  } @@ -200,8 +425,11 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,  ///  /// \param OS  Stream to emit the output to.  /// \param BB  Block to print. -void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { -  OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +void SampleProfileLoader::printBlockWeight(raw_ostream &OS, +                                           const BasicBlock *BB) const { +  const auto &I = BlockWeights.find(BB); +  uint64_t W = (I == BlockWeights.end() ? 0 : I->second); +  OS << "weight[" << BB->getName() << "]: " << W << "\n";  }  /// \brief Get the weight for an instruction. @@ -214,51 +442,67 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {  ///  /// \param Inst Instruction to query.  /// -/// \returns The profiled weight of I. -unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { +/// \returns the weight of \p Inst. 
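[Editorial aside, not part of the diff.] For readers skimming the new SampleProfileLoader plumbing, the two small helpers above reduce to simple arithmetic. A self-contained check with made-up numbers (the values are assumptions, not from any real profile):

```cpp
#include <cassert>
#include <cstdio>

static unsigned getOffset(unsigned Lineno, unsigned HeaderLineno) {
  return (Lineno - HeaderLineno) & 0xffff; // assumes functions < 65536 LOC
}

static unsigned computeCoverage(unsigned Used, unsigned Total) {
  assert(Used <= Total);
  return Total > 0 ? Used * 100 / Total : 100;
}

int main() {
  // A call on source line 112 of a function whose DISubprogram starts at
  // line 100 is keyed by offset 12 in the profile.
  std::printf("offset = %u\n", getOffset(112, 100));          // 12
  // 37 of 40 body records matched -> 92%; a threshold of 95 would warn.
  std::printf("coverage = %u%%\n", computeCoverage(37, 40));  // 92
}
```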
+ErrorOr<uint64_t> +SampleProfileLoader::getInstWeight(const Instruction &Inst) const {    DebugLoc DLoc = Inst.getDebugLoc();    if (!DLoc) -    return 0; +    return std::error_code(); -  unsigned Lineno = DLoc.getLine(); -  if (Lineno < HeaderLineno) -    return 0; +  const FunctionSamples *FS = findFunctionSamples(Inst); +  if (!FS) +    return std::error_code();    const DILocation *DIL = DLoc; -  int LOffset = Lineno - HeaderLineno; -  unsigned Discriminator = DIL->getDiscriminator(); -  unsigned Weight = Samples->samplesAt(LOffset, Discriminator); -  DEBUG(dbgs() << "    " << Lineno << "." << Discriminator << ":" << Inst -               << " (line offset: " << LOffset << "." << Discriminator -               << " - weight: " << Weight << ")\n"); -  return Weight; +  unsigned Lineno = DLoc.getLine(); +  unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); + +  uint32_t LineOffset = getOffset(Lineno, HeaderLineno); +  uint32_t Discriminator = DIL->getDiscriminator(); +  ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); +  if (R) { +    bool FirstMark = +        CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); +    if (FirstMark) { +      const Function *F = Inst.getParent()->getParent(); +      LLVMContext &Ctx = F->getContext(); +      emitOptimizationRemark( +          Ctx, DEBUG_TYPE, *F, DLoc, +          Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + +              Twine(LineOffset) + +              ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")"); +    } +    DEBUG(dbgs() << "    " << Lineno << "." << DIL->getDiscriminator() << ":" +                 << Inst << " (line offset: " << Lineno - HeaderLineno << "." +                 << DIL->getDiscriminator() << " - weight: " << R.get() +                 << ")\n"); +  } +  return R;  }  /// \brief Compute the weight of a basic block.  ///  /// The weight of basic block \p BB is the maximum weight of all the -/// instructions in BB. The weight of \p BB is computed and cached in -/// the BlockWeights map. +/// instructions in BB.  ///  /// \param BB The basic block to query.  /// -/// \returns The computed weight of BB. -unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { -  // If we've computed BB's weight before, return it. -  std::pair<BlockWeightMap::iterator, bool> Entry = -      BlockWeights.insert(std::make_pair(BB, 0)); -  if (!Entry.second) -    return Entry.first->second; - -  // Otherwise, compute and cache BB's weight. -  unsigned Weight = 0; +/// \returns the weight for \p BB. +ErrorOr<uint64_t> +SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const { +  bool Found = false; +  uint64_t Weight = 0;    for (auto &I : BB->getInstList()) { -    unsigned InstWeight = getInstWeight(I); -    if (InstWeight > Weight) -      Weight = InstWeight; +    const ErrorOr<uint64_t> &R = getInstWeight(I); +    if (R && R.get() >= Weight) { +      Weight = R.get(); +      Found = true; +    }    } -  Entry.first->second = Weight; -  return Weight; +  if (Found) +    return Weight; +  else +    return std::error_code();  }  /// \brief Compute and store the weights of every basic block. 
@@ -270,15 +514,199 @@ unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) {  bool SampleProfileLoader::computeBlockWeights(Function &F) {    bool Changed = false;    DEBUG(dbgs() << "Block weights\n"); -  for (auto &BB : F) { -    unsigned Weight = getBlockWeight(&BB); -    Changed |= (Weight > 0); +  for (const auto &BB : F) { +    ErrorOr<uint64_t> Weight = getBlockWeight(&BB); +    if (Weight) { +      BlockWeights[&BB] = Weight.get(); +      VisitedBlocks.insert(&BB); +      Changed = true; +    }      DEBUG(printBlockWeight(dbgs(), &BB));    }    return Changed;  } +/// \brief Get the FunctionSamples for a call instruction. +/// +/// The FunctionSamples of a call instruction \p Inst is the inlined +/// instance in which that call instruction is calling to. It contains +/// all samples that resides in the inlined instance. We first find the +/// inlined instance in which the call instruction is from, then we +/// traverse its children to find the callsite with the matching +/// location and callee function name. +/// +/// \param Inst Call instruction to query. +/// +/// \returns The FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const { +  const DILocation *DIL = Inst.getDebugLoc(); +  if (!DIL) { +    return nullptr; +  } +  DISubprogram *SP = DIL->getScope()->getSubprogram(); +  if (!SP) +    return nullptr; + +  Function *CalleeFunc = Inst.getCalledFunction(); +  if (!CalleeFunc) { +    return nullptr; +  } + +  StringRef CalleeName = CalleeFunc->getName(); +  const FunctionSamples *FS = findFunctionSamples(Inst); +  if (FS == nullptr) +    return nullptr; + +  return FS->findFunctionSamplesAt( +      CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), +                       DIL->getDiscriminator(), CalleeName)); +} + +/// \brief Get the FunctionSamples for an instruction. +/// +/// The FunctionSamples of an instruction \p Inst is the inlined instance +/// in which that instruction is coming from. We traverse the inline stack +/// of that instruction, and match it with the tree nodes in the profile. +/// +/// \param Inst Instruction to query. +/// +/// \returns the FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { +  SmallVector<CallsiteLocation, 10> S; +  const DILocation *DIL = Inst.getDebugLoc(); +  if (!DIL) { +    return Samples; +  } +  StringRef CalleeName; +  for (const DILocation *DIL = Inst.getDebugLoc(); DIL; +       DIL = DIL->getInlinedAt()) { +    DISubprogram *SP = DIL->getScope()->getSubprogram(); +    if (!SP) +      return nullptr; +    if (!CalleeName.empty()) { +      S.push_back(CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), +                                   DIL->getDiscriminator(), CalleeName)); +    } +    CalleeName = SP->getLinkageName(); +  } +  if (S.size() == 0) +    return Samples; +  const FunctionSamples *FS = Samples; +  for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { +    FS = FS->findFunctionSamplesAt(S[i]); +  } +  return FS; +} + +/// \brief Emit an inline hint if \p F is globally hot or cold. +/// +/// If \p F consumes a significant fraction of samples (indicated by +/// SampleProfileGlobalHotThreshold), apply the InlineHint attribute for the +/// inliner to consider the function hot. 
+/// +/// If \p F consumes a small fraction of samples (indicated by +/// SampleProfileGlobalColdThreshold), apply the Cold attribute for the inliner +/// to consider the function cold. +/// +/// FIXME - This setting of inline hints is sub-optimal. Instead of marking a +/// function globally hot or cold, we should be annotating individual callsites. +/// This is not currently possible, but work on the inliner will eventually +/// provide this ability. See http://reviews.llvm.org/D15003 for details and +/// discussion. +/// +/// \returns True if either attribute was applied to \p F. +bool SampleProfileLoader::emitInlineHints(Function &F) { +  if (TotalCollectedSamples == 0) +    return false; + +  uint64_t FunctionSamples = Samples->getTotalSamples(); +  double SamplesPercent = +      (double)FunctionSamples / (double)TotalCollectedSamples * 100.0; + +  // If the function collected more samples than the hot threshold, mark +  // it globally hot. +  if (SamplesPercent >= SampleProfileGlobalHotThreshold) { +    F.addFnAttr(llvm::Attribute::InlineHint); +    std::string Msg; +    raw_string_ostream S(Msg); +    S << "Applied inline hint to globally hot function '" << F.getName() +      << "' with " << format("%.2f", SamplesPercent) +      << "% of samples (threshold: " +      << format("%.2f", SampleProfileGlobalHotThreshold.getValue()) << "%)"; +    S.flush(); +    emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); +    return true; +  } + +  // If the function collected fewer samples than the cold threshold, mark +  // it globally cold. +  if (SamplesPercent <= SampleProfileGlobalColdThreshold) { +    F.addFnAttr(llvm::Attribute::Cold); +    std::string Msg; +    raw_string_ostream S(Msg); +    S << "Applied cold hint to globally cold function '" << F.getName() +      << "' with " << format("%.2f", SamplesPercent) +      << "% of samples (threshold: " +      << format("%.2f", SampleProfileGlobalColdThreshold.getValue()) << "%)"; +    S.flush(); +    emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); +    return true; +  } + +  return false; +} + +/// \brief Iteratively inline hot callsites of a function. +/// +/// Iteratively traverse all callsites of the function \p F, and find if +/// the corresponding inlined instance exists and is hot in profile. If +/// it is hot enough, inline the callsites and adds new callsites of the +/// callee into the caller. +/// +/// TODO: investigate the possibility of not invoking InlineFunction directly. +/// +/// \param F function to perform iterative inlining. +/// +/// \returns True if there is any inline happened. 
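[Editorial aside, not part of the diff.] The inlining heuristics described above are all plain percentages. A worked example with invented sample counts; the counts are assumptions, while the defaults 0.1, 30 and 0.5 come from the cl::opts earlier in this file:

```cpp
#include <cstdio>

int main() {
  // callsiteIsHot: inlined-instance samples vs. the caller's total samples.
  double CallerTotal = 20000, CallsiteTotal = 600;
  double PercentSamples = CallsiteTotal / CallerTotal * 100.0; // 3.0
  std::printf("callsite: %.2f%% -> %s\n", PercentSamples,
              PercentSamples >= 0.1 ? "inline again" : "leave alone");

  // emitInlineHints: this function's samples vs. all samples in the profile.
  double TotalCollected = 1000000, FunctionSamples = 400000;
  double GlobalPercent = FunctionSamples / TotalCollected * 100.0; // 40.0
  if (GlobalPercent >= 30.0)       // SampleProfileGlobalHotThreshold default
    std::puts("globally hot  -> add InlineHint");
  else if (GlobalPercent <= 0.5)   // SampleProfileGlobalColdThreshold default
    std::puts("globally cold -> add Cold");
}
```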
+bool SampleProfileLoader::inlineHotFunctions(Function &F) { +  bool Changed = false; +  LLVMContext &Ctx = F.getContext(); +  while (true) { +    bool LocalChanged = false; +    SmallVector<CallInst *, 10> CIS; +    for (auto &BB : F) { +      for (auto &I : BB.getInstList()) { +        CallInst *CI = dyn_cast<CallInst>(&I); +        if (CI && callsiteIsHot(Samples, findCalleeFunctionSamples(*CI))) +          CIS.push_back(CI); +      } +    } +    for (auto CI : CIS) { +      InlineFunctionInfo IFI; +      Function *CalledFunction = CI->getCalledFunction(); +      DebugLoc DLoc = CI->getDebugLoc(); +      uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples(); +      if (InlineFunction(CI, IFI)) { +        LocalChanged = true; +        emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, +                               Twine("inlined hot callee '") + +                                   CalledFunction->getName() + "' with " + +                                   Twine(NumSamples) + " samples into '" + +                                   F.getName() + "'"); +      } +    } +    if (LocalChanged) { +      Changed = true; +    } else { +      break; +    } +  } +  return Changed; +} +  /// \brief Find equivalence classes for the given block.  ///  /// This finds all the blocks that are guaranteed to execute the same @@ -305,12 +733,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {  void SampleProfileLoader::findEquivalencesFor(      BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,      DominatorTreeBase<BasicBlock> *DomTree) { -  for (auto *BB2 : Descendants) { +  const BasicBlock *EC = EquivalenceClass[BB1]; +  uint64_t Weight = BlockWeights[EC]; +  for (const auto *BB2 : Descendants) {      bool IsDomParent = DomTree->dominates(BB2, BB1);      bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); -    if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent && -        IsInSameLoop) { -      EquivalenceClass[BB2] = BB1; +    if (BB1 != BB2 && IsDomParent && IsInSameLoop) { +      EquivalenceClass[BB2] = EC;        // If BB2 is heavier than BB1, make BB2 have the same weight        // as BB1. @@ -320,11 +749,10 @@ void SampleProfileLoader::findEquivalencesFor(        // during the propagation phase. Right now, we just want to        // make sure that BB1 has the largest weight of all the        // members of its equivalence set. -      unsigned &BB1Weight = BlockWeights[BB1]; -      unsigned &BB2Weight = BlockWeights[BB2]; -      BB1Weight = std::max(BB1Weight, BB2Weight); +      Weight = std::max(Weight, BlockWeights[BB2]);      }    } +  BlockWeights[EC] = Weight;  }  /// \brief Find equivalence classes. @@ -364,19 +792,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {      // class by making BB2's equivalence class be BB1.      DominatedBBs.clear();      DT->getDescendants(BB1, DominatedBBs); -    findEquivalencesFor(BB1, DominatedBBs, PDT->DT); - -    // Repeat the same logic for all the blocks post-dominated by BB1. -    // We are looking for every basic block BB2 such that: -    // -    // 1- BB1 post-dominates BB2. -    // 2- BB2 dominates BB1. -    // 3- BB1 and BB2 are in the same loop nest. -    // -    // If all those conditions hold, BB2's equivalence class is BB1. 
-    DominatedBBs.clear(); -    PDT->getDescendants(BB1, DominatedBBs); -    findEquivalencesFor(BB1, DominatedBBs, DT); +    findEquivalencesFor(BB1, DominatedBBs, PDT.get());      DEBUG(printBlockEquivalence(dbgs(), BB1));    } @@ -389,8 +805,8 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {    // to all the blocks in that equivalence class.    DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");    for (auto &BI : F) { -    BasicBlock *BB = &BI; -    BasicBlock *EquivBB = EquivalenceClass[BB]; +    const BasicBlock *BB = &BI; +    const BasicBlock *EquivBB = EquivalenceClass[BB];      if (BB != EquivBB)        BlockWeights[BB] = BlockWeights[EquivBB];      DEBUG(printBlockWeight(dbgs(), BB)); @@ -407,7 +823,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) {  /// \param UnknownEdge  Set if E has not been visited before.  ///  /// \returns E's weight, if known. Otherwise, return 0. -unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, +uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,                                          Edge *UnknownEdge) {    if (!VisitedEdges.count(E)) {      (*NumUnknownEdges)++; @@ -432,8 +848,9 @@ unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,  bool SampleProfileLoader::propagateThroughEdges(Function &F) {    bool Changed = false;    DEBUG(dbgs() << "\nPropagation through edges\n"); -  for (auto &BI : F) { -    BasicBlock *BB = &BI; +  for (const auto &BI : F) { +    const BasicBlock *BB = &BI; +    const BasicBlock *EC = EquivalenceClass[BB];      // Visit all the predecessor and successor edges to determine      // which ones have a weight assigned already. Note that it doesn't @@ -441,7 +858,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {      // only case we are interested in handling is when only a single      // edge is unknown (see setEdgeOrBlockWeight).      for (unsigned i = 0; i < 2; i++) { -      unsigned TotalWeight = 0; +      uint64_t TotalWeight = 0;        unsigned NumUnknownEdges = 0;        Edge UnknownEdge, SelfReferentialEdge; @@ -485,7 +902,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {        // all edges will get a weight, or iteration will stop when        // it reaches SampleProfileMaxPropagateIterations.        if (NumUnknownEdges <= 1) { -        unsigned &BBWeight = BlockWeights[BB]; +        uint64_t &BBWeight = BlockWeights[EC];          if (NumUnknownEdges == 0) {            // If we already know the weight of all edges, the weight of the            // basic block can be computed. It should be no larger than the sum @@ -497,9 +914,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {                           << " known. Set weight for block: ";                    printBlockWeight(dbgs(), BB););            } -          if (VisitedBlocks.insert(BB).second) +          if (VisitedBlocks.insert(EC).second)              Changed = true; -        } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { +        } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {            // If there is a single unknown edge and the block has been            // visited, then we can compute E's weight.            
if (BBWeight >= TotalWeight) @@ -511,8 +928,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) {            DEBUG(dbgs() << "Set weight for edge: ";                  printEdgeWeight(dbgs(), UnknownEdge));          } -      } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { -        unsigned &BBWeight = BlockWeights[BB]; +      } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) { +        uint64_t &BBWeight = BlockWeights[BB];          // We have a self-referential edge and the weight of BB is known.          if (BBWeight >= TotalWeight)            EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; @@ -578,7 +995,7 @@ void SampleProfileLoader::buildEdges(Function &F) {  ///   known).  void SampleProfileLoader::propagateWeights(Function &F) {    bool Changed = true; -  unsigned i = 0; +  unsigned I = 0;    // Add an entry count to the function using the samples gathered    // at the function entry. @@ -592,14 +1009,15 @@ void SampleProfileLoader::propagateWeights(Function &F) {    buildEdges(F);    // Propagate until we converge or we go past the iteration limit. -  while (Changed && i++ < SampleProfileMaxPropagateIterations) { +  while (Changed && I++ < SampleProfileMaxPropagateIterations) {      Changed = propagateThroughEdges(F);    }    // Generate MD_prof metadata for every branch instruction using the    // edge weights computed during propagation.    DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); -  MDBuilder MDB(F.getContext()); +  LLVMContext &Ctx = F.getContext(); +  MDBuilder MDB(Ctx);    for (auto &BI : F) {      BasicBlock *BB = &BI;      TerminatorInst *TI = BB->getTerminator(); @@ -610,24 +1028,44 @@ void SampleProfileLoader::propagateWeights(Function &F) {      DEBUG(dbgs() << "\nGetting weights for branch at line "                   << TI->getDebugLoc().getLine() << ".\n"); -    SmallVector<unsigned, 4> Weights; -    bool AllWeightsZero = true; +    SmallVector<uint32_t, 4> Weights; +    uint32_t MaxWeight = 0; +    DebugLoc MaxDestLoc;      for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {        BasicBlock *Succ = TI->getSuccessor(I);        Edge E = std::make_pair(BB, Succ); -      unsigned Weight = EdgeWeights[E]; +      uint64_t Weight = EdgeWeights[E];        DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); -      Weights.push_back(Weight); -      if (Weight != 0) -        AllWeightsZero = false; +      // Use uint32_t saturated arithmetic to adjust the incoming weights, +      // if needed. Sample counts in profiles are 64-bit unsigned values, +      // but internally branch weights are expressed as 32-bit values. +      if (Weight > std::numeric_limits<uint32_t>::max()) { +        DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); +        Weight = std::numeric_limits<uint32_t>::max(); +      } +      Weights.push_back(static_cast<uint32_t>(Weight)); +      if (Weight != 0) { +        if (Weight > MaxWeight) { +          MaxWeight = Weight; +          MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc(); +        } +      }      }      // Only set weights if there is at least one non-zero weight.      // In any other case, let the analyzer set weights. -    if (!AllWeightsZero) { +    if (MaxWeight > 0) {        DEBUG(dbgs() << "SUCCESS. 
Found non-zero weights.\n");        TI->setMetadata(llvm::LLVMContext::MD_prof,                        MDB.createBranchWeights(Weights)); +      DebugLoc BranchLoc = TI->getDebugLoc(); +      emitOptimizationRemark( +          Ctx, DEBUG_TYPE, F, MaxDestLoc, +          Twine("most popular destination for conditional branches at ") + +              ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" + +                                   Twine(BranchLoc.getLine()) + ":" + +                                   Twine(BranchLoc.getCol())) +                           : Twine("<UNKNOWN LOCATION>")));      } else {        DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");      } @@ -649,7 +1087,7 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) {    if (DISubprogram *S = getDISubprogram(&F))      return S->getLine(); -  // If could not find the start of \p F, emit a diagnostic to inform the user +  // If the start of \p F is missing, emit a diagnostic to inform the user    // about the missed opportunity.    F.getContext().diagnose(DiagnosticInfoSampleProfile(        "No debug information found in function " + F.getName() + @@ -658,6 +1096,17 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) {    return 0;  } +void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { +  DT.reset(new DominatorTree); +  DT->recalculate(F); + +  PDT.reset(new DominatorTreeBase<BasicBlock>(true)); +  PDT->recalculate(F); + +  LI.reset(new LoopInfo); +  LI->analyze(*DT); +} +  /// \brief Generate branch weight metadata for all branches in \p F.  ///  /// Branch weights are computed out of instruction samples using a @@ -710,18 +1159,23 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) {  bool SampleProfileLoader::emitAnnotations(Function &F) {    bool Changed = false; -  // Initialize invariants used during computation and propagation. -  HeaderLineno = getFunctionLoc(F); -  if (HeaderLineno == 0) +  if (getFunctionLoc(F) == 0)      return false;    DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() -               << ": " << HeaderLineno << "\n"); +               << ": " << getFunctionLoc(F) << "\n"); + +  Changed |= emitInlineHints(F); + +  Changed |= inlineHotFunctions(F);    // Compute basic block weights.    Changed |= computeBlockWeights(F);    if (Changed) { +    // Compute dominance and loop info needed for propagation. +    computeDominanceAndLoopInfo(F); +      // Find equivalence classes.      findEquivalenceClasses(F); @@ -729,24 +1183,48 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {      propagateWeights(F);    } +  // If coverage checking was requested, compute it now. 
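[Editorial aside, not part of the diff.] Two numeric details of the propagation code above are easy to miss: an unknown edge weight is derived by subtraction and clamped at zero, and 64-bit sample counts are saturated into 32-bit branch weights. A quick illustration with made-up numbers:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // Block weight 1000 with known outgoing edges {700, 180}: the single
  // unknown edge receives the remainder, never going negative.
  uint64_t BBWeight = 1000, TotalKnown = 700 + 180;
  uint64_t Unknown = BBWeight >= TotalKnown ? BBWeight - TotalKnown : 0;
  std::printf("unknown edge = %llu\n", (unsigned long long)Unknown);  // 120

  // A 2^33-sample edge cannot be encoded in MD_prof, so it is clamped.
  uint64_t Samples = 1ULL << 33;
  uint32_t Weight = (uint32_t)std::min<uint64_t>(
      Samples, std::numeric_limits<uint32_t>::max());
  std::printf("branch weight = %u\n", Weight);  // 4294967295
}
```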
+  if (SampleProfileRecordCoverage) { +    unsigned Used = CoverageTracker.countUsedRecords(Samples); +    unsigned Total = CoverageTracker.countBodyRecords(Samples); +    unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); +    if (Coverage < SampleProfileRecordCoverage) { +      F.getContext().diagnose(DiagnosticInfoSampleProfile( +          getDISubprogram(&F)->getFilename(), getFunctionLoc(F), +          Twine(Used) + " of " + Twine(Total) + " available profile records (" + +              Twine(Coverage) + "%) were applied", +          DS_Warning)); +    } +  } + +  if (SampleProfileSampleCoverage) { +    uint64_t Used = CoverageTracker.getTotalUsedSamples(); +    uint64_t Total = CoverageTracker.countBodySamples(Samples); +    unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); +    if (Coverage < SampleProfileSampleCoverage) { +      F.getContext().diagnose(DiagnosticInfoSampleProfile( +          getDISubprogram(&F)->getFilename(), getFunctionLoc(F), +          Twine(Used) + " of " + Twine(Total) + " available profile samples (" + +              Twine(Coverage) + "%) were applied", +          DS_Warning)); +    } +  }    return Changed;  }  char SampleProfileLoader::ID = 0;  INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile",                        "Sample Profile loader", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(AddDiscriminators)  INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",                      "Sample Profile loader", false, false)  bool SampleProfileLoader::doInitialization(Module &M) { -  auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); +  auto &Ctx = M.getContext(); +  auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx);    if (std::error_code EC = ReaderOrErr.getError()) {      std::string Msg = "Could not open profile: " + EC.message(); -    M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); +    Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));      return false;    }    Reader = std::move(ReaderOrErr.get()); @@ -754,22 +1232,32 @@ bool SampleProfileLoader::doInitialization(Module &M) {    return true;  } -FunctionPass *llvm::createSampleProfileLoaderPass() { +ModulePass *llvm::createSampleProfileLoaderPass() {    return new SampleProfileLoader(SampleProfileFile);  } -FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { +ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {    return new SampleProfileLoader(Name);  } -bool SampleProfileLoader::runOnFunction(Function &F) { +bool SampleProfileLoader::runOnModule(Module &M) {    if (!ProfileIsValid)      return false; -  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  PDT = &getAnalysis<PostDominatorTree>(); -  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  Ctx = &F.getParent()->getContext(); +  // Compute the total number of samples collected in this profile. 
+  for (const auto &I : Reader->getProfiles()) +    TotalCollectedSamples += I.second.getTotalSamples(); + +  bool retval = false; +  for (auto &F : M) +    if (!F.isDeclaration()) { +      clearFunctionData(); +      retval |= runOnFunction(F); +    } +  return retval; +} + +bool SampleProfileLoader::runOnFunction(Function &F) {    Samples = Reader->getSamplesFor(F);    if (!Samples->empty())      return emitAnnotations(F); diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 956991ad1f95..c94cc7c74a89 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -7,47 +7,31 @@  //  //===----------------------------------------------------------------------===//  // -// This pass loops over all of the functions in the input module, looking for  +// This pass loops over all of the functions in the input module, looking for  // dead declarations and removes them. Dead declarations are declarations of  // functions for which no implementation is available (i.e., declarations for  // unused library functions).  //  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/IR/Module.h"  #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +  using namespace llvm;  #define DEBUG_TYPE "strip-dead-prototypes"  STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); -namespace { - -/// @brief Pass to remove unused function declarations. -class StripDeadPrototypesPass : public ModulePass { -public: -  static char ID; // Pass identification, replacement for typeid -  StripDeadPrototypesPass() : ModulePass(ID) { -    initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry()); -  } -  bool runOnModule(Module &M) override; -}; - -} // end anonymous namespace - -char StripDeadPrototypesPass::ID = 0; -INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes", -                "Strip Unused Function Prototypes", false, false) - -bool StripDeadPrototypesPass::runOnModule(Module &M) { +static bool stripDeadPrototypes(Module &M) {    bool MadeChange = false; -   +    // Erase dead function prototypes.    for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { -    Function *F = I++; +    Function *F = &*I++;      // Function must be a prototype and unused.      if (F->isDeclaration() && F->use_empty()) {        F->eraseFromParent(); @@ -59,16 +43,42 @@ bool StripDeadPrototypesPass::runOnModule(Module &M) {    // Erase dead global var prototypes.    for (Module::global_iterator I = M.global_begin(), E = M.global_end();         I != E; ) { -    GlobalVariable *GV = I++; +    GlobalVariable *GV = &*I++;      // Global must be a prototype and unused.      if (GV->isDeclaration() && GV->use_empty())        GV->eraseFromParent();    } -   +    // Return an indication of whether we changed anything or not.    
return MadeChange;  } +PreservedAnalyses StripDeadPrototypesPass::run(Module &M) { +  if (stripDeadPrototypes(M)) +    return PreservedAnalyses::none(); +  return PreservedAnalyses::all(); +} + +namespace { + +class StripDeadPrototypesLegacyPass : public ModulePass { +public: +  static char ID; // Pass identification, replacement for typeid +  StripDeadPrototypesLegacyPass() : ModulePass(ID) { +    initializeStripDeadPrototypesLegacyPassPass( +        *PassRegistry::getPassRegistry()); +  } +  bool runOnModule(Module &M) override { +    return stripDeadPrototypes(M); +  } +}; + +} // end anonymous namespace + +char StripDeadPrototypesLegacyPass::ID = 0; +INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes", +                "Strip Unused Function Prototypes", false, false) +  ModulePass *llvm::createStripDeadPrototypesPass() { -  return new StripDeadPrototypesPass(); +  return new StripDeadPrototypesLegacyPass();  } diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index a4f30c58f936..46f352f7f9f1 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -211,13 +211,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {    for (Module::global_iterator I = M.global_begin(), E = M.global_end();         I != E; ++I) { -    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) +    if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)        if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))          I->setName("");     // Internal symbols can't participate in linkage    }    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { -    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) +    if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)        if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))          I->setName("");     // Internal symbols can't participate in linkage      StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); @@ -305,6 +305,12 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {    SmallVector<Metadata *, 64> LiveSubprograms;    DenseSet<const MDNode *> VisitedSet; +  std::set<DISubprogram *> LiveSPs; +  for (Function &F : M) { +    if (DISubprogram *SP = F.getSubprogram()) +      LiveSPs.insert(SP); +  } +    for (DICompileUnit *DIC : F.compile_units()) {      // Create our live subprogram list.      bool SubprogramChange = false; @@ -314,7 +320,7 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {          continue;        // If the function referenced by DISP is not null, the function is live. 
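[Editorial note, not part of the commit.] The StripDeadPrototypes rewrite just above follows the now-common pattern of one static worker shared between a new-pass-manager class and a thin legacy ModulePass wrapper. In condensed, compilable form (names and the trivial worker body are illustrative only):

```cpp
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"

// Shared worker; the real body is stripDeadPrototypes() shown above.
static bool stripDeadPrototypesImpl(llvm::Module &M) { return false; }

// New pass manager interface: report whether analyses were invalidated.
struct StripDeadPrototypesExamplePass {
  llvm::PreservedAnalyses run(llvm::Module &M) {
    return stripDeadPrototypesImpl(M) ? llvm::PreservedAnalyses::none()
                                      : llvm::PreservedAnalyses::all();
  }
};

// Legacy interface: a thin ModulePass forwarding to the same worker.
struct StripDeadPrototypesExampleLegacyPass : llvm::ModulePass {
  static char ID;
  StripDeadPrototypesExampleLegacyPass() : llvm::ModulePass(ID) {}
  bool runOnModule(llvm::Module &M) override {
    return stripDeadPrototypesImpl(M);
  }
};
char StripDeadPrototypesExampleLegacyPass::ID = 0;
```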
-      if (DISP->getFunction()) +      if (LiveSPs.count(DISP))          LiveSubprograms.push_back(DISP);        else          SubprogramChange = true; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 2d2c109f3243..6f49399f57bf 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1,4 +1,4 @@ -//===- InstCombineAddSub.cpp ----------------------------------------------===// +//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//  //  //                     The LLVM Compiler Infrastructure  // @@ -17,6 +17,7 @@  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/GetElementPtrTypeIterator.h"  #include "llvm/IR/PatternMatch.h" +  using namespace llvm;  using namespace PatternMatch; @@ -67,17 +68,17 @@ namespace {    private:      bool insaneIntVal(int V) { return V > 4 || V < -4; } -    APFloat *getFpValPtr(void) +    APFloat *getFpValPtr()        { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } -    const APFloat *getFpValPtr(void) const +    const APFloat *getFpValPtr() const        { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); } -    const APFloat &getFpVal(void) const { +    const APFloat &getFpVal() const {        assert(IsFp && BufHasFpVal && "Incorret state");        return *getFpValPtr();      } -    APFloat &getFpVal(void) { +    APFloat &getFpVal() {        assert(IsFp && BufHasFpVal && "Incorret state");        return *getFpValPtr();      } @@ -92,8 +93,8 @@ namespace {      // TODO: We should get rid of this function when APFloat can be constructed      //       from an *SIGNED* integer.      APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); -  private: +  private:      bool IsFp;      // True iff FpValBuf contains an instance of APFloat. @@ -114,10 +115,10 @@ namespace {    ///    class FAddend {    public: -    FAddend() { Val = nullptr; } +    FAddend() : Val(nullptr) {} -    Value *getSymVal (void) const { return Val; } -    const FAddendCoef &getCoef(void) const { return Coeff; } +    Value *getSymVal() const { return Val; } +    const FAddendCoef &getCoef() const { return Coeff; }      bool isConstant() const { return Val == nullptr; }      bool isZero() const { return Coeff.isZero(); } @@ -182,7 +183,6 @@ namespace {      InstCombiner::BuilderTy *Builder;      Instruction *Instr; -  private:       // Debugging stuff are clustered here.      #ifndef NDEBUG        unsigned CreateInstrNum; @@ -193,7 +193,8 @@ namespace {        void incCreateInstNum() {}      #endif    }; -} + +} // anonymous namespace  //===----------------------------------------------------------------------===//  // @@ -602,7 +603,6 @@ Value *FAddCombine::simplify(Instruction *I) {  }  Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { -    unsigned AddendNum = Addends.size();    assert(AddendNum <= 4 && "Too many addends"); @@ -886,7 +886,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero,    return Op0ZeroPosition >= Op1OnePosition;  } -/// WillNotOverflowSignedAdd - Return true if we can prove that: +/// Return true if we can prove that:  ///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))  /// This basically requires proving that the add in the original type would not  /// overflow to change the sign bit or have a carry out. 
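[Editorial aside, not part of the diff.] The rewritten comment above states the identity that WillNotOverflowSignedAdd has to justify. A two-line numeric check (i8 widened to i32; the overflowing case assumes the usual two's-complement narrowing):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // No signed overflow in i8: sext(add i8) equals add of the sext'ed values.
  int8_t A = 100, B = 20;
  std::printf("%d == %d\n", (int32_t)(int8_t)(A + B), (int32_t)A + (int32_t)B);

  // With overflow (100 + 100 does not fit in i8) the identity breaks,
  // which is exactly what the predicate must rule out.
  int8_t C = 100, D = 100;
  std::printf("%d != %d\n", (int32_t)(int8_t)(C + D), (int32_t)C + (int32_t)D);
}
```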
@@ -1118,8 +1118,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {        // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C,        // transform them into (X + (signbit ^ C))        if (XorRHS->getValue().isSignBit()) -          return BinaryOperator::CreateAdd(XorLHS, -                                           ConstantExpr::getXor(XorRHS, CI)); +        return BinaryOperator::CreateAdd(XorLHS, +                                         ConstantExpr::getXor(XorRHS, CI));      }    } @@ -1421,7 +1421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {    return Changed ? &I : nullptr;  } -  /// Optimize pointer differences into the same array into a size.  Consider:  ///  &A[10] - &A[0]: we should compile this to "10".  LHS/RHS are the pointer  /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -1589,7 +1588,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {      }    } -    {      Value *Y;      // X-(X+Y) == -Y    X-(Y+X) == -Y @@ -1611,32 +1609,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {        return BinaryOperator::CreateAnd(A, B);    } -  // (sub (select (a, c, b)), (select (a, d, b))) -> (select (a, (sub c, d), 0)) -  // (sub (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (sub c, d))) -  if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { -    if (auto *SI1 = dyn_cast<SelectInst>(Op1)) { -      if (SI0->getCondition() == SI1->getCondition()) { -        if (Value *V = SimplifySubInst( -                SI0->getFalseValue(), SI1->getFalseValue(), I.hasNoSignedWrap(), -                I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) -          return SelectInst::Create( -              SI0->getCondition(), -              Builder->CreateSub(SI0->getTrueValue(), SI1->getTrueValue(), "", -                                 /*HasNUW=*/I.hasNoUnsignedWrap(), -                                 /*HasNSW=*/I.hasNoSignedWrap()), -              V); -        if (Value *V = SimplifySubInst(SI0->getTrueValue(), SI1->getTrueValue(), -                                       I.hasNoSignedWrap(), -                                       I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) -          return SelectInst::Create( -              SI0->getCondition(), V, -              Builder->CreateSub(SI0->getFalseValue(), SI1->getFalseValue(), "", -                                 /*HasNUW=*/I.hasNoUnsignedWrap(), -                                 /*HasNSW=*/I.hasNoSignedWrap())); -      } -    } -  } -    if (Op0->hasOneUse()) {      Value *Y = nullptr;      // ((X | Y) - X) --> (~X & Y) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 15e0889b51b7..95c50d32c820 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -37,9 +37,9 @@ static inline Value *dyn_castNotVal(Value *V) {    return nullptr;  } -/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp -/// predicate into a three bit mask. It also returns whether it is an ordered -/// predicate by reference. +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a three bit mask. It also returns whether it is an ordered predicate by +/// reference.  
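[Editorial aside, not part of the diff.] The canonicalization mentioned in the re-indented visitAdd hunk above is an identity in modular arithmetic: adding the sign bit and xor-ing it agree on the top bit and produce no carry into it. Spot-checked with assumed i8 values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t SignBit = 0x80, X = 0x37, C = 0x15;
  uint8_t Canon = (uint8_t)((X ^ SignBit) + C);    // (X ^ signbit) + C
  uint8_t Fold  = (uint8_t)(X + (SignBit ^ C));    // X + (signbit ^ C)
  std::printf("0x%02x == 0x%02x\n", Canon, Fold);  // 0xcc == 0xcc
}
```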
static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {    isOrdered = false;    switch (CC) { @@ -64,10 +64,10 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {    }  } -/// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand -/// new ICmp instruction. The sign is passed in to determine which kind -/// of predicate to use in the new icmp instruction. +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. The sign is passed in to determine which kind of predicate to +/// use in the new icmp instruction.  static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,                                InstCombiner::BuilderTy *Builder) {    ICmpInst::Predicate NewPred; @@ -76,9 +76,9 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,    return Builder->CreateICmp(NewPred, LHS, RHS);  } -/// getFCmpValue - This is the complement of getFCmpCode, which turns an -/// opcode and two operands into either a FCmp instruction. isordered is passed -/// in to determine which kind of predicate to use in the new fcmp instruction. +/// This is the complement of getFCmpCode, which turns an opcode and two +/// operands into either a FCmp instruction. isordered is passed in to determine +/// which kind of predicate to use in the new fcmp instruction.  static Value *getFCmpValue(bool isordered, unsigned code,                             Value *LHS, Value *RHS,                             InstCombiner::BuilderTy *Builder) { @@ -150,14 +150,13 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {    else //if (Op == Instruction::Xor)      BinOp = Builder->CreateXor(NewLHS, NewRHS); -  Module *M = I.getParent()->getParent()->getParent(); -  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); +  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy);    return Builder->CreateCall(F, BinOp);  } -// OptAndOp - This handles expressions of the form ((val OP C1) & C2).  Where -// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is -// guaranteed to be a binary operator. +/// This handles expressions of the form ((val OP C1) & C2).  Where +/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is +/// guaranteed to be a binary operator.  Instruction *InstCombiner::OptAndOp(Instruction *Op,                                      ConstantInt *OpRHS,                                      ConstantInt *AndRHS, @@ -341,10 +340,10 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,    return Builder->CreateICmpUGT(Add, LowerBound);  } -// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with -// any number of 0s on either side.  The 1s are allowed to wrap from LSB to -// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is -// not, since all 1s are not contiguous. +/// Returns true iff Val consists of one contiguous run of 1s with any number +/// of 0s on either side.  The 1s are allowed to wrap from LSB to MSB, +/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is +/// not, since all 1s are not contiguous.  
static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {    const APInt& V = Val->getValue();    uint32_t BitWidth = Val->getType()->getBitWidth(); @@ -357,9 +356,8 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {    return true;  } -/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, -/// where isSub determines whether the operator is a sub.  If we can fold one of -/// the following xforms: +/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines +/// whether the operator is a sub. If we can fold one of the following xforms:  ///  /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask  /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 @@ -449,8 +447,8 @@ enum MaskedICmpType {    FoldMskICmp_BMask_NotMixed          =   512  }; -/// return the set of pattern classes (from MaskedICmpType) -/// that (icmp SCC (A & B), C) satisfies +/// Return the set of pattern classes (from MaskedICmpType) +/// that (icmp SCC (A & B), C) satisfies.  static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,                                      ICmpInst::Predicate SCC)  { @@ -538,8 +536,8 @@ static unsigned conjugateICmpMask(unsigned Mask) {    return NewMask;  } -/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) -/// if possible. The returned predicate is either == or !=. Returns false if +/// Decompose an icmp into the form ((X & Y) pred Z) if possible. +/// The returned predicate is either == or !=. Returns false if  /// decomposition fails.  static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred,                                   Value *&X, Value *&Y, Value *&Z) { @@ -585,10 +583,9 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred,    return true;  } -/// foldLogOpOfMaskedICmpsHelper: -/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// return the set of pattern classes (from MaskedICmpType) -/// that both LHS and RHS satisfy +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// Return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy.  static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,                                               Value*& B, Value*& C,                                               Value*& D, Value*& E, @@ -700,9 +697,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,    unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC);    return left_type & right_type;  } -/// foldLogOpOfMaskedICmps: -/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// into a single (icmp(A & X) ==/!= Y) + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y).  static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,                                       llvm::InstCombiner::BuilderTy *Builder) {    Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; @@ -879,7 +876,7 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,    return Builder->CreateICmp(NewPred, Input, RangeEnd);  } -/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. +/// Fold (icmp)&(icmp) if possible.  
Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {    ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1123,9 +1120,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {    return nullptr;  } -/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp).  NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)&(fcmp).  NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function.  Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {    if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&        RHS->getPredicate() == FCmpInst::FCMP_ORD) { @@ -1203,6 +1199,54 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {    return nullptr;  } +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, +                                       InstCombiner::BuilderTy *Builder) { +  auto Opcode = I.getOpcode(); +  assert((Opcode == Instruction::And || Opcode == Instruction::Or) && +         "Trying to match De Morgan's Laws with something other than and/or"); +  // Flip the logic operation. +  if (Opcode == Instruction::And) +    Opcode = Instruction::Or; +  else +    Opcode = Instruction::And; + +  Value *Op0 = I.getOperand(0); +  Value *Op1 = I.getOperand(1); +  // TODO: Use pattern matchers instead of dyn_cast. +  if (Value *Op0NotVal = dyn_castNotVal(Op0)) +    if (Value *Op1NotVal = dyn_castNotVal(Op1)) +      if (Op0->hasOneUse() && Op1->hasOneUse()) { +        Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, +                                              I.getName() + ".demorgan"); +        return BinaryOperator::CreateNot(LogicOp); +      } + +  // De Morgan's Law in disguise: +  // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) +  // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) +  Value *A = nullptr; +  Value *B = nullptr; +  ConstantInt *C1 = nullptr; +  if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && +      match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { +    // TODO: This check could be loosened to handle different type sizes. +    // Alternatively, we could fix the definition of m_Not to recognize a not +    // operation hidden by a zext? +    if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && +        C1->isOne()) { +      Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, +                                            I.getName() + ".demorgan"); +      Value *Not = Builder->CreateNot(LogicOp); +      return CastInst::CreateZExtOrBitCast(Not, I.getType()); +    } +  } + +  return nullptr; +} +  Instruction *InstCombiner::visitAnd(BinaryOperator &I) {    bool Changed = SimplifyAssociativeOrCommutative(I);    Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -1273,6 +1317,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {          if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))            return BinaryOperator::CreateAnd(V, AndRHS); +        // -x & 1 -> x & 1 +        if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) +          return BinaryOperator::CreateAnd(Op0RHS, AndRHS); +          // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS          // has 1's for all bits that the subtraction with A might affect.          
if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { @@ -1329,15 +1377,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {          return NV;    } - -  // (~A & ~B) == (~(A | B)) - De Morgan's Law -  if (Value *Op0NotVal = dyn_castNotVal(Op0)) -    if (Value *Op1NotVal = dyn_castNotVal(Op1)) -      if (Op0->hasOneUse() && Op1->hasOneUse()) { -        Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, -                                      I.getName()+".demorgan"); -        return BinaryOperator::CreateNot(Or); -      } +  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) +    return DeMorgan;    {      Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; @@ -1446,14 +1487,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {          return ReplaceInstUsesWith(I, Res); -  // fold (and (cast A), (cast B)) -> (cast (and A, B)) -  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) +  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { +    Value *Op0COp = Op0C->getOperand(0); +    Type *SrcTy = Op0COp->getType(); +    // fold (and (cast A), (cast B)) -> (cast (and A, B))      if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { -      Type *SrcTy = Op0C->getOperand(0)->getType();        if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ?            SrcTy == Op1C->getOperand(0)->getType() &&            SrcTy->isIntOrIntVectorTy()) { -        Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); +        Value *Op1COp = Op1C->getOperand(0);          // Only do this if the casts both really cause code to be generated.          if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && @@ -1478,6 +1520,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {        }      } +    // If we are masking off the sign bit of a floating-point value, convert +    // this to the canonical fabs intrinsic call and cast back to integer. +    // The backend should know how to optimize fabs(). +    // TODO: This transform should also apply to vectors. +    ConstantInt *CI; +    if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() && +        match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) { +      Module *M = I.getModule(); +      Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy); +      Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs"); +      return CastInst::CreateBitOrPointerCast(Call, I.getType()); +    } +  } +    {      Value *X = nullptr;      bool OpsSwapped = false; @@ -1509,163 +1565,195 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {    return Changed ? &I : nullptr;  } -/// CollectBSwapParts - Analyze the specified subexpression and see if it is -/// capable of providing pieces of a bswap.  The subexpression provides pieces -/// of a bswap if it is proven that each of the non-zero bytes in the output of -/// the expression came from the corresponding "byte swapped" byte in some other -/// value.  For example, if the current subexpression is "(shl i32 %X, 24)" then -/// we know that the expression deposits the low byte of %X into the high byte -/// of the bswap result and that all other bytes are zero.  This expression is -/// accepted, the high byte of ByteValues is set to X to indicate a correct -/// match. + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. 
The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// (value, bitnumber) to bitnumber. It is the caller's responsibility to +/// validate that all `value`s are identical and that the bitnumber to bitnumber +/// mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted, +/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].  ///  /// This function returns true if the match was unsuccessful and false if so.  /// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bytes that the subexpression is later shifted.  For +/// indicating the number of bits that the subexpression is later shifted.  For  /// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -2 on entry.  This is used to specify which -/// byte of ByteValues is actually being set. +/// OverallLeftShift value would be -16 on entry.  This is used to specify which +/// bits of BitValues are actually being set.  /// -/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding -/// byte is masked to zero by a user.  For example, in (X & 255), X will be -/// processed with a bytemask of 1.  Because bytemask is 32-bits, this limits -/// this function to working on up to 32-byte (256 bit) values.  ByteMask is -/// always in the local (OverallLeftShift) coordinate space. +/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding +/// bit is masked to zero by a user.  For example, in (X & 255), X will be +/// processed with a bytemask of 255. BitMask is always in the local +/// (OverallLeftShift) coordinate space.  /// -static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, -                              SmallVectorImpl<Value *> &ByteValues) { +static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, +                            SmallVectorImpl<Value *> &BitValues, +                            SmallVectorImpl<int> &BitProvenance) {    if (Instruction *I = dyn_cast<Instruction>(V)) {      // If this is an or instruction, it may be an inner node of the bswap. -    if (I->getOpcode() == Instruction::Or) { -      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, -                               ByteValues) || -             CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, -                               ByteValues); -    } - -    // If this is a logical shift by a constant multiple of 8, recurse with -    // OverallLeftShift and ByteMask adjusted. +    if (I->getOpcode() == Instruction::Or) +      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, +                             BitValues, BitProvenance) || +             CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, +                             BitValues, BitProvenance); + +    // If this is a logical shift by a constant, recurse with OverallLeftShift +    // and BitMask adjusted.      
if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {        unsigned ShAmt = -        cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); -      // Ensure the shift amount is defined and of a byte value. -      if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) +          cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); +      // Ensure the shift amount is defined. +      if (ShAmt > BitValues.size())          return true; -      unsigned ByteShift = ShAmt >> 3; +      unsigned BitShift = ShAmt;        if (I->getOpcode() == Instruction::Shl) { -        // X << 2 -> collect(X, +2) -        OverallLeftShift += ByteShift; -        ByteMask >>= ByteShift; +        // X << C -> collect(X, +C) +        OverallLeftShift += BitShift; +        BitMask = BitMask.lshr(BitShift);        } else { -        // X >>u 2 -> collect(X, -2) -        OverallLeftShift -= ByteShift; -        ByteMask <<= ByteShift; -        ByteMask &= (~0U >> (32-ByteValues.size())); +        // X >>u C -> collect(X, -C) +        OverallLeftShift -= BitShift; +        BitMask = BitMask.shl(BitShift);        } -      if (OverallLeftShift >= (int)ByteValues.size()) return true; -      if (OverallLeftShift <= -(int)ByteValues.size()) return true; +      if (OverallLeftShift >= (int)BitValues.size()) +        return true; +      if (OverallLeftShift <= -(int)BitValues.size()) +        return true; -      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, -                               ByteValues); +      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, +                             BitValues, BitProvenance);      } -    // If this is a logical 'and' with a mask that clears bytes, clear the -    // corresponding bytes in ByteMask. +    // If this is a logical 'and' with a mask that clears bits, clear the +    // corresponding bits in BitMask.      if (I->getOpcode() == Instruction::And &&          isa<ConstantInt>(I->getOperand(1))) { -      // Scan every byte of the and mask, seeing if the byte is either 0 or 255. -      unsigned NumBytes = ByteValues.size(); -      APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); +      unsigned NumBits = BitValues.size(); +      APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);        const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); -      for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { -        // If this byte is masked out by a later operation, we don't care what +      for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { +        // If this bit is masked out by a later operation, we don't care what          // the and mask is. -        if ((ByteMask & (1 << i)) == 0) +        if (BitMask[i] == 0)            continue; -        // If the AndMask is all zeros for this byte, clear the bit. -        APInt MaskB = AndMask & Byte; +        // If the AndMask is zero for this bit, clear the bit. +        APInt MaskB = AndMask & Bit;          if (MaskB == 0) { -          ByteMask &= ~(1U << i); +          BitMask.clearBit(i);            continue;          } -        // If the AndMask is not all ones for this byte, it's not a bytezap. -        if (MaskB != Byte) -          return true; - -        // Otherwise, this byte is kept. +        // Otherwise, this bit is kept.        
} -      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, -                               ByteValues); +      return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, +                             BitValues, BitProvenance);      }    }    // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be -  // the input value to the bswap.  Some observations: 1) if more than one byte -  // is demanded from this input, then it could not be successfully assembled -  // into a byteswap.  At least one of the two bytes would not be aligned with -  // their ultimate destination. -  if (!isPowerOf2_32(ByteMask)) return true; -  unsigned InputByteNo = countTrailingZeros(ByteMask); - -  // 2) The input and ultimate destinations must line up: if byte 3 of an i32 -  // is demanded, it needs to go into byte 0 of the result.  This means that the -  // byte needs to be shifted until it lands in the right byte bucket.  The -  // shift amount depends on the position: if the byte is coming from the high -  // part of the value (e.g. byte 3) then it must be shifted right.  If from the -  // low part, it must be shifted left. -  unsigned DestByteNo = InputByteNo + OverallLeftShift; -  if (ByteValues.size()-1-DestByteNo != InputByteNo) +  // the input value to the bswap/bitreverse. To be part of a bswap or +  // bitreverse we must be demanding a contiguous range of bits from it. +  unsigned InputBitLen = BitMask.countPopulation(); +  unsigned InputBitNo = BitMask.countTrailingZeros(); +  if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != +      InputBitLen) +    // Not a contiguous set range of bits!      return true; -  // If the destination byte value is already defined, the values are or'd -  // together, which isn't a bswap (unless it's an or of the same bits). -  if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) +  // We know we're moving a contiguous range of bits from the input to the +  // output. Record which bits in the output came from which bits in the input. +  unsigned DestBitNo = InputBitNo + OverallLeftShift; +  for (unsigned I = 0; I < InputBitLen; ++I) +    BitProvenance[DestBitNo + I] = InputBitNo + I; + +  // If the destination bit value is already defined, the values are or'd +  // together, which isn't a bswap/bitreverse (unless it's an or of the same +  // bits). +  if (BitValues[DestBitNo] && BitValues[DestBitNo] != V)      return true; -  ByteValues[DestByteNo] = V; +  for (unsigned I = 0; I < InputBitLen; ++I) +    BitValues[DestBitNo + I] = V; +    return false;  } -/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. -/// If so, insert the new bswap intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { -  IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); -  if (!ITy || ITy->getBitWidth() % 16 || -      // ByteMask only allows up to 32-byte values. -      ITy->getBitWidth() > 32*8) -    return nullptr;   // Can only bswap pairs of bytes.  Can't do vectors. +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, +                                          unsigned BitWidth) { +  if (From % 8 != To % 8) +    return false; +  // Convert from bit indices to byte indices and check for a byte reversal. +  From >>= 3; +  To >>= 3; +  BitWidth >>= 3; +  return From == BitWidth - To - 1; +} -  /// ByteValues - For each byte of the result, we keep track of which value -  /// defines each byte. 
-  SmallVector<Value*, 8> ByteValues; -  ByteValues.resize(ITy->getBitWidth()/8); +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, +                                               unsigned BitWidth) { +  return From == BitWidth - To - 1; +} +/// Given an OR instruction, check to see if this is a bswap or bitreverse +/// idiom. If so, insert the new intrinsic and return it. +Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { +  IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); +  if (!ITy) +    return nullptr;   // Can't do vectors. +  unsigned BW = ITy->getBitWidth(); +   +  /// We keep track of which bit (BitProvenance) inside which value (BitValues) +  /// defines each bit in the result. +  SmallVector<Value *, 8> BitValues(BW, nullptr); +  SmallVector<int, 8> BitProvenance(BW, -1); +      // Try to find all the pieces corresponding to the bswap. -  uint32_t ByteMask = ~0U >> (32-ByteValues.size()); -  if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) +  APInt BitMask = APInt::getAllOnesValue(BitValues.size()); +  if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))      return nullptr; -  // Check to see if all of the bytes come from the same value. -  Value *V = ByteValues[0]; -  if (!V) return nullptr;  // Didn't find a byte?  Must be zero. +  // Check to see if all of the bits come from the same value. +  Value *V = BitValues[0]; +  if (!V) return nullptr;  // Didn't find a bit?  Must be zero. -  // Check to make sure that all of the bytes come from the same value. -  for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) -    if (ByteValues[i] != V) -      return nullptr; -  Module *M = I.getParent()->getParent()->getParent(); -  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); +  if (!std::all_of(BitValues.begin(), BitValues.end(), +                   [&](const Value *X) { return X == V; })) +    return nullptr; + +  // Now, is the bit permutation correct for a bswap or a bitreverse? We can +  // only byteswap values with an even number of bytes. +  bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;; +  for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { +    OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); +    OKForBitReverse &= +        bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); +  } + +  Intrinsic::ID Intrin; +  if (OKForBSwap) +    Intrin = Intrinsic::bswap; +  else if (OKForBitReverse) +    Intrin = Intrinsic::bitreverse; +  else +    return nullptr; + +  Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);    return CallInst::Create(F, V);  } -/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D).  Check -/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then -/// we can simplify this expression to "cond ? C : D or B". +/// We have an expression of the form (A&C)|(B&D).  Check if A is (cond?-1:0) +/// and either B or D is ~(cond?-1,0) or (cond?0,-1), then we can simplify this +/// expression to "cond ? C : D or B".  static Instruction *MatchSelectFromAndOr(Value *A, Value *B,                                           Value *C, Value *D) {    // If A is not a select of -1/0, this cannot match. @@ -1688,7 +1776,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,    return nullptr;  } -/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. +/// Fold (icmp)|(icmp) if possible.  
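Returning to the bswap/bitreverse matcher above: CollectBitParts records, for every bit of the result, which bit of which input value produced it, and MatchBSwapOrBitReverse then accepts the expression only if that mapping is a whole-value byte swap or bit reversal. A standalone sketch of the two per-bit checks, applied to the provenance of a 32-bit byte-swap idiom (illustrative C++ only; the helper names are ad hoc, not the patch's):

#include <cstdio>

// Byte swap keeps the bit offset within a byte and mirrors the byte index.
static bool okForBSwap(unsigned From, unsigned To, unsigned BitWidth) {
  if (From % 8 != To % 8)
    return false;
  return From / 8 == BitWidth / 8 - To / 8 - 1;
}

// Bit reversal mirrors every individual bit position.
static bool okForBitReverse(unsigned From, unsigned To, unsigned BitWidth) {
  return From == BitWidth - To - 1;
}

int main() {
  const unsigned BW = 32;
  bool BSwap = true, BitRev = true;
  for (unsigned To = 0; To != BW; ++To) {
    // Provenance of a byte-swapped value: output bit To comes from bit From.
    unsigned From = (BW / 8 - To / 8 - 1) * 8 + To % 8;
    BSwap &= okForBSwap(From, To, BW);
    BitRev &= okForBitReverse(From, To, BW);
  }
  std::printf("bswap: %d, bitreverse: %d\n", BSwap, BitRev); // bswap: 1, bitreverse: 0
}

With that mapping only the byte-swap check survives, so the matcher would emit Intrinsic::bswap rather than Intrinsic::bitreverse.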
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,                                     Instruction *CxtI) {    ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1905,14 +1993,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,      case ICmpInst::ICMP_EQ:        if (LHS->getOperand(0) == RHS->getOperand(0)) {          // if LHSCst and RHSCst differ only by one bit: -        // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 +        // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2          assert(LHSCst->getValue().ule(LHSCst->getValue()));          APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();          if (Xor.isPowerOf2()) { -          Value *NegCst = Builder->getInt(~Xor); -          Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); -          return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); +          Value *Cst = Builder->getInt(Xor); +          Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); +          return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);          }        } @@ -2020,9 +2108,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,    return nullptr;  } -/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp).  NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)|(fcmp).  NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function.  Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {    if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&        RHS->getPredicate() == FCmpInst::FCMP_UNO && @@ -2080,7 +2167,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {    return nullptr;  } -/// FoldOrWithConstants - This helper function folds: +/// This helper function folds:  ///  ///     ((A | B) & C1) | (B & C2)  /// @@ -2199,14 +2286,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {    ConstantInt *C1 = nullptr, *C2 = nullptr;    // (A | B) | C  and  A | (B | C)                  -> bswap if possible. +  bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || +                 match(Op1, m_Or(m_Value(), m_Value()));    // (A >> B) | (C << D)  and  (A << B) | (B >> C)  -> bswap if possible. -  if (match(Op0, m_Or(m_Value(), m_Value())) || -      match(Op1, m_Or(m_Value(), m_Value())) || -      (match(Op0, m_LogicalShift(m_Value(), m_Value())) && -       match(Op1, m_LogicalShift(m_Value(), m_Value())))) { -    if (Instruction *BSwap = MatchBSwap(I)) +  bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && +                    match(Op1, m_LogicalShift(m_Value(), m_Value())); +  // (A & B) | (C & D)                              -> bswap if possible. 
+  bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && +                  match(Op1, m_And(m_Value(), m_Value())); + +  if (OrOfOrs || OrOfShifts || OrOfAnds) +    if (Instruction *BSwap = MatchBSwapOrBitReverse(I))        return BSwap; -  }    // (X^C)|Y -> (X|Y)^C iff Y&C == 0    if (Op0->hasOneUse() && @@ -2360,14 +2451,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {    if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))      return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); -  // (~A | ~B) == (~(A & B)) - De Morgan's Law -  if (Value *Op0NotVal = dyn_castNotVal(Op0)) -    if (Value *Op1NotVal = dyn_castNotVal(Op1)) -      if (Op0->hasOneUse() && Op1->hasOneUse()) { -        Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal, -                                        I.getName()+".demorgan"); -        return BinaryOperator::CreateNot(And); -      } +  if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) +    return DeMorgan;    // Canonicalize xor to the RHS.    bool SwappedForXor = false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6de380bcad67..e3634f269cf5 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -67,8 +67,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {    unsigned CopyAlign = MI->getAlignment();    if (CopyAlign < MinAlign) { -    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), -                                             MinAlign, false)); +    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));      return MI;    } @@ -198,12 +197,140 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {    return nullptr;  } +static Value *SimplifyX86immshift(const IntrinsicInst &II, +                                  InstCombiner::BuilderTy &Builder) { +  bool LogicalShift = false; +  bool ShiftLeft = false; + +  switch (II.getIntrinsicID()) { +  default: +    return nullptr; +  case Intrinsic::x86_sse2_psra_d: +  case Intrinsic::x86_sse2_psra_w: +  case Intrinsic::x86_sse2_psrai_d: +  case Intrinsic::x86_sse2_psrai_w: +  case Intrinsic::x86_avx2_psra_d: +  case Intrinsic::x86_avx2_psra_w: +  case Intrinsic::x86_avx2_psrai_d: +  case Intrinsic::x86_avx2_psrai_w: +    LogicalShift = false; ShiftLeft = false; +    break; +  case Intrinsic::x86_sse2_psrl_d: +  case Intrinsic::x86_sse2_psrl_q: +  case Intrinsic::x86_sse2_psrl_w: +  case Intrinsic::x86_sse2_psrli_d: +  case Intrinsic::x86_sse2_psrli_q: +  case Intrinsic::x86_sse2_psrli_w: +  case Intrinsic::x86_avx2_psrl_d: +  case Intrinsic::x86_avx2_psrl_q: +  case Intrinsic::x86_avx2_psrl_w: +  case Intrinsic::x86_avx2_psrli_d: +  case Intrinsic::x86_avx2_psrli_q: +  case Intrinsic::x86_avx2_psrli_w: +    LogicalShift = true; ShiftLeft = false; +    break; +  case Intrinsic::x86_sse2_psll_d: +  case Intrinsic::x86_sse2_psll_q: +  case Intrinsic::x86_sse2_psll_w: +  case Intrinsic::x86_sse2_pslli_d: +  case Intrinsic::x86_sse2_pslli_q: +  case Intrinsic::x86_sse2_pslli_w: +  case Intrinsic::x86_avx2_psll_d: +  case Intrinsic::x86_avx2_psll_q: +  case Intrinsic::x86_avx2_psll_w: +  case Intrinsic::x86_avx2_pslli_d: +  case Intrinsic::x86_avx2_pslli_q: +  case Intrinsic::x86_avx2_pslli_w: +    LogicalShift = true; ShiftLeft = true; +    break; +  } +  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift 
left"); + +  // Simplify if count is constant. +  auto Arg1 = II.getArgOperand(1); +  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); +  auto CDV = dyn_cast<ConstantDataVector>(Arg1); +  auto CInt = dyn_cast<ConstantInt>(Arg1); +  if (!CAZ && !CDV && !CInt) +    return nullptr; + +  APInt Count(64, 0); +  if (CDV) { +    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector +    // operand to compute the shift amount. +    auto VT = cast<VectorType>(CDV->getType()); +    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); +    assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); +    unsigned NumSubElts = 64 / BitWidth; + +    // Concatenate the sub-elements to create the 64-bit value. +    for (unsigned i = 0; i != NumSubElts; ++i) { +      unsigned SubEltIdx = (NumSubElts - 1) - i; +      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); +      Count = Count.shl(BitWidth); +      Count |= SubElt->getValue().zextOrTrunc(64); +    } +  } +  else if (CInt) +    Count = CInt->getValue(); + +  auto Vec = II.getArgOperand(0); +  auto VT = cast<VectorType>(Vec->getType()); +  auto SVT = VT->getElementType(); +  unsigned VWidth = VT->getNumElements(); +  unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + +  // If shift-by-zero then just return the original value. +  if (Count == 0) +    return Vec; + +  // Handle cases when Shift >= BitWidth. +  if (Count.uge(BitWidth)) { +    // If LogicalShift - just return zero. +    if (LogicalShift) +      return ConstantAggregateZero::get(VT); + +    // If ArithmeticShift - clamp Shift to (BitWidth - 1). +    Count = APInt(64, BitWidth - 1); +  } + +  // Get a constant vector of the same type as the first operand. +  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); +  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + +  if (ShiftLeft) +    return Builder.CreateShl(Vec, ShiftVec); + +  if (LogicalShift) +    return Builder.CreateLShr(Vec, ShiftVec); + +  return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *SimplifyX86extend(const IntrinsicInst &II, +                                InstCombiner::BuilderTy &Builder, +                                bool SignExtend) { +  VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); +  VectorType *DstTy = cast<VectorType>(II.getType()); +  unsigned NumDstElts = DstTy->getNumElements(); + +  // Extract a subvector of the first NumDstElts lanes and sign/zero extend. +  SmallVector<int, 8> ShuffleMask; +  for (int i = 0; i != (int)NumDstElts; ++i) +    ShuffleMask.push_back(i); + +  Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), +                                          UndefValue::get(SrcTy), ShuffleMask); +  return SignExtend ? Builder.CreateSExt(SV, DstTy) +                    : Builder.CreateZExt(SV, DstTy); +} +  static Value *SimplifyX86insertps(const IntrinsicInst &II,                                    InstCombiner::BuilderTy &Builder) {    if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {      VectorType *VecTy = cast<VectorType>(II.getType());      assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); -     +      // The immediate permute control byte looks like this:      //    [3:0] - zero mask for each 32-bit lane      //    [5:4] - select one 32-bit destination lane @@ -248,12 +375,202 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II,        // Replace the selected destination lane with the selected source lane.        
ShuffleMask[DestLane] = SourceLane + 4;      } -   +      return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);    }    return nullptr;  } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, +                               ConstantInt *CILength, ConstantInt *CIIndex, +                               InstCombiner::BuilderTy &Builder) { +  auto LowConstantHighUndef = [&](uint64_t Val) { +    Type *IntTy64 = Type::getInt64Ty(II.getContext()); +    Constant *Args[] = {ConstantInt::get(IntTy64, Val), +                        UndefValue::get(IntTy64)}; +    return ConstantVector::get(Args); +  }; + +  // See if we're dealing with constant values. +  Constant *C0 = dyn_cast<Constant>(Op0); +  ConstantInt *CI0 = +      C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) +         : nullptr; + +  // Attempt to constant fold. +  if (CILength && CIIndex) { +    // From AMD documentation: "The bit index and field length are each six +    // bits in length other bits of the field are ignored." +    APInt APIndex = CIIndex->getValue().zextOrTrunc(6); +    APInt APLength = CILength->getValue().zextOrTrunc(6); + +    unsigned Index = APIndex.getZExtValue(); + +    // From AMD documentation: "a value of zero in the field length is +    // defined as length of 64". +    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + +    // From AMD documentation: "If the sum of the bit index + length field +    // is greater than 64, the results are undefined". +    unsigned End = Index + Length; + +    // Note that both field index and field length are 8-bit quantities. +    // Since variables 'Index' and 'Length' are unsigned values +    // obtained from zero-extending field index and field length +    // respectively, their sum should never wrap around. +    if (End > 64) +      return UndefValue::get(II.getType()); + +    // If we are inserting whole bytes, we can convert this to a shuffle. +    // Lowering can recognize EXTRQI shuffle masks. +    if ((Length % 8) == 0 && (Index % 8) == 0) { +      // Convert bit indices to byte indices. +      Length /= 8; +      Index /= 8; + +      Type *IntTy8 = Type::getInt8Ty(II.getContext()); +      Type *IntTy32 = Type::getInt32Ty(II.getContext()); +      VectorType *ShufTy = VectorType::get(IntTy8, 16); + +      SmallVector<Constant *, 16> ShuffleMask; +      for (int i = 0; i != (int)Length; ++i) +        ShuffleMask.push_back( +            Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); +      for (int i = Length; i != 8; ++i) +        ShuffleMask.push_back( +            Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); +      for (int i = 8; i != 16; ++i) +        ShuffleMask.push_back(UndefValue::get(IntTy32)); + +      Value *SV = Builder.CreateShuffleVector( +          Builder.CreateBitCast(Op0, ShufTy), +          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); +      return Builder.CreateBitCast(SV, II.getType()); +    } + +    // Constant Fold - shift Index'th bit to lowest position and mask off +    // Length bits. +    if (CI0) { +      APInt Elt = CI0->getValue(); +      Elt = Elt.lshr(Index).zextOrTrunc(Length); +      return LowConstantHighUndef(Elt.getZExtValue()); +    } + +    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 
+    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { +      Value *Args[] = {Op0, CILength, CIIndex}; +      Module *M = II.getModule(); +      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); +      return Builder.CreateCall(F, Args); +    } +  } + +  // Constant Fold - extraction from zero is always {zero, undef}. +  if (CI0 && CI0->equalsInt(0)) +    return LowConstantHighUndef(0); + +  return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, +                                 APInt APLength, APInt APIndex, +                                 InstCombiner::BuilderTy &Builder) { + +  // From AMD documentation: "The bit index and field length are each six bits +  // in length other bits of the field are ignored." +  APIndex = APIndex.zextOrTrunc(6); +  APLength = APLength.zextOrTrunc(6); + +  // Attempt to constant fold. +  unsigned Index = APIndex.getZExtValue(); + +  // From AMD documentation: "a value of zero in the field length is +  // defined as length of 64". +  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + +  // From AMD documentation: "If the sum of the bit index + length field +  // is greater than 64, the results are undefined". +  unsigned End = Index + Length; + +  // Note that both field index and field length are 8-bit quantities. +  // Since variables 'Index' and 'Length' are unsigned values +  // obtained from zero-extending field index and field length +  // respectively, their sum should never wrap around. +  if (End > 64) +    return UndefValue::get(II.getType()); + +  // If we are inserting whole bytes, we can convert this to a shuffle. +  // Lowering can recognize INSERTQI shuffle masks. +  if ((Length % 8) == 0 && (Index % 8) == 0) { +    // Convert bit indices to byte indices. +    Length /= 8; +    Index /= 8; + +    Type *IntTy8 = Type::getInt8Ty(II.getContext()); +    Type *IntTy32 = Type::getInt32Ty(II.getContext()); +    VectorType *ShufTy = VectorType::get(IntTy8, 16); + +    SmallVector<Constant *, 16> ShuffleMask; +    for (int i = 0; i != (int)Index; ++i) +      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); +    for (int i = 0; i != (int)Length; ++i) +      ShuffleMask.push_back( +          Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); +    for (int i = Index + Length; i != 8; ++i) +      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); +    for (int i = 8; i != 16; ++i) +      ShuffleMask.push_back(UndefValue::get(IntTy32)); + +    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), +                                            Builder.CreateBitCast(Op1, ShufTy), +                                            ConstantVector::get(ShuffleMask)); +    return Builder.CreateBitCast(SV, II.getType()); +  } + +  // See if we're dealing with constant values. +  Constant *C0 = dyn_cast<Constant>(Op0); +  Constant *C1 = dyn_cast<Constant>(Op1); +  ConstantInt *CI00 = +      C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) +         : nullptr; +  ConstantInt *CI10 = +      C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) +         : nullptr; + +  // Constant Fold - insert bottom Length bits starting at the Index'th bit. 
+  if (CI00 && CI10) { +    APInt V00 = CI00->getValue(); +    APInt V10 = CI10->getValue(); +    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); +    V00 = V00 & ~Mask; +    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); +    APInt Val = V00 | V10; +    Type *IntTy64 = Type::getInt64Ty(II.getContext()); +    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), +                        UndefValue::get(IntTy64)}; +    return ConstantVector::get(Args); +  } + +  // If we were an INSERTQ call, we'll save demanded elements if we convert to +  // INSERTQI. +  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { +    Type *IntTy8 = Type::getInt8Ty(II.getContext()); +    Constant *CILength = ConstantInt::get(IntTy8, Length, false); +    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + +    Value *Args[] = {Op0, Op1, CILength, CIIndex}; +    Module *M = II.getModule(); +    Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); +    return Builder.CreateCall(F, Args); +  } + +  return nullptr; +} +  /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit  /// source vectors, unless a zero bit is set. If a zero bit is set,  /// then ignore that half of the mask and clear that half of the vector. @@ -289,7 +606,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II,      // The high bit of the selection field chooses the 1st or 2nd operand.      bool LowInputSelect = Imm & 0x02;      bool HighInputSelect = Imm & 0x20; -     +      // The low bit of the selection field chooses the low or high half      // of the selected operand.      bool LowHalfSelect = Imm & 0x01; @@ -298,11 +615,11 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II,      // Determine which operand(s) are actually in use for this instruction.      Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);      Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); -     +      // If needed, replace operands based on zero mask.      V0 = LowHalfZero ? ZeroVector : V0;      V1 = HighHalfZero ? ZeroVector : V1; -     +      // Permute low half of result.      unsigned StartIndex = LowHalfSelect ? HalfSize : 0;      for (unsigned i = 0; i < HalfSize; ++i) @@ -319,6 +636,43 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II,    return nullptr;  } +/// Decode XOP integer vector comparison intrinsics. +static Value *SimplifyX86vpcom(const IntrinsicInst &II, +                               InstCombiner::BuilderTy &Builder, bool IsSigned) { +  if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { +    uint64_t Imm = CInt->getZExtValue() & 0x7; +    VectorType *VecTy = cast<VectorType>(II.getType()); +    CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + +    switch (Imm) { +    case 0x0: +      Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; +      break; +    case 0x1: +      Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; +      break; +    case 0x2: +      Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; +      break; +    case 0x3: +      Pred = IsSigned ? 
ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; +      break; +    case 0x4: +      Pred = ICmpInst::ICMP_EQ; break; +    case 0x5: +      Pred = ICmpInst::ICMP_NE; break; +    case 0x6: +      return ConstantInt::getSigned(VecTy, 0); // FALSE +    case 0x7: +      return ConstantInt::getSigned(VecTy, -1); // TRUE +    } + +    if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1))) +      return Builder.CreateSExtOrTrunc(Cmp, VecTy); +  } +  return nullptr; +} +  /// visitCallInst - CallInst simplification.  This mostly only handles folding  /// of intrinsic instructions.  For normal calls, it allows visitCallSite to do  /// the heavy lifting. @@ -371,7 +725,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {        if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))          if (GVSrc->isConstant()) { -          Module *M = CI.getParent()->getParent()->getParent(); +          Module *M = CI.getModule();            Intrinsic::ID MemCpyID = Intrinsic::memcpy;            Type *Tys[3] = { CI.getArgOperand(0)->getType(),                             CI.getArgOperand(1)->getType(), @@ -400,6 +754,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      if (Changed) return II;    } +  auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) +  { +    APInt UndefElts(Width, 0); +    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); +    return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); +  }; +    switch (II->getIntrinsicID()) {    default: break;    case Intrinsic::objectsize: { @@ -427,6 +788,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      break;    } +  case Intrinsic::bitreverse: { +    Value *IIOperand = II->getArgOperand(0); +    Value *X = nullptr; + +    // bitreverse(bitreverse(x)) -> x +    if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) +      return ReplaceInstUsesWith(CI, X); +    break; +  } +    case Intrinsic::powi:      if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {        // powi(x, 0) -> 1.0 @@ -669,6 +1040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {        return new StoreInst(II->getArgOperand(0), Ptr);      }      break; +    case Intrinsic::x86_sse_storeu_ps:    case Intrinsic::x86_sse2_storeu_pd:    case Intrinsic::x86_sse2_storeu_dq: @@ -682,6 +1054,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      }      break; +  case Intrinsic::x86_vcvtph2ps_128: +  case Intrinsic::x86_vcvtph2ps_256: { +    auto Arg = II->getArgOperand(0); +    auto ArgType = cast<VectorType>(Arg->getType()); +    auto RetType = cast<VectorType>(II->getType()); +    unsigned ArgWidth = ArgType->getNumElements(); +    unsigned RetWidth = RetType->getNumElements(); +    assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); +    assert(ArgType->isIntOrIntVectorTy() && +           ArgType->getScalarSizeInBits() == 16 && +           "CVTPH2PS input type should be 16-bit integer vector"); +    assert(RetType->getScalarType()->isFloatTy() && +           "CVTPH2PS output type should be 32-bit float vector"); + +    // Constant folding: Convert to generic half to single conversion. 
+    if (isa<ConstantAggregateZero>(Arg)) +      return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + +    if (isa<ConstantDataVector>(Arg)) { +      auto VectorHalfAsShorts = Arg; +      if (RetWidth < ArgWidth) { +        SmallVector<int, 8> SubVecMask; +        for (unsigned i = 0; i != RetWidth; ++i) +          SubVecMask.push_back((int)i); +        VectorHalfAsShorts = Builder->CreateShuffleVector( +            Arg, UndefValue::get(ArgType), SubVecMask); +      } + +      auto VectorHalfType = +          VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); +      auto VectorHalfs = +          Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); +      auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); +      return ReplaceInstUsesWith(*II, VectorFloats); +    } + +    // We only use the lowest lanes of the argument. +    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { +      II->setArgOperand(0, V); +      return II; +    } +    break; +  } +    case Intrinsic::x86_sse_cvtss2si:    case Intrinsic::x86_sse_cvtss2si64:    case Intrinsic::x86_sse_cvttss2si: @@ -692,194 +1108,229 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {    case Intrinsic::x86_sse2_cvttsd2si64: {      // These intrinsics only demand the 0th element of their input vectors. If      // we can simplify the input based on that, do so now. -    unsigned VWidth = -      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); -    APInt DemandedElts(VWidth, 1); -    APInt UndefElts(VWidth, 0); -    if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), -                                              DemandedElts, UndefElts)) { +    Value *Arg = II->getArgOperand(0); +    unsigned VWidth = Arg->getType()->getVectorNumElements(); +    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {        II->setArgOperand(0, V);        return II;      }      break;    } -  // Constant fold <A x Bi> << Ci. -  // FIXME: We don't handle _dq because it's a shift of an i128, but is -  // represented in the IR as <2 x i64>. A per element shift is wrong. -  case Intrinsic::x86_sse2_psll_d: -  case Intrinsic::x86_sse2_psll_q: -  case Intrinsic::x86_sse2_psll_w: +  // Constant fold ashr( <A x Bi>, Ci ). +  // Constant fold lshr( <A x Bi>, Ci ). +  // Constant fold shl( <A x Bi>, Ci ). 
+  case Intrinsic::x86_sse2_psrai_d: +  case Intrinsic::x86_sse2_psrai_w: +  case Intrinsic::x86_avx2_psrai_d: +  case Intrinsic::x86_avx2_psrai_w: +  case Intrinsic::x86_sse2_psrli_d: +  case Intrinsic::x86_sse2_psrli_q: +  case Intrinsic::x86_sse2_psrli_w: +  case Intrinsic::x86_avx2_psrli_d: +  case Intrinsic::x86_avx2_psrli_q: +  case Intrinsic::x86_avx2_psrli_w:    case Intrinsic::x86_sse2_pslli_d:    case Intrinsic::x86_sse2_pslli_q:    case Intrinsic::x86_sse2_pslli_w: -  case Intrinsic::x86_avx2_psll_d: -  case Intrinsic::x86_avx2_psll_q: -  case Intrinsic::x86_avx2_psll_w:    case Intrinsic::x86_avx2_pslli_d:    case Intrinsic::x86_avx2_pslli_q:    case Intrinsic::x86_avx2_pslli_w: +    if (Value *V = SimplifyX86immshift(*II, *Builder)) +      return ReplaceInstUsesWith(*II, V); +    break; + +  case Intrinsic::x86_sse2_psra_d: +  case Intrinsic::x86_sse2_psra_w: +  case Intrinsic::x86_avx2_psra_d: +  case Intrinsic::x86_avx2_psra_w:    case Intrinsic::x86_sse2_psrl_d:    case Intrinsic::x86_sse2_psrl_q:    case Intrinsic::x86_sse2_psrl_w: -  case Intrinsic::x86_sse2_psrli_d: -  case Intrinsic::x86_sse2_psrli_q: -  case Intrinsic::x86_sse2_psrli_w:    case Intrinsic::x86_avx2_psrl_d:    case Intrinsic::x86_avx2_psrl_q:    case Intrinsic::x86_avx2_psrl_w: -  case Intrinsic::x86_avx2_psrli_d: -  case Intrinsic::x86_avx2_psrli_q: -  case Intrinsic::x86_avx2_psrli_w: { -    // Simplify if count is constant. To 0 if >= BitWidth, -    // otherwise to shl/lshr. -    auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); -    auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); -    if (!CDV && !CInt) -      break; -    ConstantInt *Count; -    if (CDV) -      Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); -    else -      Count = CInt; - -    auto Vec = II->getArgOperand(0); -    auto VT = cast<VectorType>(Vec->getType()); -    if (Count->getZExtValue() > -        VT->getElementType()->getPrimitiveSizeInBits() - 1) -      return ReplaceInstUsesWith( -          CI, ConstantAggregateZero::get(Vec->getType())); - -    bool isPackedShiftLeft = true; -    switch (II->getIntrinsicID()) { -    default : break; -    case Intrinsic::x86_sse2_psrl_d: -    case Intrinsic::x86_sse2_psrl_q: -    case Intrinsic::x86_sse2_psrl_w: -    case Intrinsic::x86_sse2_psrli_d: -    case Intrinsic::x86_sse2_psrli_q: -    case Intrinsic::x86_sse2_psrli_w: -    case Intrinsic::x86_avx2_psrl_d: -    case Intrinsic::x86_avx2_psrl_q: -    case Intrinsic::x86_avx2_psrl_w: -    case Intrinsic::x86_avx2_psrli_d: -    case Intrinsic::x86_avx2_psrli_q: -    case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; -    } - -    unsigned VWidth = VT->getNumElements(); -    // Get a constant vector of the same type as the first operand. -    auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); -    if (isPackedShiftLeft) -      return BinaryOperator::CreateShl(Vec, -          Builder->CreateVectorSplat(VWidth, VTCI)); - -    return BinaryOperator::CreateLShr(Vec, -        Builder->CreateVectorSplat(VWidth, VTCI)); +  case Intrinsic::x86_sse2_psll_d: +  case Intrinsic::x86_sse2_psll_q: +  case Intrinsic::x86_sse2_psll_w: +  case Intrinsic::x86_avx2_psll_d: +  case Intrinsic::x86_avx2_psll_q: +  case Intrinsic::x86_avx2_psll_w: { +    if (Value *V = SimplifyX86immshift(*II, *Builder)) +      return ReplaceInstUsesWith(*II, V); + +    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector +    // operand to compute the shift amount. 
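SimplifyX86immshift, invoked just above, can fold these shifts even when the constant count is out of range, because the x86 packed shifts (unlike IR shl/lshr/ashr) are fully defined there: logical shifts produce zero and arithmetic shifts behave like a shift by BitWidth - 1. A rough scalar sketch of that per-element rule (illustrative C++ only, assuming a two's-complement arithmetic right shift; not the patch's code):

#include <cstdint>
#include <cstdio>

static int32_t x86ElementShift(int32_t Elt, uint64_t Count, bool LogicalShift,
                               bool ShiftLeft) {
  const unsigned BitWidth = 32;
  if (Count >= BitWidth) {
    if (LogicalShift)
      return 0;              // psll/psrl: every bit is shifted out
    Count = BitWidth - 1;    // psra: clamp, result is all copies of the sign bit
  }
  if (ShiftLeft)
    return int32_t(uint32_t(Elt) << Count);
  if (LogicalShift)
    return int32_t(uint32_t(Elt) >> Count);
  return Elt >> Count;       // arithmetic right shift
}

int main() {
  std::printf("%d\n", x86ElementShift(-16, 40, /*Logical=*/true, /*Left=*/false));  // 0
  std::printf("%d\n", x86ElementShift(-16, 40, /*Logical=*/false, /*Left=*/false)); // -1
}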
+    Value *Arg1 = II->getArgOperand(1); +    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && +           "Unexpected packed shift size"); +    unsigned VWidth = Arg1->getType()->getVectorNumElements(); + +    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { +      II->setArgOperand(1, V); +      return II; +    } +    break;    } -  case Intrinsic::x86_sse41_pmovsxbw: -  case Intrinsic::x86_sse41_pmovsxwd: -  case Intrinsic::x86_sse41_pmovsxdq: +  case Intrinsic::x86_avx2_pmovsxbd: +  case Intrinsic::x86_avx2_pmovsxbq: +  case Intrinsic::x86_avx2_pmovsxbw: +  case Intrinsic::x86_avx2_pmovsxdq: +  case Intrinsic::x86_avx2_pmovsxwd: +  case Intrinsic::x86_avx2_pmovsxwq: +    if (Value *V = SimplifyX86extend(*II, *Builder, true)) +      return ReplaceInstUsesWith(*II, V); +    break; + +  case Intrinsic::x86_sse41_pmovzxbd: +  case Intrinsic::x86_sse41_pmovzxbq:    case Intrinsic::x86_sse41_pmovzxbw: +  case Intrinsic::x86_sse41_pmovzxdq:    case Intrinsic::x86_sse41_pmovzxwd: -  case Intrinsic::x86_sse41_pmovzxdq: { -    // pmov{s|z}x ignores the upper half of their input vectors. -    unsigned VWidth = -      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); -    unsigned LowHalfElts = VWidth / 2; -    APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); -    APInt UndefElts(VWidth, 0); -    if (Value *TmpV = SimplifyDemandedVectorElts( -            II->getArgOperand(0), InputDemandedElts, UndefElts)) { -      II->setArgOperand(0, TmpV); +  case Intrinsic::x86_sse41_pmovzxwq: +  case Intrinsic::x86_avx2_pmovzxbd: +  case Intrinsic::x86_avx2_pmovzxbq: +  case Intrinsic::x86_avx2_pmovzxbw: +  case Intrinsic::x86_avx2_pmovzxdq: +  case Intrinsic::x86_avx2_pmovzxwd: +  case Intrinsic::x86_avx2_pmovzxwq: +    if (Value *V = SimplifyX86extend(*II, *Builder, false)) +      return ReplaceInstUsesWith(*II, V); +    break; + +  case Intrinsic::x86_sse41_insertps: +    if (Value *V = SimplifyX86insertps(*II, *Builder)) +      return ReplaceInstUsesWith(*II, V); +    break; + +  case Intrinsic::x86_sse4a_extrq: { +    Value *Op0 = II->getArgOperand(0); +    Value *Op1 = II->getArgOperand(1); +    unsigned VWidth0 = Op0->getType()->getVectorNumElements(); +    unsigned VWidth1 = Op1->getType()->getVectorNumElements(); +    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && +           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && +           VWidth1 == 16 && "Unexpected operand sizes"); + +    // See if we're dealing with constant values. +    Constant *C1 = dyn_cast<Constant>(Op1); +    ConstantInt *CILength = +        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) +           : nullptr; +    ConstantInt *CIIndex = +        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) +           : nullptr; + +    // Attempt to simplify to a constant, shuffle vector or EXTRQI call. +    if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) +      return ReplaceInstUsesWith(*II, V); + +    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector +    // operands and the lowest 16-bits of the second. 
+    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { +      II->setArgOperand(0, V); +      return II; +    } +    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { +      II->setArgOperand(1, V);        return II;      }      break;    } -  case Intrinsic::x86_sse41_insertps: -    if (Value *V = SimplifyX86insertps(*II, *Builder)) + +  case Intrinsic::x86_sse4a_extrqi: { +    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining +    // bits of the lower 64-bits. The upper 64-bits are undefined. +    Value *Op0 = II->getArgOperand(0); +    unsigned VWidth = Op0->getType()->getVectorNumElements(); +    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && +           "Unexpected operand size"); + +    // See if we're dealing with constant values. +    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); +    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + +    // Attempt to simplify to a constant or shuffle vector. +    if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))        return ReplaceInstUsesWith(*II, V); + +    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector +    // operand. +    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { +      II->setArgOperand(0, V); +      return II; +    } +    break; +  } + +  case Intrinsic::x86_sse4a_insertq: { +    Value *Op0 = II->getArgOperand(0); +    Value *Op1 = II->getArgOperand(1); +    unsigned VWidth = Op0->getType()->getVectorNumElements(); +    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && +           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && +           Op1->getType()->getVectorNumElements() == 2 && +           "Unexpected operand size"); + +    // See if we're dealing with constant values. +    Constant *C1 = dyn_cast<Constant>(Op1); +    ConstantInt *CI11 = +        C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) +           : nullptr; + +    // Attempt to simplify to a constant, shuffle vector or INSERTQI call. +    if (CI11) { +      APInt V11 = CI11->getValue(); +      APInt Len = V11.zextOrTrunc(6); +      APInt Idx = V11.lshr(8).zextOrTrunc(6); +      if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) +        return ReplaceInstUsesWith(*II, V); +    } + +    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector +    // operand. +    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { +      II->setArgOperand(0, V); +      return II; +    }      break; -     +  } +    case Intrinsic::x86_sse4a_insertqi: { -    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top -    // ones undef -    // TODO: eventually we should lower this intrinsic to IR -    if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { -      if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { -        unsigned Index = CIStart->getZExtValue(); -        // From AMD documentation: "a value of zero in the field length is -        // defined as length of 64". -        unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); - -        // From AMD documentation: "If the sum of the bit index + length field -        // is greater than 64, the results are undefined". - -        // Note that both field index and field length are 8-bit quantities. 
-        // Since variables 'Index' and 'Length' are unsigned values -        // obtained from zero-extending field index and field length -        // respectively, their sum should never wrap around. -        if ((Index + Length) > 64) -          return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - -        if (Length == 64 && Index == 0) { -          Value *Vec = II->getArgOperand(1); -          Value *Undef = UndefValue::get(Vec->getType()); -          const uint32_t Mask[] = { 0, 2 }; -          return ReplaceInstUsesWith( -              CI, -              Builder->CreateShuffleVector( -                  Vec, Undef, ConstantDataVector::get( -                                  II->getContext(), makeArrayRef(Mask)))); - -        } else if (auto Source = -                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { -          if (Source->hasOneUse() && -              Source->getArgOperand(1) == II->getArgOperand(1)) { -            // If the source of the insert has only one use and it's another -            // insert (and they're both inserting from the same vector), try to -            // bundle both together. -            auto CISourceWidth = -                dyn_cast<ConstantInt>(Source->getArgOperand(2)); -            auto CISourceStart = -                dyn_cast<ConstantInt>(Source->getArgOperand(3)); -            if (CISourceStart && CISourceWidth) { -              unsigned Start = CIStart->getZExtValue(); -              unsigned Width = CIWidth->getZExtValue(); -              unsigned End = Start + Width; -              unsigned SourceStart = CISourceStart->getZExtValue(); -              unsigned SourceWidth = CISourceWidth->getZExtValue(); -              unsigned SourceEnd = SourceStart + SourceWidth; -              unsigned NewStart, NewWidth; -              bool ShouldReplace = false; -              if (Start <= SourceStart && SourceStart <= End) { -                NewStart = Start; -                NewWidth = std::max(End, SourceEnd) - NewStart; -                ShouldReplace = true; -              } else if (SourceStart <= Start && Start <= SourceEnd) { -                NewStart = SourceStart; -                NewWidth = std::max(SourceEnd, End) - NewStart; -                ShouldReplace = true; -              } - -              if (ShouldReplace) { -                Constant *ConstantWidth = ConstantInt::get( -                    II->getArgOperand(2)->getType(), NewWidth, false); -                Constant *ConstantStart = ConstantInt::get( -                    II->getArgOperand(3)->getType(), NewStart, false); -                Value *Args[4] = { Source->getArgOperand(0), -                                   II->getArgOperand(1), ConstantWidth, -                                   ConstantStart }; -                Module *M = CI.getParent()->getParent()->getParent(); -                Value *F = -                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); -                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); -              } -            } -          } -        } -      } +    // INSERTQI: Extract lowest Length bits from lower half of second source and +    // insert over first source starting at Index bit. The upper 64-bits are +    // undefined. 
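// Illustrative, self-contained C++ sketch (hypothetical helper name, not the
// LLVM implementation) of the low-64-bit behaviour described in the comment
// above, i.e. what SimplifyX86insertq can fold once Length and Index are
// known constants.  Assumes the AMD convention quoted in the removed code
// above: a length field of 0 means 64, and Index + Length > 64 is undefined.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t insertq_low64(uint64_t Dst, uint64_t Src, unsigned Length,
                              unsigned Index) {
  Length &= 0x3f;
  Index &= 0x3f;
  if (Length == 0)
    Length = 64;
  assert(Index + Length <= 64 && "undefined per the ISA; not folded");
  uint64_t Mask = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
  return (Dst & ~(Mask << Index)) | ((Src & Mask) << Index);
}

int main() {
  // Insert the low 8 bits of 0xAB at bit 16: 0x...55667788 -> 0x...55AB7788.
  std::printf("0x%llx\n", (unsigned long long)insertq_low64(
                              0x1122334455667788ULL, 0xAB, 8, 16));
  // Length 64 / Index 0 just copies the second source's low 64 bits, the
  // shuffle special case the removed code handled explicitly.
  std::printf("0x%llx\n",
              (unsigned long long)insertq_low64(0, 0xDEADBEEF, 0, 0));
  return 0;
}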
+    Value *Op0 = II->getArgOperand(0); +    Value *Op1 = II->getArgOperand(1); +    unsigned VWidth0 = Op0->getType()->getVectorNumElements(); +    unsigned VWidth1 = Op1->getType()->getVectorNumElements(); +    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && +           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && +           VWidth1 == 2 && "Unexpected operand sizes"); + +    // See if we're dealing with constant values. +    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); +    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + +    // Attempt to simplify to a constant or shuffle vector. +    if (CILength && CIIndex) { +      APInt Len = CILength->getValue().zextOrTrunc(6); +      APInt Idx = CIIndex->getValue().zextOrTrunc(6); +      if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) +        return ReplaceInstUsesWith(*II, V); +    } + +    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector +    // operands. +    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { +      II->setArgOperand(0, V); +      return II; +    } + +    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { +      II->setArgOperand(1, V); +      return II;      }      break;    } @@ -894,7 +1345,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      // This optimization is convoluted because the intrinsic is defined as      // getting a vector of floats or doubles for the ps and pd versions.      // FIXME: That should be changed. + +    Value *Op0 = II->getArgOperand(0); +    Value *Op1 = II->getArgOperand(1);      Value *Mask = II->getArgOperand(2); + +    // fold (blend A, A, Mask) -> A +    if (Op0 == Op1) +      return ReplaceInstUsesWith(CI, Op0); + +    // Zero Mask - select 1st argument. +    if (isa<ConstantAggregateZero>(Mask)) +      return ReplaceInstUsesWith(CI, Op0); + +    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.      if (auto C = dyn_cast<ConstantDataVector>(Mask)) {        auto Tyi1 = Builder->getInt1Ty();        auto SelectorType = cast<VectorType>(Mask->getType()); @@ -917,11 +1381,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {          Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1)));        }        auto NewSelector = ConstantVector::get(Selectors); -      return SelectInst::Create(NewSelector, II->getArgOperand(1), -                                II->getArgOperand(0), "blendv"); -    } else { -      break; +      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");      } +    break; +  } + +  case Intrinsic::x86_ssse3_pshuf_b_128: +  case Intrinsic::x86_avx2_pshuf_b: { +    // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant. +    auto *V = II->getArgOperand(1); +    auto *VTy = cast<VectorType>(V->getType()); +    unsigned NumElts = VTy->getNumElements(); +    assert((NumElts == 16 || NumElts == 32) && +           "Unexpected number of elements in shuffle mask!"); +    // Initialize the resulting shuffle mask to all zeroes. +    uint32_t Indexes[32] = {0}; + +    if (auto *Mask = dyn_cast<ConstantDataVector>(V)) { +      // Each byte in the shuffle control mask forms an index to permute the +      // corresponding byte in the destination operand. 
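// Illustrative, self-contained C++ sketch (hypothetical function name, not the
// LLVM implementation): the per-byte pshufb semantics that the constant-mask
// fold above turns into a shufflevector.  If bit 7 of a control byte is set
// the result byte is zero; otherwise the low 4 bits pick a byte from the same
// 16-byte lane (the 256-bit AVX2 form shuffles each 128-bit lane separately,
// hence the "+= I & 0xF0" lane adjustment in the code below).
#include <cstdint>
#include <cstdio>

static void pshufb(const uint8_t *Src, const uint8_t *Ctrl, uint8_t *Dst,
                   unsigned NumBytes /* 16 or 32 */) {
  for (unsigned I = 0; I != NumBytes; ++I) {
    unsigned LaneBase = I & ~15u;            // 0 for SSE; 0 or 16 for AVX2
    Dst[I] = (Ctrl[I] & 0x80) ? 0 : Src[LaneBase + (Ctrl[I] & 0x0f)];
  }
}

int main() {
  uint8_t Src[16], Ctrl[16], Dst[16];
  for (unsigned I = 0; I != 16; ++I) {
    Src[I] = static_cast<uint8_t>(0xA0 + I);
    Ctrl[I] = static_cast<uint8_t>(15 - I);  // reverse the bytes...
  }
  Ctrl[0] = 0x80;                            // ...but zero the first result
  pshufb(Src, Ctrl, Dst, 16);
  for (unsigned I = 0; I != 16; ++I)
    std::printf("%02x ", Dst[I]);
  std::printf("\n");
  return 0;
}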
+      for (unsigned I = 0; I < NumElts; ++I) { +        int8_t Index = Mask->getElementAsInteger(I); +        // If the most significant bit (bit[7]) of each byte of the shuffle +        // control mask is set, then zero is written in the result byte. +        // The zero vector is in the right-hand side of the resulting +        // shufflevector. + +        // The value of each index is the least significant 4 bits of the +        // shuffle control byte. +        Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; +      } +    } else if (!isa<ConstantAggregateZero>(V)) +      break; + +    // The value of each index for the high 128-bit lane is the least +    // significant 4 bits of the respective shuffle control byte. +    for (unsigned I = 16; I < NumElts; ++I) +      Indexes[I] += I & 0xF0; + +    auto NewC = ConstantDataVector::get(V->getContext(), +                                        makeArrayRef(Indexes, NumElts)); +    auto V1 = II->getArgOperand(0); +    auto V2 = Constant::getNullValue(II->getType()); +    auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); +    return ReplaceInstUsesWith(CI, Shuffle);    }    case Intrinsic::x86_avx_vpermilvar_ps: @@ -972,6 +1475,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {        return ReplaceInstUsesWith(*II, V);      break; +  case Intrinsic::x86_xop_vpcomb: +  case Intrinsic::x86_xop_vpcomd: +  case Intrinsic::x86_xop_vpcomq: +  case Intrinsic::x86_xop_vpcomw: +    if (Value *V = SimplifyX86vpcom(*II, *Builder, true)) +      return ReplaceInstUsesWith(*II, V); +    break; + +  case Intrinsic::x86_xop_vpcomub: +  case Intrinsic::x86_xop_vpcomud: +  case Intrinsic::x86_xop_vpcomuq: +  case Intrinsic::x86_xop_vpcomuw: +    if (Value *V = SimplifyX86vpcom(*II, *Builder, false)) +      return ReplaceInstUsesWith(*II, V); +    break; +    case Intrinsic::ppc_altivec_vperm:      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.      // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -1115,15 +1634,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      // happen when variable allocas are DCE'd.      if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {        if (SS->getIntrinsicID() == Intrinsic::stacksave) { -        BasicBlock::iterator BI = SS; -        if (&*++BI == II) +        if (&*++SS->getIterator() == II)            return EraseInstFromFunction(CI);        }      }      // Scan down this block to see if there is another stack restore in the      // same block without an intervening call/alloca. -    BasicBlock::iterator BI = II; +    BasicBlock::iterator BI(II);      TerminatorInst *TI = II->getParent()->getTerminator();      bool CannotRemove = false;      for (++BI; &*BI != TI; ++BI) { @@ -1153,6 +1671,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {        return EraseInstFromFunction(CI);      break;    } +  case Intrinsic::lifetime_start: { +    // Remove trivially empty lifetime_start/end ranges, i.e. a start +    // immediately followed by an end (ignoring debuginfo or other +    // lifetime markers in between). 
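// Illustrative, standalone C++ sketch (toy data types, not LLVM IR classes) of
// the forward scan described above: starting at a lifetime.start, skip
// debug-info and other lifetime.start markers; if the next interesting
// instruction is a lifetime.end on the same size and pointer operands, the
// start/end pair is trivially dead and both markers can be erased.
#include <cstddef>
#include <vector>

enum class Kind { LifetimeStart, LifetimeEnd, DbgValue, Other };
struct Inst {
  Kind K;
  const void *Ptr;   // the object the marker refers to
  std::size_t Size;
};

static bool isTriviallyEmptyRange(const std::vector<Inst> &Block,
                                  std::size_t StartIdx) {
  const Inst &Start = Block[StartIdx];
  for (std::size_t I = StartIdx + 1; I != Block.size(); ++I) {
    const Inst &Cur = Block[I];
    if (Cur.K == Kind::DbgValue || Cur.K == Kind::LifetimeStart)
      continue;                        // ignorable; keep scanning
    if (Cur.K == Kind::LifetimeEnd) {
      if (Cur.Ptr == Start.Ptr && Cur.Size == Start.Size)
        return true;                   // empty range: erase both markers
      continue;                        // an end for some other object
    }
    break;                             // any real instruction: range is used
  }
  return false;
}

int main() {
  int Slot = 0;
  std::vector<Inst> Block = {
      {Kind::LifetimeStart, &Slot, sizeof(Slot)},
      {Kind::DbgValue, nullptr, 0},
      {Kind::LifetimeEnd, &Slot, sizeof(Slot)},
  };
  return isTriviallyEmptyRange(Block, 0) ? 0 : 1;   // exits 0: pair is dead
}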
+    BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end(); +    for (++BI; BI != BE; ++BI) { +      if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) { +        if (isa<DbgInfoIntrinsic>(LTE) || +            LTE->getIntrinsicID() == Intrinsic::lifetime_start) +          continue; +        if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) { +          if (II->getOperand(0) == LTE->getOperand(0) && +              II->getOperand(1) == LTE->getOperand(1)) { +            EraseInstFromFunction(*LTE); +            return EraseInstFromFunction(*II); +          } +          continue; +        } +      } +      break; +    } +    break; +  }    case Intrinsic::assume: {      // Canonicalize assume(a && b) -> assume(a); assume(b);      // Note: New assumption intrinsics created here are registered by @@ -1233,7 +1774,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      }      // isKnownNonNull -> nonnull attribute -    if (isKnownNonNull(DerivedPtr)) +    if (isKnownNonNullAt(DerivedPtr, II, DT, TLI))        II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);      // isDereferenceablePointer -> deref attribute @@ -1355,9 +1896,10 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,                                                 Value *TrampMem) {    // Visit all the previous instructions in the basic block, and try to find a    // init.trampoline which has a direct path to the adjust.trampoline. -  for (BasicBlock::iterator I = AdjustTramp, -       E = AdjustTramp->getParent()->begin(); I != E; ) { -    Instruction *Inst = --I; +  for (BasicBlock::iterator I = AdjustTramp->getIterator(), +                            E = AdjustTramp->getParent()->begin(); +       I != E;) { +    Instruction *Inst = &*--I;      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))        if (II->getIntrinsicID() == Intrinsic::init_trampoline &&            II->getOperand(0) == TrampMem) @@ -1400,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {    // Mark any parameters that are known to be non-null with the nonnull    // attribute.  This is helpful for inlining calls to functions with null    // checks on their arguments. +  SmallVector<unsigned, 4> Indices;    unsigned ArgNo = 0; +    for (Value *V : CS.args()) { -    if (!CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && -        isKnownNonNull(V)) { -      AttributeSet AS = CS.getAttributes(); -      AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, -                           Attribute::NonNull); -      CS.setAttributes(AS); -      Changed = true; -    } +    if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && +        isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) +      Indices.push_back(ArgNo + 1);      ArgNo++;    } +    assert(ArgNo == CS.arg_size() && "sanity check"); +  if (!Indices.empty()) { +    AttributeSet AS = CS.getAttributes(); +    LLVMContext &Ctx = CS.getInstruction()->getContext(); +    AS = AS.addAttribute(Ctx, Indices, +                         Attribute::get(Ctx, Attribute::NonNull)); +    CS.setAttributes(AS); +    Changed = true; +  } +    // If the callee is a pointer to a function, attempt to move any casts to the    // arguments of the call/invoke.    
Value *Callee = CS.getCalledValue(); @@ -1725,16 +2274,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {    const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),                                                         attrVec); +  SmallVector<OperandBundleDef, 1> OpBundles; +  CS.getOperandBundlesAsDefs(OpBundles); +    Instruction *NC;    if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { -    NC = Builder->CreateInvoke(Callee, II->getNormalDest(), -                               II->getUnwindDest(), Args); +    NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), +                               Args, OpBundles);      NC->takeName(II);      cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());      cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);    } else {      CallInst *CI = cast<CallInst>(Caller); -    NC = Builder->CreateCall(Callee, Args); +    NC = Builder->CreateCall(Callee, Args, OpBundles);      NC->takeName(CI);      if (CI->isTailCall())        cast<CallInst>(NC)->setTailCall(); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 48ab0eb2c1b9..da835a192322 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -21,11 +21,11 @@ using namespace PatternMatch;  #define DEBUG_TYPE "instcombine" -/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear -/// expression.  If so, decompose it, returning some value X, such that Val is +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is  /// X*Scale+Offset.  /// -static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,                                          uint64_t &Offset) {    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {      Offset = CI->getZExtValue(); @@ -62,7 +62,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,          // where C1 is divisible by C2.          unsigned SubScale;          Value *SubVal = -          DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); +          decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);          Offset += RHS->getZExtValue();          Scale = SubScale;          return SubVal; @@ -76,14 +76,14 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,    return Val;  } -/// PromoteCastOfAllocation - If we find a cast of an allocation instruction, -/// try to eliminate the cast by moving the type information into the alloc. +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc.  Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,                                                     AllocaInst &AI) {    PointerType *PTy = cast<PointerType>(CI.getType());    BuilderTy AllocaBuilder(*Builder); -  AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); +  AllocaBuilder.SetInsertPoint(&AI);    // Get the type really allocated and the type casted to.    Type *AllocElTy = AI.getAllocatedType(); @@ -114,7 +114,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,    unsigned ArraySizeScale;    uint64_t ArrayOffset;    Value *NumElements = // See if the array size is a decomposable linear expr. 
-    DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); +    decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);    // If we can now satisfy the modulus, by using a non-1 scale, we really can    // do the xform. @@ -154,9 +154,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,    return ReplaceInstUsesWith(CI, New);  } -/// EvaluateInDifferentType - Given an expression that -/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually -/// insert the code to evaluate the expression. +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression.  Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,                                               bool isSigned) {    if (Constant *C = dyn_cast<Constant>(V)) { @@ -261,9 +260,9 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction    return Instruction::CastOps(Res);  } -/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually -/// results in any code being generated and is interesting to optimize out. If -/// the cast can be eliminated by some other simple transformation, we prefer +/// Return true if the cast from "V to Ty" actually results in any code being +/// generated and is interesting to optimize out. +/// If the cast can be eliminated by some other simple transformation, we prefer  /// to do the simplification first.  bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,                                        Type *Ty) { @@ -318,9 +317,9 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {    return nullptr;  } -/// CanEvaluateTruncated - Return true if we can evaluate the specified -/// expression tree as type Ty instead of its larger type, and arrive with the -/// same value.  This is used by code that tries to eliminate truncates. +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates.  ///  /// Ty will always be a type smaller than V.  We should return true if trunc(V)  /// can be computed by computing V in the smaller type.  If V is an instruction, @@ -329,7 +328,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {  ///  /// This function works on both vectors and scalars.  /// -static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,                                   Instruction *CxtI) {    // We can always evaluate constants in another type.    if (isa<Constant>(V)) @@ -359,8 +358,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,    case Instruction::Or:    case Instruction::Xor:      // These operators can all arbitrarily be extended or truncated. 
-    return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && -           CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); +    return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && +           canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);    case Instruction::UDiv:    case Instruction::URem: { @@ -371,8 +370,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,        APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);        if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&            IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { -        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && -               CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); +        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && +               canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);        }      }      break; @@ -383,7 +382,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,      if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {        uint32_t BitWidth = Ty->getScalarSizeInBits();        if (CI->getLimitedValue(BitWidth) < BitWidth) -        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); +        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);      }      break;    case Instruction::LShr: @@ -396,7 +395,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,        if (IC.MaskedValueIsZero(I->getOperand(0),              APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) &&            CI->getLimitedValue(BitWidth) < BitWidth) { -        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); +        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);        }      }      break; @@ -410,8 +409,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,      return true;    case Instruction::Select: {      SelectInst *SI = cast<SelectInst>(I); -    return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && -           CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); +    return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && +           canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);    }    case Instruction::PHI: {      // We can change a phi if we can change all operands.  Note that we never @@ -419,7 +418,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,      // instructions with a single use.      PHINode *PN = cast<PHINode>(I);      for (Value *IncValue : PN->incoming_values()) -      if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) +      if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))          return false;      return true;    } @@ -431,6 +430,50 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,    return false;  } +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. 
+/// Example (big endian): +///   trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 +///   ---> +///   extractelement <4 x i32> %X, 1 +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC, +                                         const DataLayout &DL) { +  Value *TruncOp = Trunc.getOperand(0); +  Type *DestType = Trunc.getType(); +  if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType)) +    return nullptr; + +  Value *VecInput = nullptr; +  ConstantInt *ShiftVal = nullptr; +  if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)), +                                  m_LShr(m_BitCast(m_Value(VecInput)), +                                         m_ConstantInt(ShiftVal)))) || +      !isa<VectorType>(VecInput->getType())) +    return nullptr; + +  VectorType *VecType = cast<VectorType>(VecInput->getType()); +  unsigned VecWidth = VecType->getPrimitiveSizeInBits(); +  unsigned DestWidth = DestType->getPrimitiveSizeInBits(); +  unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0; + +  if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0)) +    return nullptr; + +  // If the element type of the vector doesn't match the result type, +  // bitcast it to a vector type that we can extract from. +  unsigned NumVecElts = VecWidth / DestWidth; +  if (VecType->getElementType() != DestType) { +    VecType = VectorType::get(DestType, NumVecElts); +    VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc"); +  } + +  unsigned Elt = ShiftAmount / DestWidth; +  if (DL.isBigEndian()) +    Elt = NumVecElts - 1 - Elt; + +  return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); +} +  Instruction *InstCombiner::visitTrunc(TruncInst &CI) {    if (Instruction *Result = commonCastTransforms(CI))      return Result; @@ -441,7 +484,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {    // min/max.    Value *LHS, *RHS;    if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0))) -    if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN) +    if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)        return nullptr;    // See if we can simplify any instructions used by the input whose sole @@ -457,7 +500,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {    // expression tree to something weird like i93 unless the source is also    // strange.    if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && -      CanEvaluateTruncated(Src, DestTy, *this, &CI)) { +      canEvaluateTruncated(Src, DestTy, *this, &CI)) {      // If this cast is a truncate, evaluting in a different type always      // eliminates the cast, so it is always a win. @@ -470,7 +513,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {    // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.    if (DestTy->getScalarSizeInBits() == 1) { -    Constant *One = ConstantInt::get(Src->getType(), 1); +    Constant *One = ConstantInt::get(SrcTy, 1);      Src = Builder->CreateAnd(Src, One);      Value *Zero = Constant::getNullValue(Src->getType());      return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); @@ -489,31 +532,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {      // If the shift amount is larger than the size of A, then the result is      // known to be zero because all the input bits got shifted out.      
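// Standalone C++ demonstration (host arithmetic, not LLVM IR) of the
// equivalence behind foldVecTruncToExtElt above: truncating a shifted integer
// view of a vector is the same as extracting one element, with the element
// index mirrored on big-endian targets exactly as the code above does.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Vec[2] = {0x11111111u, 0x22222222u};

  uint64_t AsInt;                            // "bitcast <2 x i32> %X to i64"
  std::memcpy(&AsInt, Vec, sizeof(AsInt));
  uint32_t Truncated = static_cast<uint32_t>(AsInt >> 32);  // lshr 32 + trunc

  unsigned Elt = 32 / 32;                    // ShiftAmount / DestWidth
  uint16_t Probe = 1;
  uint8_t LowByte;
  std::memcpy(&LowByte, &Probe, 1);
  if (LowByte != 1)                          // big-endian host
    Elt = 2 - 1 - Elt;                       // NumVecElts - 1 - Elt
  assert(Truncated == Vec[Elt]);
  return 0;
}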
if (Cst->getZExtValue() >= ASize) -      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType())); +      return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy));      // Since we're doing an lshr and a zero extend, and know that the shift      // amount is smaller than ASize, it is always safe to do the shift in A's      // type, then zero extend or truncate to the result.      Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue());      Shift->takeName(Src); -    return CastInst::CreateIntegerCast(Shift, CI.getType(), false); +    return CastInst::CreateIntegerCast(Shift, DestTy, false); +  } + +  // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type +  // conversion. +  // It works because bits coming from sign extension have the same value as +  // the sign bit of the original value; performing ashr instead of lshr +  // generates bits of the same value as the sign bit. +  if (Src->hasOneUse() && +      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && +      cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { +    const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); +    // This optimization can be only performed when zero bits generated by +    // the original lshr aren't pulled into the value after truncation, so we +    // can only shift by values smaller than the size of destination type (in +    // bits). +    if (Cst->getValue().ult(ASize)) { +      Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); +      Shift->takeName(Src); +      return CastInst::CreateIntegerCast(Shift, CI.getType(), true); +    }    }    // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest    // type isn't non-native. -  if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && -      ShouldChangeType(Src->getType(), CI.getType()) && +  if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && +      ShouldChangeType(SrcTy, DestTy) &&        match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) { -    Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr"); +    Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr");      return BinaryOperator::CreateAnd(NewTrunc, -                                     ConstantExpr::getTrunc(Cst, CI.getType())); +                                     ConstantExpr::getTrunc(Cst, DestTy));    } +  if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL)) +    return I; +    return nullptr;  } -/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (zext icmp) to bitwise / integer operations in order to eliminate +/// the icmp.  Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,                                               bool DoXform) {    // If we are just checking for a icmp eq of a single bit and zext'ing it @@ -637,8 +703,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,    return nullptr;  } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits.  If not, return false. +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false.  
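// Standalone C++ check (host arithmetic, not LLVM IR) of the claim made for
// the trunc(lshr(sext A), Cst) -> ashr fold above: for shift counts smaller
// than the width of A, the bits a logical shift pulls down out of the sign
// extension are copies of the sign bit, so the truncated result equals an
// arithmetic shift done directly on A.  Assumes arithmetic >> on signed
// values (guaranteed since C++20, true in practice before that).
#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t A = static_cast<int8_t>(V);
    for (unsigned C = 0; C < 8; ++C) {       // count must be < bitwidth of A
      uint32_t Wide = static_cast<uint32_t>(static_cast<int32_t>(A)); // sext
      int8_t ViaLshr = static_cast<int8_t>(Wide >> C);   // lshr + trunc
      int8_t ViaAshr = static_cast<int8_t>(A >> C);      // ashr in i8
      assert(ViaLshr == ViaAshr);
    }
  }
  return 0;
}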
///  /// If this function returns true, it can also return a non-zero number of bits  /// (in BitsToClear) which indicates that the value it computes is correct for @@ -655,7 +721,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,  /// clear the top bits anyway, doing this has no extra cost.  ///  /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, +static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,                               InstCombiner &IC, Instruction *CxtI) {    BitsToClear = 0;    if (isa<Constant>(V)) @@ -685,8 +751,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,    case Instruction::Add:    case Instruction::Sub:    case Instruction::Mul: -    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || -        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) +    if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || +        !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))        return false;      // These can all be promoted if neither operand has 'bits to clear'.      if (BitsToClear == 0 && Tmp == 0) @@ -713,7 +779,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,      // We can promote shl(x, cst) if we can promote x.  Since shl overwrites the      // upper bits we can reduce BitsToClear by the shift amount.      if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { -      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) +      if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))          return false;        uint64_t ShiftAmt = Amt->getZExtValue();        BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; @@ -724,7 +790,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,      // We can promote lshr(x, cst) if we can promote x.  This requires the      // ultimate 'and' to clear out the high zero bits we're clearing out though.      if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { -      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) +      if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))          return false;        BitsToClear += Amt->getZExtValue();        if (BitsToClear > V->getType()->getScalarSizeInBits()) @@ -734,8 +800,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,      // Cannot promote variable LSHR.      return false;    case Instruction::Select: -    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || -        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || +    if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || +        !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||          // TODO: If important, we could handle the case when the BitsToClear are          // known zero in the disagreeing side.          Tmp != BitsToClear) @@ -747,10 +813,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,      // get into trouble with cyclic PHIs here because we only consider      // instructions with a single use.      
PHINode *PN = cast<PHINode>(I); -    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) +    if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))        return false;      for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) -      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || +      if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||            // TODO: If important, we could handle the case when the BitsToClear            // are known zero in the disagreeing input.            Tmp != BitsToClear) @@ -787,13 +853,13 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {    // strange.    unsigned BitsToClear;    if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && -      CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { +      canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {      assert(BitsToClear < SrcTy->getScalarSizeInBits() &&             "Unreasonable BitsToClear");      // Okay, we can transform this!  Insert the new expression now.      DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" -          " to avoid zero extend: " << CI); +          " to avoid zero extend: " << CI << '\n');      Value *Res = EvaluateInDifferentType(Src, DestTy, false);      assert(Res->getType() == DestTy); @@ -897,8 +963,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {    return nullptr;  } -/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.  Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {    Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);    ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -985,15 +1050,14 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {    return nullptr;  } -/// CanEvaluateSExtd - Return true if we can take the specified value -/// and return it as type Ty without inserting any new casts and without -/// changing the value of the common low bits.  This is used by code that tries -/// to promote integer operations to a wider types will allow us to eliminate -/// the extension. +/// Return true if we can take the specified value and return it as type Ty +/// without inserting any new casts and without changing the value of the common +/// low bits.  This is used by code that tries to promote integer operations to +/// a wider types will allow us to eliminate the extension.  ///  /// This function works on both vectors and scalars.  /// -static bool CanEvaluateSExtd(Value *V, Type *Ty) { +static bool canEvaluateSExtd(Value *V, Type *Ty) {    assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&           "Can't sign extend type to a smaller type");    // If this is a constant, it can be trivially promoted. @@ -1023,15 +1087,15 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {    case Instruction::Sub:    case Instruction::Mul:      // These operators can all arbitrarily be extended if their inputs can. 
-    return CanEvaluateSExtd(I->getOperand(0), Ty) && -           CanEvaluateSExtd(I->getOperand(1), Ty); +    return canEvaluateSExtd(I->getOperand(0), Ty) && +           canEvaluateSExtd(I->getOperand(1), Ty);    //case Instruction::Shl:   TODO    //case Instruction::LShr:  TODO    case Instruction::Select: -    return CanEvaluateSExtd(I->getOperand(1), Ty) && -           CanEvaluateSExtd(I->getOperand(2), Ty); +    return canEvaluateSExtd(I->getOperand(1), Ty) && +           canEvaluateSExtd(I->getOperand(2), Ty);    case Instruction::PHI: {      // We can change a phi if we can change all operands.  Note that we never @@ -1039,7 +1103,7 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {      // instructions with a single use.      PHINode *PN = cast<PHINode>(I);      for (Value *IncValue : PN->incoming_values()) -      if (!CanEvaluateSExtd(IncValue, Ty)) return false; +      if (!canEvaluateSExtd(IncValue, Ty)) return false;      return true;    }    default: @@ -1081,10 +1145,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {    // expression tree to something weird like i93 unless the source is also    // strange.    if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && -      CanEvaluateSExtd(Src, DestTy)) { +      canEvaluateSExtd(Src, DestTy)) {      // Okay, we can transform this!  Insert the new expression now.      DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" -          " to avoid sign extend: " << CI); +          " to avoid sign extend: " << CI << '\n');      Value *Res = EvaluateInDifferentType(Src, DestTy, true);      assert(Res->getType() == DestTy); @@ -1149,9 +1213,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {  } -/// FitsInFPType - Return a Constant* for the specified FP constant if it fits +/// Return a Constant* for the specified floating-point constant if it fits  /// in the specified FP type without changing its value. -static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {    bool losesInfo;    APFloat F = CFP->getValueAPF();    (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); @@ -1160,12 +1224,12 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {    return nullptr;  } -/// LookThroughFPExtensions - If this is an fp extension instruction, look +/// If this is a floating-point extension instruction, look  /// through it until we get the source value. -static Value *LookThroughFPExtensions(Value *V) { +static Value *lookThroughFPExtensions(Value *V) {    if (Instruction *I = dyn_cast<Instruction>(V))      if (I->getOpcode() == Instruction::FPExt) -      return LookThroughFPExtensions(I->getOperand(0)); +      return lookThroughFPExtensions(I->getOperand(0));    // If this value is a constant, return the constant in the smallest FP type    // that can accurately represent it.  This allows us to turn @@ -1174,14 +1238,14 @@ static Value *LookThroughFPExtensions(Value *V) {      if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))        return V;  // No constant folding of this.      // See if the value can be truncated to half and then reextended. -    if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf)) +    if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf))        return V;      // See if the value can be truncated to float and then reextended. 
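// Standalone C++ illustration (host doubles/floats, not APFloat) of the test
// fitsInFPType performs: round-trip the constant through the narrower format
// and keep it only if no information is lost.  That is what lets a constant
// double operand participate in a float-width operation when an fptrunc of
// the result is all the caller needs.
#include <cassert>

static bool fitsInFloat(double D, float &Out) {
  float F = static_cast<float>(D);           // convert to the narrower type
  bool LosesInfo = static_cast<double>(F) != D;
  if (!LosesInfo)
    Out = F;
  return !LosesInfo;
}

int main() {
  float F;
  assert(fitsInFloat(2.0, F));                 // exactly representable
  assert(!fitsInFloat(0.1, F));                // 0.1 is altered by narrowing
  assert(!fitsInFloat(1.0000000000000002, F)); // 1 + 2^-52 rounds to 1.0f
  return 0;
}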
-    if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle)) +    if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle))        return V;      if (CFP->getType()->isDoubleTy())        return V;  // Won't shrink. -    if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble)) +    if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble))        return V;      // Don't try to shrink to various long double types.    } @@ -1193,7 +1257,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {    if (Instruction *I = commonCastTransforms(CI))      return I;    // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to -  // simpilify this expression to avoid one or more of the trunc/extend +  // simplify this expression to avoid one or more of the trunc/extend    // operations if we can do so without changing the numerical results.    //    // The exact manner in which the widths of the operands interact to limit @@ -1201,8 +1265,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {    // is explained below in the various case statements.    BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));    if (OpI && OpI->hasOneUse()) { -    Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); -    Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); +    Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0)); +    Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1));      unsigned OpWidth = OpI->getType()->getFPMantissaWidth();      unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth();      unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); @@ -1307,10 +1371,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {    // (fptrunc (select cond, R1, Cst)) -->    // (select cond, (fptrunc R1), (fptrunc Cst)) +  // +  //  - but only if this isn't part of a min/max operation, else we'll +  // ruin min/max canonical form which is to have the select and +  // compare's operands be of the same type with no casts to look through. +  Value *LHS, *RHS;    SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));    if (SI &&        (isa<ConstantFP>(SI->getOperand(1)) || -       isa<ConstantFP>(SI->getOperand(2)))) { +       isa<ConstantFP>(SI->getOperand(2))) && +      matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) {      Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1),                                               CI.getType());      Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), @@ -1327,9 +1397,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {          Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),                                                     CI.getType());          Type *IntrinsicType[] = { CI.getType() }; -        Function *Overload = -          Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), -                                    II->getIntrinsicID(), IntrinsicType); +        Function *Overload = Intrinsic::getDeclaration( +            CI.getModule(), II->getIntrinsicID(), IntrinsicType);          Value *Args[] = { InnerTrunc };          return CallInst::Create(Overload, Args, II->getName()); @@ -1483,12 +1552,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {    return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);  } -/// OptimizeVectorResize - This input value (which is known to have vector type) -/// is being zero extended or truncated to the specified vector type.  
Try to -/// replace it with a shuffle (and vector/vector bitcast) if possible. +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.  ///  /// The source and destination vector types may have different element types. -static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy,                                           InstCombiner &IC) {    // We can only do this optimization if the output is a multiple of the input    // element size, or the input is a multiple of the output element size. @@ -1548,8 +1617,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {    return Value / Ty->getPrimitiveSizeInBits();  } -/// CollectInsertionElements - V is a value which is inserted into a vector of -/// VecEltTy.  Look through the value to see if we can decompose it into +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into  /// insertions into the vector.  See the example in the comment for  /// OptimizeIntegerToVectorInsertions for the pattern this handles.  /// The type of V is always a non-zero multiple of VecEltTy's size. @@ -1558,7 +1627,7 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {  ///  /// This returns false if the pattern can't be matched or true if it can,  /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned Shift, +static bool collectInsertionElements(Value *V, unsigned Shift,                                       SmallVectorImpl<Value *> &Elements,                                       Type *VecEltTy, bool isBigEndian) {    assert(isMultipleOfTypeSize(Shift, VecEltTy) && @@ -1595,7 +1664,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,      // If the constant is the size of a vector element, we just need to bitcast      // it to the right type so it gets properly inserted.      if (NumElts == 1) -      return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), +      return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),                                        Shift, Elements, VecEltTy, isBigEndian);      // Okay, this is a constant that covers multiple elements.  Slice it up into @@ -1611,7 +1680,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,        Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),                                                                    ShiftI));        Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); -      if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, +      if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,                                      isBigEndian))          return false;      } @@ -1625,19 +1694,19 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,    switch (I->getOpcode()) {    default: return false; // Unhandled case.    
case Instruction::BitCast: -    return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, +    return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,                                      isBigEndian);    case Instruction::ZExt:      if (!isMultipleOfTypeSize(                            I->getOperand(0)->getType()->getPrimitiveSizeInBits(),                                VecEltTy))        return false; -    return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, +    return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,                                      isBigEndian);    case Instruction::Or: -    return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, +    return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,                                      isBigEndian) && -           CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, +           collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,                                      isBigEndian);    case Instruction::Shl: {      // Must be shifting by a constant that is a multiple of the element size. @@ -1645,7 +1714,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,      if (!CI) return false;      Shift += CI->getZExtValue();      if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; -    return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, +    return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,                                      isBigEndian);    } @@ -1653,8 +1722,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,  } -/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we -/// may be doing shifts and ors to assemble the elements of the vector manually. +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually.  /// Try to rip the code out and replace it with insertelements.  This is to  /// optimize code like this:  /// @@ -1667,13 +1736,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,  ///    %tmp43 = bitcast i64 %ins35 to <2 x float>  ///  /// Into two insertelements that do "buildvector{%inc, %inc5}". -static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,                                                  InstCombiner &IC) {    VectorType *DestVecTy = cast<VectorType>(CI.getType());    Value *IntInput = CI.getOperand(0);    SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); -  if (!CollectInsertionElements(IntInput, 0, Elements, +  if (!collectInsertionElements(IntInput, 0, Elements,                                  DestVecTy->getElementType(),                                  IC.getDataLayout().isBigEndian()))      return nullptr; @@ -1692,63 +1761,29 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,    return Result;  } - -/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double -/// bitcast.  The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. 
The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, +                                              InstCombiner &IC,                                                const DataLayout &DL) { -  Value *Src = CI.getOperand(0); -  Type *DestTy = CI.getType(); - -  // If this is a bitcast from int to float, check to see if the int is an -  // extraction from a vector. -  Value *VecInput = nullptr; -  // bitcast(trunc(bitcast(somevector))) -  if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && -      isa<VectorType>(VecInput->getType())) { -    VectorType *VecTy = cast<VectorType>(VecInput->getType()); -    unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - -    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { -      // If the element type of the vector doesn't match the result type, -      // bitcast it to be a vector type we can extract from. -      if (VecTy->getElementType() != DestTy) { -        VecTy = VectorType::get(DestTy, -                                VecTy->getPrimitiveSizeInBits() / DestWidth); -        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); -      } - -      unsigned Elt = 0; -      if (DL.isBigEndian()) -        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; -      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); -    } -  } +  // TODO: Create and use a pattern matcher for ExtractElementInst. +  auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); +  if (!ExtElt || !ExtElt->hasOneUse()) +    return nullptr; -  // bitcast(trunc(lshr(bitcast(somevector), cst)) -  ConstantInt *ShAmt = nullptr; -  if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), -                                m_ConstantInt(ShAmt)))) && -      isa<VectorType>(VecInput->getType())) { -    VectorType *VecTy = cast<VectorType>(VecInput->getType()); -    unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); -    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && -        ShAmt->getZExtValue() % DestWidth == 0) { -      // If the element type of the vector doesn't match the result type, -      // bitcast it to be a vector type we can extract from. -      if (VecTy->getElementType() != DestTy) { -        VecTy = VectorType::get(DestTy, -                                VecTy->getPrimitiveSizeInBits() / DestWidth); -        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); -      } +  // The bitcast must be to a vectorizable type, otherwise we can't make a new +  // type to extract from. 
+  Type *DestType = BitCast.getType(); +  if (!VectorType::isValidElementType(DestType)) +    return nullptr; -      unsigned Elt = ShAmt->getZExtValue() / DestWidth; -      if (DL.isBigEndian()) -        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; -      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); -    } -  } -  return nullptr; +  unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); +  auto *NewVecType = VectorType::get(DestType, NumElts); +  auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(), +                                          NewVecType, "bc"); +  return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());  }  Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1794,11 +1829,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {      }    } -  // Try to optimize int -> float bitcasts. -  if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) -    if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) -      return I; -    if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {      if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {        Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); @@ -1815,7 +1845,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {          CastInst *SrcCast = cast<CastInst>(Src);          if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))            if (isa<VectorType>(BCIn->getOperand(0)->getType())) -            if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), +            if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0),                                                 cast<VectorType>(DestTy), *this))                return I;        } @@ -1823,7 +1853,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {        // If the input is an 'or' instruction, we may be doing shifts and ors to        // assemble the elements of the vector manually.  Try to rip the code out        // and replace it with insertelements. -      if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) +      if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))          return ReplaceInstUsesWith(CI, V);      }    } @@ -1872,6 +1902,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {      }    } +  if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL)) +    return I; +    if (SrcTy->isPointerTy())      return commonPointerCastTransforms(CI);    return commonCastTransforms(CI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 95bba3c7af7d..c0786afe965e 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -216,8 +216,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,    Max = KnownOne|UnknownBits;  } - -  /// FoldCmpLoadFromIndexedGlobal - Called we see this pattern:  ///   cmp pred (load (gep GV, ...)), cmpcst  /// where GV is a global variable with a constant initializer.  Try to simplify @@ -371,7 +369,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,        }      } -      // If this element is in range, update our magic bitvector.      
if (i < 64 && IsTrueForElt)        MagicBitvector |= 1ULL << i; @@ -469,7 +466,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,      return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);    } -    // If a magic bitvector captures the entire comparison state    // of this load, replace it with computation that does:    //   ((magic_cst >> i) & 1) != 0 @@ -496,7 +492,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,    return nullptr;  } -  /// EvaluateGEPOffsetExpression - Return a value that can be used to compare  /// the *offset* implied by a GEP to zero.  For example, if we have &A[i], we  /// want to return 'i' for "icmp ne i, 0".  Note that, in general, indices can @@ -562,8 +557,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,      }    } - -    // Okay, we know we have a single variable index, which must be a    // pointer/array/vector index.  If there is no offset, life is simple, return    // the index. @@ -737,6 +730,83 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,    return nullptr;  } +Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, +                                         Value *Other) { +  assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + +  // It would be tempting to fold away comparisons between allocas and any +  // pointer not based on that alloca (e.g. an argument). However, even +  // though such pointers cannot alias, they can still compare equal. +  // +  // But LLVM doesn't specify where allocas get their memory, so if the alloca +  // doesn't escape we can argue that it's impossible to guess its value, and we +  // can therefore act as if any such guesses are wrong. +  // +  // The code below checks that the alloca doesn't escape, and that it's only +  // used in a comparison once (the current instruction). The +  // single-comparison-use condition ensures that we're trivially folding all +  // comparisons against the alloca consistently, and avoids the risk of +  // erroneously folding a comparison of the pointer with itself. + +  unsigned MaxIter = 32; // Break cycles and bound to constant-time. + +  SmallVector<Use *, 32> Worklist; +  for (Use &U : Alloca->uses()) { +    if (Worklist.size() >= MaxIter) +      return nullptr; +    Worklist.push_back(&U); +  } + +  unsigned NumCmps = 0; +  while (!Worklist.empty()) { +    assert(Worklist.size() <= MaxIter); +    Use *U = Worklist.pop_back_val(); +    Value *V = U->getUser(); +    --MaxIter; + +    if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || +        isa<SelectInst>(V)) { +      // Track the uses. +    } else if (isa<LoadInst>(V)) { +      // Loading from the pointer doesn't escape it. +      continue; +    } else if (auto *SI = dyn_cast<StoreInst>(V)) { +      // Storing *to* the pointer is fine, but storing the pointer escapes it. +      if (SI->getValueOperand() == U->get()) +        return nullptr; +      continue; +    } else if (isa<ICmpInst>(V)) { +      if (NumCmps++) +        return nullptr; // Found more than one cmp. +      continue; +    } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) { +      switch (Intrin->getIntrinsicID()) { +        // These intrinsics don't escape or compare the pointer. Memset is safe +        // because we don't allow ptrtoint. Memcpy and memmove are safe because +        // we don't allow stores, so src cannot point to V. 
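+        // (Anything else that takes the pointer -- a ptrtoint, or an argument
+        // to an arbitrary non-intrinsic call -- is not matched above and hits
+        // one of the conservative 'return nullptr' paths, so the fold gives up.)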
+        case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: +        case Intrinsic::dbg_declare: case Intrinsic::dbg_value: +        case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: +          continue; +        default: +          return nullptr; +      } +    } else { +      return nullptr; +    } +    for (Use &U : V->uses()) { +      if (Worklist.size() >= MaxIter) +        return nullptr; +      Worklist.push_back(&U); +    } +  } + +  Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); +  return ReplaceInstUsesWith( +      ICI, +      ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} +  /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".  Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,                                              Value *X, ConstantInt *CI, @@ -851,7 +921,6 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,        // to the same result value.        HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false);      } -    } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.      if (CmpRHSV == 0) {       // (X / pos) op 0        // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2) @@ -996,7 +1065,6 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,      return Res;    } -    // If we are comparing against bits always shifted out, the    // comparison cannot succeed.    APInt Comp = CmpRHSV << ShAmtVal; @@ -1074,18 +1142,22 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,    if (AP1 == AP2)      return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); -  // Get the distance between the highest bit that's set.    int Shift; -  // Both the constants are negative, take their positive to calculate log.    if (IsAShr && AP1.isNegative()) -    // Get the ones' complement of AP2 and AP1 when computing the distance. -    Shift = (~AP2).logBase2() - (~AP1).logBase2(); +    Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();    else -    Shift = AP2.logBase2() - AP1.logBase2(); +    Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();    if (Shift > 0) { -    if (IsAShr ? AP1 == AP2.ashr(Shift) : AP1 == AP2.lshr(Shift)) +    if (IsAShr && AP1 == AP2.ashr(Shift)) { +      // There are multiple solutions if we are comparing against -1 and the LHS +      // of the ashr is not a power of two. +      if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) +        return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); +      return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); +    } else if (AP1 == AP2.lshr(Shift)) {        return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); +    }    }    // Shifting const2 will never be equal to const1.    
return getConstant(false); @@ -1145,6 +1217,14 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,    switch (LHSI->getOpcode()) {    case Instruction::Trunc: +    if (RHS->isOne() && RHSV.getBitWidth() > 1) { +      // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 +      Value *V = nullptr; +      if (ICI.getPredicate() == ICmpInst::ICMP_SLT && +          match(LHSI->getOperand(0), m_Signum(m_Value(V)))) +        return new ICmpInst(ICmpInst::ICMP_SLT, V, +                            ConstantInt::get(V->getType(), 1)); +    }      if (ICI.isEquality() && LHSI->hasOneUse()) {        // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all        // of the high bits truncated out of x are known. @@ -1447,9 +1527,35 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,            ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT                                                    : ICmpInst::ICMP_ULE,            LHSI->getOperand(0), SubOne(RHS)); + +    // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) +    //   iff C is a power of 2 +    if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { +      if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { +        const APInt &AI = CI->getValue(); +        int32_t ExactLogBase2 = AI.exactLogBase2(); +        if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { +          Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); +          Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); +          return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ +                                  ? ICmpInst::ICMP_SGE +                                  : ICmpInst::ICMP_SLT, +                              Trunc, Constant::getNullValue(NTy)); +        } +      } +    }      break;    case Instruction::Or: { +    if (RHS->isOne()) { +      // icmp slt signum(V) 1 --> icmp slt V, 1 +      Value *V = nullptr; +      if (ICI.getPredicate() == ICmpInst::ICMP_SLT && +          match(LHSI, m_Signum(m_Value(V)))) +        return new ICmpInst(ICmpInst::ICMP_SLT, V, +                            ConstantInt::get(V->getType(), 1)); +    } +      if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse())        break;      Value *P, *Q; @@ -2083,11 +2189,9 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,    // If the pattern matches, truncate the inputs to the narrower type and    // use the sadd_with_overflow intrinsic to efficiently compute both the    // result and the overflow bit. -  Module *M = I.getParent()->getParent()->getParent(); -    Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); -  Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, -                                       NewType); +  Value *F = Intrinsic::getDeclaration(I.getModule(), +                                       Intrinsic::sadd_with_overflow, NewType);    InstCombiner::BuilderTy *Builder = IC.Builder; @@ -2123,6 +2227,12 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,      return true;    }; +  // If the overflow check was an add followed by a compare, the insertion point +  // may be pointing to the compare.  We want to insert the new instructions +  // before the add in case there are uses of the add between the add and the +  // compare. 
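+  //
+  // An illustrative sketch (value names are made up):
+  //   %add = add i32 %a, %b
+  //   %use = mul i32 %add, 3
+  //   %cmp = icmp ult i32 %add, %a
+  // The replacement instructions must be created before %add so that %use can
+  // still be rewritten to use them.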
+  Builder->SetInsertPoint(&OrigI); +    switch (OCF) {    case OCF_INVALID:      llvm_unreachable("bad overflow check kind!"); @@ -2223,7 +2333,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,    assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);    assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); -  Instruction *MulInstr = cast<Instruction>(MulVal); +  auto *MulInstr = dyn_cast<Instruction>(MulVal); +  if (!MulInstr) +    return nullptr;    assert(MulInstr->getOpcode() == Instruction::Mul);    auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), @@ -2357,7 +2469,6 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,    InstCombiner::BuilderTy *Builder = IC.Builder;    Builder->SetInsertPoint(MulInstr); -  Module *M = I.getParent()->getParent()->getParent();    // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)    Value *MulA = A, *MulB = B; @@ -2365,8 +2476,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,      MulA = Builder->CreateZExt(A, MulType);    if (WidthB < MulWidth)      MulB = Builder->CreateZExt(B, MulType); -  Value *F = -      Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); +  Value *F = Intrinsic::getDeclaration(I.getModule(), +                                       Intrinsic::umul_with_overflow, MulType);    CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul");    IC.Worklist.Add(MulInstr); @@ -2468,7 +2579,6 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,    default:      return APInt::getAllOnesValue(BitWidth);    } -  }  /// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst @@ -2905,7 +3015,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {                                ConstantInt::get(X->getType(),                                                 CI->countTrailingZeros()));        } -        break;      }      case ICmpInst::ICMP_NE: { @@ -2950,7 +3059,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {                                ConstantInt::get(X->getType(),                                                 CI->countTrailingZeros()));        } -        break;      }      case ICmpInst::ICMP_ULT: @@ -3103,7 +3211,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {          // comparison into the select arms, which will cause one to be          // constant folded and the select turned into a bitwise or.          Value *Op1 = nullptr, *Op2 = nullptr; -        ConstantInt *CI = 0; +        ConstantInt *CI = nullptr;          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {            Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);            CI = dyn_cast<ConstantInt>(Op1); @@ -3177,6 +3285,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {                             ICmpInst::getSwappedPredicate(I.getPredicate()), I))        return NI; +  // Try to optimize equality comparisons against alloca-based pointers. 
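+  // For example (names are illustrative), given an alloca that never escapes:
+  //   %a = alloca i32
+  //   %c = icmp eq i32* %a, %p
+  // the compare folds to false (and 'icmp ne' folds to true), because the
+  // alloca's address is unobservable, so any guess about it may be treated as
+  // wrong.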
+  if (Op0->getType()->isPointerTy() && I.isEquality()) { +    assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); +    if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) +      if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1)) +        return New; +    if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) +      if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0)) +        return New; +  } +    // Test to see if the operands of the icmp are casted versions of other    // values.  If the ptr->ptr cast can be stripped off both arguments, we do so    // now. @@ -3304,6 +3423,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {          match(B, m_One()))        return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); +    // icmp sgt X, (Y + -1) -> icmp sge X, Y +    if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && +        match(D, m_AllOnes())) +      return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + +    // icmp sle X, (Y + -1) -> icmp slt X, Y +    if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && +        match(D, m_AllOnes())) +      return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + +    // icmp sge X, (Y + 1) -> icmp sgt X, Y +    if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && +        match(D, m_One())) +      return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + +    // icmp slt X, (Y + 1) -> icmp sle X, Y +    if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && +        match(D, m_One())) +      return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); +      // if C1 has greater magnitude than C2:      //  icmp (X + C1), (Y + C2) -> icmp (X + C3), Y      //  s.t. C3 = C1 - C2 @@ -3473,6 +3612,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {        }        }      } + +    if (BO0) { +      // Transform  A & (L - 1) `ult` L --> L != 0 +      auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); +      auto BitwiseAnd = +          m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + +      if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { +        auto *Zero = Constant::getNullValue(BO0->getType()); +        return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); +      } +    }    }    { Value *A, *B; @@ -3697,15 +3848,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,    IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); -  // Check to see that the input is converted from an integer type that is small -  // enough that preserves all bits.  TODO: check here for "known" sign bits. -  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. -  unsigned InputSize = IntTy->getScalarSizeInBits(); - -  // If this is a uitofp instruction, we need an extra bit to hold the sign.    bool LHSUnsigned = isa<UIToFPInst>(LHSI); -  if (LHSUnsigned) -    ++InputSize;    if (I.isEquality()) {      FCmpInst::Predicate P = I.getPredicate(); @@ -3732,13 +3875,30 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,      // equality compares as integer?    } -  // Comparisons with zero are a special case where we know we won't lose -  // information. -  bool IsCmpZero = RHS.isPosZero(); +  // Check to see that the input is converted from an integer type that is small +  // enough that preserves all bits.  TODO: check here for "known" sign bits. +  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. 
+  unsigned InputSize = IntTy->getScalarSizeInBits(); -  // If the conversion would lose info, don't hack on this. -  if ((int)InputSize > MantissaWidth && !IsCmpZero) -    return nullptr; +  // Following test does NOT adjust InputSize downwards for signed inputs,  +  // because the most negative value still requires all the mantissa bits  +  // to distinguish it from one less than that value. +  if ((int)InputSize > MantissaWidth) { +    // Conversion would lose accuracy. Check if loss can impact comparison. +    int Exp = ilogb(RHS); +    if (Exp == APFloat::IEK_Inf) { +      int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); +      if (MaxExponent < (int)InputSize - !LHSUnsigned)  +        // Conversion could create infinity. +        return nullptr; +    } else { +      // Note that if RHS is zero or NaN, then Exp is negative  +      // and first condition is trivially false. +      if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)  +        // Conversion could affect comparison. +        return nullptr; +    } +  }    // Otherwise, we can potentially simplify the comparison.  We know that it    // will always come through as an integer value and we know the constant is diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ac934f1bd85c..534f67008150 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -281,6 +281,7 @@ public:                                  ICmpInst::Predicate Pred);    Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,                             ICmpInst::Predicate Cond, Instruction &I); +  Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other);    Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,                                     BinaryOperator &I);    Instruction *commonCastTransforms(CastInst &CI); @@ -341,6 +342,7 @@ public:                                   const unsigned SIOpd);  private: +  bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;    bool ShouldChangeType(Type *From, Type *To) const;    Value *dyn_castNegVal(Value *V) const;    Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; @@ -360,6 +362,11 @@ private:    /// \brief Try to optimize a sequence of instructions checking if an operation    /// on LHS and RHS overflows.    /// +  /// If this overflow check is done via one of the overflow check intrinsics, +  /// then CtxI has to be the call instruction calling that intrinsic.  If this +  /// overflow check is done by arithmetic followed by a compare, then CtxI has +  /// to be the arithmetic instruction. +  ///    /// If a simplification is possible, stores the simplified result of the    /// operation in OperationResult and result of the overflow check in    /// OverflowResult, and return true.  
If no simplification is possible, @@ -393,7 +400,7 @@ public:      assert(New && !New->getParent() &&             "New instruction already inserted into a basic block!");      BasicBlock *BB = Old.getParent(); -    BB->getInstList().insert(&Old, New); // Insert inst +    BB->getInstList().insert(Old.getIterator(), New); // Insert inst      Worklist.Add(New);      return New;    } @@ -539,6 +546,7 @@ private:    Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);    Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);    Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); +  Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN);    Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,                          ConstantInt *AndRHS, BinaryOperator &TheAnd); @@ -548,7 +556,7 @@ private:    Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned,                           bool Inside);    Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); -  Instruction *MatchBSwap(BinaryOperator &I); +  Instruction *MatchBSwapOrBitReverse(BinaryOperator &I);    bool SimplifyStoreAtEndOfBlock(StoreInst &SI);    Instruction *SimplifyMemTransfer(MemIntrinsic *MI);    Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e3179dbeece8..47406b9a1632 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,6 +12,7 @@  //===----------------------------------------------------------------------===//  #include "InstCombineInternal.h" +#include "llvm/ADT/SmallString.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/Loads.h"  #include "llvm/IR/DataLayout.h" @@ -90,21 +91,23 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,          if (CS.isCallee(&U))            continue; +        unsigned DataOpNo = CS.getDataOperandNo(&U); +        bool IsArgOperand = CS.isArgOperand(&U); +          // Inalloca arguments are clobbered by the call. -        unsigned ArgNo = CS.getArgumentNo(&U); -        if (CS.isInAllocaArgument(ArgNo)) +        if (IsArgOperand && CS.isInAllocaArgument(DataOpNo))            return false;          // If this is a readonly/readnone call site, then we know it is just a          // load (but one that potentially returns the value itself), so we can          // ignore it if we know that the value isn't captured.          if (CS.onlyReadsMemory() && -            (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) +            (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo)))            continue;          // If this is being passed as a byval argument, the caller is making a          // copy, so it is only a read of the alloca. 
-        if (CS.isByValArgument(ArgNo)) +        if (IsArgOperand && CS.isByValArgument(DataOpNo))            continue;        } @@ -186,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {      // Scan to the end of the allocation instructions, to skip over a block of      // allocas if possible...also skip interleaved debug info      // -    BasicBlock::iterator It = New; +    BasicBlock::iterator It(New);      while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))        ++It; @@ -367,7 +370,13 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT                               MDB.createRange(NonNullInt, NullInt));        }        break; - +    case LLVMContext::MD_align: +    case LLVMContext::MD_dereferenceable: +    case LLVMContext::MD_dereferenceable_or_null: +      // These only directly apply if the new type is also a pointer. +      if (NewTy->isPointerTy()) +        NewLoad->setMetadata(ID, N); +      break;      case LLVMContext::MD_range:        // FIXME: It would be nice to propagate this in some way, but the type        // conversions make it hard. If the new type is a pointer, we could @@ -418,6 +427,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value      case LLVMContext::MD_invariant_load:      case LLVMContext::MD_nonnull:      case LLVMContext::MD_range: +    case LLVMContext::MD_align: +    case LLVMContext::MD_dereferenceable: +    case LLVMContext::MD_dereferenceable_or_null:        // These don't apply for stores.        break;      } @@ -511,16 +523,46 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {    if (!T->isAggregateType())      return nullptr; -  assert(LI.getAlignment() && "Alignement must be set at this point"); +  assert(LI.getAlignment() && "Alignment must be set at this point");    if (auto *ST = dyn_cast<StructType>(T)) {      // If the struct only have one element, we unpack. -    if (ST->getNumElements() == 1) { +    unsigned Count = ST->getNumElements(); +    if (Count == 1) {        LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),                                                 ".unpack");        return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue(          UndefValue::get(T), NewLoad, 0, LI.getName()));      } + +    // We don't want to break loads with padding here as we'd loose +    // the knowledge that padding exists for the rest of the pipeline. 
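+    //
+    // A sketch of the unpacking below, on a hypothetical padding-free struct:
+    //   %v = load { i32, i32 }, { i32, i32 }* %p
+    // becomes a load of each element through an inbounds GEP, reassembled with
+    // insertvalue, so later passes can reason about the scalar pieces.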
+    const DataLayout &DL = IC.getDataLayout(); +    auto *SL = DL.getStructLayout(ST); +    if (SL->hasPadding()) +      return nullptr; + +    auto Name = LI.getName(); +    SmallString<16> LoadName = Name; +    LoadName += ".unpack"; +    SmallString<16> EltName = Name; +    EltName += ".elt"; +    auto *Addr = LI.getPointerOperand(); +    Value *V = UndefValue::get(T); +    auto *IdxType = Type::getInt32Ty(ST->getContext()); +    auto *Zero = ConstantInt::get(IdxType, 0); +    for (unsigned i = 0; i < Count; i++) { +      Value *Indices[2] = { +        Zero, +        ConstantInt::get(IdxType, i), +      }; +      auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); +      auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); +      V = IC.Builder->CreateInsertValue(V, L, i); +    } + +    V->setName(Name); +    return IC.ReplaceInstUsesWith(LI, V);    }    if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -681,7 +723,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,    // FIXME: If the GEP is not inbounds, and there are extra indices after the    // one we'll replace, those could cause the address computation to wrap    // (rendering the IsAllNonNegative() check below insufficient). We can do -  // better, ignoring zero indicies (and other indicies we can prove small +  // better, ignoring zero indices (and other indices we can prove small    // enough not to wrap).    if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())      return false; @@ -748,19 +790,19 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {    // Do really simple store-to-load forwarding and load CSE, to catch cases    // where there are several consecutive memory accesses to the same location,    // separated by a few arithmetic operations. -  BasicBlock::iterator BBI = &LI; +  BasicBlock::iterator BBI(LI);    AAMDNodes AATags; -  if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, -                                                     6, AA, &AATags)) { +  if (Value *AvailableVal = +      FindAvailableLoadedValue(Op, LI.getParent(), BBI, +                               DefMaxInstsToScan, AA, &AATags)) {      if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) {        unsigned KnownIDs[] = { -        LLVMContext::MD_tbaa, -        LLVMContext::MD_alias_scope, -        LLVMContext::MD_noalias, -        LLVMContext::MD_range, -        LLVMContext::MD_invariant_load, -        LLVMContext::MD_nonnull, -      }; +          LLVMContext::MD_tbaa,            LLVMContext::MD_alias_scope, +          LLVMContext::MD_noalias,         LLVMContext::MD_range, +          LLVMContext::MD_invariant_load,  LLVMContext::MD_nonnull, +          LLVMContext::MD_invariant_group, LLVMContext::MD_align, +          LLVMContext::MD_dereferenceable, +          LLVMContext::MD_dereferenceable_or_null};        combineMetadata(NLI, &LI, KnownIDs);      }; @@ -822,7 +864,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {        }        // load (select (cond, null, P)) -> load P -      if (isa<ConstantPointerNull>(SI->getOperand(1)) &&  +      if (isa<ConstantPointerNull>(SI->getOperand(1)) &&            LI.getPointerAddressSpace() == 0) {          LI.setOperand(0, SI->getOperand(2));          return &LI; @@ -857,7 +899,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {  ///  /// \returns true if the store was successfully combined away. This indicates  /// the caller must erase the store instruction. 
We have to let the caller erase -/// the store instruction sas otherwise there is no way to signal whether it was +/// the store instruction as otherwise there is no way to signal whether it was  /// combined or not: IC.EraseInstFromFunction returns a null pointer.  static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {    // FIXME: We could probably with some care handle both volatile and atomic @@ -893,11 +935,38 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {    if (auto *ST = dyn_cast<StructType>(T)) {      // If the struct only have one element, we unpack. -    if (ST->getNumElements() == 1) { +    unsigned Count = ST->getNumElements(); +    if (Count == 1) {        V = IC.Builder->CreateExtractValue(V, 0);        combineStoreToNewValue(IC, SI, V);        return true;      } + +    // We don't want to break loads with padding here as we'd loose +    // the knowledge that padding exists for the rest of the pipeline. +    const DataLayout &DL = IC.getDataLayout(); +    auto *SL = DL.getStructLayout(ST); +    if (SL->hasPadding()) +      return false; + +    SmallString<16> EltName = V->getName(); +    EltName += ".elt"; +    auto *Addr = SI.getPointerOperand(); +    SmallString<16> AddrName = Addr->getName(); +    AddrName += ".repack"; +    auto *IdxType = Type::getInt32Ty(ST->getContext()); +    auto *Zero = ConstantInt::get(IdxType, 0); +    for (unsigned i = 0; i < Count; i++) { +      Value *Indices[2] = { +        Zero, +        ConstantInt::get(IdxType, i), +      }; +      auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName); +      auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); +      IC.Builder->CreateStore(Val, Ptr); +    } + +    return true;    }    if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -971,9 +1040,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {        return &SI;    } -  // Don't hack volatile/atomic stores. -  // FIXME: Some bits are legal for atomic stores; needs refactoring. -  if (!SI.isSimple()) return nullptr; +  // Don't hack volatile/ordered stores. +  // FIXME: Some bits are legal for ordered atomic stores; needs refactoring. +  if (!SI.isUnordered()) return nullptr;    // If the RHS is an alloca with a single use, zapify the store, making the    // alloca dead. @@ -991,7 +1060,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {    // Do really simple DSE, to catch cases where there are several consecutive    // stores to the same location, separated by a few arithmetic operations. This    // situation often occurs with bitfield accesses. -  BasicBlock::iterator BBI = &SI; +  BasicBlock::iterator BBI(SI);    for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;         --ScanInsts) {      --BBI; @@ -1005,7 +1074,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {      if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {        // Prev store isn't volatile, and stores to the same location? -      if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), +      if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),                                                          SI.getOperand(1))) {          ++NumDeadStore;          ++BBI; @@ -1019,9 +1088,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {      // the pointer we're loading and is producing the pointer we're storing,      // then *this* store is dead (X = load P; store X -> P).      
if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { -      if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && -          LI->isSimple()) +      if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { +        assert(SI.isUnordered() && "can't eliminate ordering operation");          return EraseInstFromFunction(SI); +      }        // Otherwise, this is a load from some other location.  Stores before it        // may not be dead. @@ -1047,10 +1117,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {    if (isa<UndefValue>(Val))      return EraseInstFromFunction(SI); +  // The code below needs to be audited and adjusted for unordered atomics +  if (!SI.isSimple()) +    return nullptr; +    // If this store is the last instruction in the basic block (possibly    // excepting debug info instructions), and if the block ends with an    // unconditional branch, try to move it to the successor block. -  BBI = &SI; +  BBI = SI.getIterator();    do {      ++BBI;    } while (isa<DbgInfoIntrinsic>(BBI) || @@ -1106,7 +1180,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {      return false;    // Verify that the other block ends in a branch and is not otherwise empty. -  BasicBlock::iterator BBI = OtherBB->getTerminator(); +  BasicBlock::iterator BBI(OtherBB->getTerminator());    BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);    if (!OtherBr || BBI == OtherBB->begin())      return false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a554e9f628e0..7ad0efc42fb4 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -22,9 +22,9 @@ using namespace PatternMatch;  #define DEBUG_TYPE "instcombine" -/// simplifyValueKnownNonZero - The specific integer value is used in a context -/// where it is known to be non-zero.  If this allows us to simplify the -/// computation, do so and return the new operand, otherwise return null. +/// The specific integer value is used in a context where it is known to be +/// non-zero.  If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null.  static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,                                          Instruction &CxtI) {    // If V has multiple uses, then we would have to do more analysis to determine @@ -76,8 +76,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,  } -/// MultiplyOverflows - True if the multiply can not be expressed in an int -/// this size. +/// True if the multiply can not be expressed in an int this size.  static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,                                bool IsSigned) {    bool Overflow; @@ -95,6 +94,14 @@ static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,    assert(C1.getBitWidth() == C2.getBitWidth() &&           "Inconsistent width of constants!"); +  // Bail if we will divide by zero. +  if (C2.isMinValue()) +    return false; + +  // Bail if we would divide INT_MIN by -1. +  if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue()) +    return false; +    APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);    if (IsSigned)      APInt::sdivrem(C1, C2, Quotient, Remainder); @@ -705,8 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {    return Changed ? 
&I : nullptr;  } -/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select -/// instruction. +/// Try to fold a divide or remainder of a select instruction.  bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {    SelectInst *SI = cast<SelectInst>(I.getOperand(1)); @@ -740,7 +746,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {      return true;    // Scan the current block backward, looking for other uses of SI. -  BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); +  BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();    while (BBI != BBFront) {      --BBI; @@ -754,10 +760,10 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {           I != E; ++I) {        if (*I == SI) {          *I = SI->getOperand(NonNullOperand); -        Worklist.Add(BBI); +        Worklist.Add(&*BBI);        } else if (*I == SelectCond) {          *I = Builder->getInt1(NonNullOperand == 1); -        Worklist.Add(BBI); +        Worklist.Add(&*BBI);        }      } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 460f6eb6a825..f1aa98b5e359 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -15,6 +15,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  #define DEBUG_TYPE "instcombine" @@ -245,7 +246,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {  /// non-address-taken alloca.  Doing so will cause us to not promote the alloca  /// to a register.  static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { -  BasicBlock::iterator BBI = L, E = L->getParent()->end(); +  BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();    for (++BBI; BBI != E; ++BBI)      if (BBI->mayWriteToMemory()) @@ -349,24 +350,40 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {    Value *InVal = FirstLI->getOperand(0);    NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); +  LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment); + +  unsigned KnownIDs[] = { +    LLVMContext::MD_tbaa, +    LLVMContext::MD_range, +    LLVMContext::MD_invariant_load, +    LLVMContext::MD_alias_scope, +    LLVMContext::MD_noalias, +    LLVMContext::MD_nonnull, +    LLVMContext::MD_align, +    LLVMContext::MD_dereferenceable, +    LLVMContext::MD_dereferenceable_or_null, +  }; -  // Add all operands to the new PHI. +  for (unsigned ID : KnownIDs) +    NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + +  // Add all operands to the new PHI and combine TBAA metadata.    for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { -    Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); +    LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); +    combineMetadata(NewLI, LI, KnownIDs); +    Value *NewInVal = LI->getOperand(0);      if (NewInVal != InVal)        InVal = nullptr;      NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));    } -  Value *PhiVal;    if (InVal) {      // The new PHI unions all of the same values together.  This is really      // common, so we handle it intelligently here for compile-time speed. 
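    // (In other words, every incoming load reads the same pointer, so the merged
    // load can use that pointer directly and the helper PHI node is discarded.)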
-    PhiVal = InVal; +    NewLI->setOperand(0, InVal);      delete NewPN;    } else {      InsertNewInstBefore(NewPN, PN); -    PhiVal = NewPN;    }    // If this was a volatile load that we are merging, make sure to loop through @@ -376,17 +393,94 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {      for (Value *IncValue : PN.incoming_values())        cast<LoadInst>(IncValue)->setVolatile(false); -  LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment);    NewLI->setDebugLoc(FirstLI->getDebugLoc());    return NewLI;  } +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. +Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { +  // We cannot create a new instruction after the PHI if the terminator is an +  // EHPad because there is no valid insertion point. +  if (TerminatorInst *TI = Phi.getParent()->getTerminator()) +    if (TI->isEHPad()) +      return nullptr; + +  // Early exit for the common case of a phi with two operands. These are +  // handled elsewhere. See the comment below where we check the count of zexts +  // and constants for more details. +  unsigned NumIncomingValues = Phi.getNumIncomingValues(); +  if (NumIncomingValues < 3) +    return nullptr; +  // Find the narrower type specified by the first zext. +  Type *NarrowType = nullptr; +  for (Value *V : Phi.incoming_values()) { +    if (auto *Zext = dyn_cast<ZExtInst>(V)) { +      NarrowType = Zext->getSrcTy(); +      break; +    } +  } +  if (!NarrowType) +    return nullptr; + +  // Walk the phi operands checking that we only have zexts or constants that +  // we can shrink for free. Store the new operands for the new phi. +  SmallVector<Value *, 4> NewIncoming; +  unsigned NumZexts = 0; +  unsigned NumConsts = 0; +  for (Value *V : Phi.incoming_values()) { +    if (auto *Zext = dyn_cast<ZExtInst>(V)) { +      // All zexts must be identical and have one use. +      if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) +        return nullptr; +      NewIncoming.push_back(Zext->getOperand(0)); +      NumZexts++; +    } else if (auto *C = dyn_cast<Constant>(V)) { +      // Make sure that constants can fit in the new type. +      Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); +      if (ConstantExpr::getZExt(Trunc, C->getType()) != C) +        return nullptr; +      NewIncoming.push_back(Trunc); +      NumConsts++; +    } else { +      // If it's not a cast or a constant, bail out. +      return nullptr; +    } +  } + +  // The more common cases of a phi with no constant operands or just one +  // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi() +  // respectively. FoldOpIntoPhi() wants to do the opposite transform that is +  // performed here. It tries to replicate a cast in the phi operand's basic +  // block to expose other folding opportunities. Thus, InstCombine will +  // infinite loop without this check. +  if (NumConsts == 0 || NumZexts < 2) +    return nullptr; + +  // All incoming values are zexts or constants that are safe to truncate. +  // Create a new phi node of the narrow type, phi together all of the new +  // operands, and zext the result back to the original type. 
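+  //
+  // A sketch of the shrink (incoming values are illustrative):
+  //   %p = phi i32 [ %xw, %bb1 ], [ %yw, %bb2 ], [ 7, %bb3 ]
+  // where %xw and %yw are 'zext i8 ... to i32', becomes
+  //   %p.shrunk = phi i8 [ %x, %bb1 ], [ %y, %bb2 ], [ 7, %bb3 ]
+  //   %p        = zext i8 %p.shrunk to i32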
+  PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, +                                    Phi.getName() + ".shrunk"); +  for (unsigned i = 0; i != NumIncomingValues; ++i) +    NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + +  InsertNewInstBefore(NewPhi, Phi); +  return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +}  /// If all operands to a PHI node are the same "unary" operator and they all are  /// only used by the PHI, PHI together their inputs, and do the operation once,  /// to the result of the PHI.  Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { +  // We cannot create a new instruction after the PHI if the terminator is an +  // EHPad because there is no valid insertion point. +  if (TerminatorInst *TI = PN.getParent()->getTerminator()) +    if (TI->isEHPad()) +      return nullptr; +    Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));    if (isa<GetElementPtrInst>(FirstInst)) @@ -740,7 +834,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {          }          // Otherwise, do an extract in the predecessor. -        Builder->SetInsertPoint(Pred, Pred->getTerminator()); +        Builder->SetInsertPoint(Pred->getTerminator());          Value *Res = InVal;          if (Offset)            Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(), @@ -787,6 +881,9 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {    if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC))      return ReplaceInstUsesWith(PN, V); +  if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) +    return Result; +    // If all PHI operands are the same operation, pull them through the PHI,    // reducing code size.    if (isa<Instruction>(PN.getIncomingValue(0)) && diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f51442a9f36d..776704d1efa9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,7 +38,8 @@ getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) {    }  } -static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { +static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF, +                                                   bool Ordered=false) {    switch (SPF) {    default:      llvm_unreachable("unhandled!"); @@ -51,17 +52,22 @@ static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) {      return ICmpInst::ICMP_SGT;    case SPF_UMAX:      return ICmpInst::ICMP_UGT; +  case SPF_FMINNUM: +    return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; +  case SPF_FMAXNUM: +    return Ordered ? 
FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT;    }  }  static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder,                                            SelectPatternFlavor SPF, Value *A,                                            Value *B) { -  CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); +  CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF); +  assert(CmpInst::isIntPredicate(Pred));    return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B);  } -/// GetSelectFoldableOperands - We want to turn code that looks like this: +/// We want to turn code that looks like this:  ///   %C = or %A, %B  ///   %D = select %cond, %C, %A  /// into: @@ -90,8 +96,8 @@ static unsigned GetSelectFoldableOperands(Instruction *I) {    }  } -/// GetSelectFoldableConstant - For the same transformation as the previous -/// function, return the identity constant that goes into the select. +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select.  static Constant *GetSelectFoldableConstant(Instruction *I) {    switch (I->getOpcode()) {    default: llvm_unreachable("This cannot happen!"); @@ -110,7 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) {    }  } -/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI +/// Here we have (select c, TI, FI), and we know that TI and FI  /// have the same opcode and only one use each.  Try to simplify this.  Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,                                            Instruction *FI) { @@ -197,8 +203,8 @@ static bool isSelect01(Constant *C1, Constant *C2) {           C2I->isOne() || C2I->isAllOnesValue();  } -/// FoldSelectIntoOp - Try fold the select into one of the operands to -/// facilitate further optimization. +/// Try to fold the select into one of the operands to allow further +/// optimization.  Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,                                              Value *FalseVal) {    // See the comment above GetSelectFoldableOperands for a description of the @@ -276,7 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,    return nullptr;  } -/// foldSelectICmpAndOr - We want to turn: +/// We want to turn:  ///   (select (icmp eq (and X, C1), 0), Y, (or Y, C2))  /// into:  ///   (or (shl (and X, C1), C3), y) @@ -394,9 +400,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,    return nullptr;  } -/// visitSelectInstWithICmp - Visit a SelectInst that has an -/// ICmpInst as its first operand. -/// +/// Visit a SelectInst that has an ICmpInst as its first operand.  Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,                                                     ICmpInst *ICI) {    bool Changed = false; @@ -595,10 +599,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,  } -/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a -/// PHI node (but the two may be in different blocks).  See if the true/false -/// values (V) are live in all of the predecessor blocks of the PHI.  For -/// example, cases like this cannot be mapped: +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. 
For example, cases like this can't be mapped:  ///  ///   X = phi [ C1, BB1], [C2, BB2]  ///   Y = add @@ -632,7 +635,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V,    return false;  } -/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// We have an SPF (e.g. a min or max) of an SPF of the form:  ///   SPF2(SPF1(A, B), C)  Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,                                          SelectPatternFlavor SPF1, @@ -745,10 +748,10 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,    return nullptr;  } -/// foldSelectICmpAnd - If one of the constants is zero (we know they can't -/// both be) and we have an icmp instruction with zero, and we have an 'and' -/// with the non-constant value and a power of two we can turn the select -/// into a shift on the result of the 'and'. +/// If one of the constants is zero (we know they can't both be) and we have an +/// icmp instruction with zero, and we have an 'and' with the non-constant value +/// and a power of two we can turn the select into a shift on the result of the +/// 'and'.  static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,                                  ConstantInt *FalseVal,                                  InstCombiner::BuilderTy *Builder) { @@ -926,6 +929,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {        // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X        if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {          FCmpInst::Predicate InvPred = FCI->getInversePredicate(); +        IRBuilder<>::FastMathFlagGuard FMFG(*Builder); +        Builder->SetFastMathFlags(FCI->getFastMathFlags());          Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal,                                               FCI->getName() + ".inv"); @@ -967,6 +972,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {        // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y        if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {          FCmpInst::Predicate InvPred = FCI->getInversePredicate(); +        IRBuilder<>::FastMathFlagGuard FMFG(*Builder); +        Builder->SetFastMathFlags(FCI->getFastMathFlags());          Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal,                                               FCI->getName() + ".inv"); @@ -1054,35 +1061,50 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {        }    // See if we can fold the select into one of our operands. -  if (SI.getType()->isIntOrIntVectorTy()) { +  if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) {      if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))        return FoldI;      Value *LHS, *RHS, *LHS2, *RHS2;      Instruction::CastOps CastOp; -    SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); +    SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); +    auto SPF = SPR.Flavor; -    if (SPF) { +    if (SelectPatternResult::isMinOrMax(SPF)) {        // Canonicalize so that type casts are outside select patterns.        
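        // An illustrative sketch (operand names are made up):
        //   select (icmp sgt i8 %a, %b), (sext %a to i32), (sext %b to i32)
        // becomes
        //   sext (select (icmp sgt i8 %a, %b), i8 %a, i8 %b) to i32
        // so the min/max itself is computed in the narrower type.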
if (LHS->getType()->getPrimitiveSizeInBits() !=            SI.getType()->getPrimitiveSizeInBits()) { -        CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); -        Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); +        CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered); + +        Value *Cmp; +        if (CmpInst::isIntPredicate(Pred)) { +          Cmp = Builder->CreateICmp(Pred, LHS, RHS); +        } else { +          IRBuilder<>::FastMathFlagGuard FMFG(*Builder); +          auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); +          Builder->SetFastMathFlags(FMF); +          Cmp = Builder->CreateFCmp(Pred, LHS, RHS); +        } +          Value *NewSI = Builder->CreateCast(CastOp,                                             Builder->CreateSelect(Cmp, LHS, RHS),                                             SI.getType());          return ReplaceInstUsesWith(SI, NewSI);        } +    } +    if (SPF) {        // MAX(MAX(a, b), a) -> MAX(a, b)        // MIN(MIN(a, b), a) -> MIN(a, b)        // MAX(MIN(a, b), a) -> a        // MIN(MAX(a, b), a) -> a -      if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) +      // ABS(ABS(a)) -> ABS(a) +      // NABS(NABS(a)) -> NABS(a) +      if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)          if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,                                            SI, SPF, RHS))            return R; -      if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) +      if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)          if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,                                            SI, SPF, LHS))            return R; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index d04ed58b014f..0c7defa5fff8 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -55,7 +55,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {    return nullptr;  } -/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// See if we can compute the specified value, but shifted  /// logically to the left or right by some number of bits.  This should return  /// true if the expression can be computed for the same cost as the current  /// expression tree.  This is used to eliminate extraneous shifting from things @@ -184,7 +184,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,    }  } -/// GetShiftedValue - When CanEvaluateShifted returned true for an expression, +/// When CanEvaluateShifted returned true for an expression,  /// this value inserts the new computation that produces the shifted value.  
static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,                                InstCombiner &IC, const DataLayout &DL) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 80628b23f111..743d51483ea1 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -410,9 +410,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,      // If this is a select as part of a min/max pattern, don't simplify any      // further in case we break the structure.      Value *LHS, *RHS; -    if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN) +    if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)        return nullptr; -       +      if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,                               RHSKnownOne, Depth + 1) ||          SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, @@ -1057,7 +1057,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts);      if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) {        for (unsigned i = 0; i < VWidth; i++) { -        if (CV->getAggregateElement(i)->isNullValue()) +        Constant *CElt = CV->getAggregateElement(i); +        // Method isNullValue always returns false when called on a +        // ConstantExpr. If CElt is a ConstantExpr then skip it in order to +        // to avoid propagating incorrect information. +        if (isa<ConstantExpr>(CElt)) +          continue; +        if (CElt->isNullValue())            LeftDemanded.clearBit(i);          else            RightDemanded.clearBit(i); @@ -1082,6 +1088,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      if (!VTy) break;      unsigned InVWidth = VTy->getNumElements();      APInt InputDemandedElts(InVWidth, 0); +    UndefElts2 = APInt(InVWidth, 0);      unsigned Ratio;      if (VWidth == InVWidth) { @@ -1089,29 +1096,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // elements as are demanded of us.        Ratio = 1;        InputDemandedElts = DemandedElts; -    } else if (VWidth > InVWidth) { -      // Untested so far. -      break; - -      // If there are more elements in the result than there are in the source, -      // then an input element is live if any of the corresponding output -      // elements are live. -      Ratio = VWidth/InVWidth; -      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { +    } else if ((VWidth % InVWidth) == 0) { +      // If the number of elements in the output is a multiple of the number of +      // elements in the input then an input element is live if any of the +      // corresponding output elements are live. +      Ratio = VWidth / InVWidth; +      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)          if (DemandedElts[OutIdx]) -          InputDemandedElts.setBit(OutIdx/Ratio); -      } -    } else { -      // Untested so far. -      break; - -      // If there are more elements in the source than there are in the result, -      // then an input element is live if the corresponding output element is -      // live. 
-      Ratio = InVWidth/VWidth; +          InputDemandedElts.setBit(OutIdx / Ratio); +    } else if ((InVWidth % VWidth) == 0) { +      // If the number of elements in the input is a multiple of the number of +      // elements in the output then an input element is live if the +      // corresponding output element is live. +      Ratio = InVWidth / VWidth;        for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) -        if (DemandedElts[InIdx/Ratio]) +        if (DemandedElts[InIdx / Ratio])            InputDemandedElts.setBit(InIdx); +    } else { +      // Unsupported so far. +      break;      }      // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1125,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        MadeChange = true;      } -    UndefElts = UndefElts2; -    if (VWidth > InVWidth) { -      llvm_unreachable("Unimp"); -      // If there are more elements in the result than there are in the source, -      // then an output element is undef if the corresponding input element is -      // undef. +    if (VWidth == InVWidth) { +      UndefElts = UndefElts2; +    } else if ((VWidth % InVWidth) == 0) { +      // If the number of elements in the output is a multiple of the number of +      // elements in the input then an output element is undef if the +      // corresponding input element is undef.        for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) -        if (UndefElts2[OutIdx/Ratio]) +        if (UndefElts2[OutIdx / Ratio]) +          UndefElts.setBit(OutIdx); +    } else if ((InVWidth % VWidth) == 0) { +      // If the number of elements in the input is a multiple of the number of +      // elements in the output then an output element is undef if all of the +      // corresponding input elements are undef. +      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { +        APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); +        if (SubUndef.countPopulation() == Ratio)            UndefElts.setBit(OutIdx); -    } else if (VWidth < InVWidth) { +      } +    } else {        llvm_unreachable("Unimp"); -      // If there are more elements in the source than there are in the result, -      // then a result element is undef if all of the corresponding input -      // elements are undef. -      UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef. -      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) -        if (!UndefElts2[InIdx])            // Not undef? -          UndefElts.clearBit(InIdx/Ratio);    // Clear undef bit.      }      break;    } @@ -1237,6 +1242,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,        // like undef&0.  The result is known zero, not undef.        UndefElts &= UndefElts2;        break; + +    // SSE4A instructions leave the upper 64-bits of the 128-bit result +    // in an undefined state. 
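    // An illustrative sketch (the IR below is assumed for this note, not taken
    // from the patch):
    //   %r = call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %v, <16 x i8> %ctl)
    // Only element 0 of %r carries a defined value, so the cases below report
    // the high half (element 1 of the <2 x i64> result) as undef to the
    // demanded-elements logic.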
+    case Intrinsic::x86_sse4a_extrq: +    case Intrinsic::x86_sse4a_extrqi: +    case Intrinsic::x86_sse4a_insertq: +    case Intrinsic::x86_sse4a_insertqi: +      UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); +      break;      }      break;    } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 273047279e90..e25639ae943b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -22,10 +22,10 @@ using namespace PatternMatch;  #define DEBUG_TYPE "instcombine" -/// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation.  isConstant indicates whether we're -/// extracting one known element.  If false we're extracting a variable index. -static bool CheapToScalarize(Value *V, bool isConstant) { +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. isConstant indicates whether we're extracting one known +/// element. If false we're extracting a variable index. +static bool cheapToScalarize(Value *V, bool isConstant) {    if (Constant *C = dyn_cast<Constant>(V)) {      if (isConstant) return true; @@ -50,13 +50,13 @@ static bool CheapToScalarize(Value *V, bool isConstant) {      return true;    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))      if (BO->hasOneUse() && -        (CheapToScalarize(BO->getOperand(0), isConstant) || -         CheapToScalarize(BO->getOperand(1), isConstant))) +        (cheapToScalarize(BO->getOperand(0), isConstant) || +         cheapToScalarize(BO->getOperand(1), isConstant)))        return true;    if (CmpInst *CI = dyn_cast<CmpInst>(I))      if (CI->hasOneUse() && -        (CheapToScalarize(CI->getOperand(0), isConstant) || -         CheapToScalarize(CI->getOperand(1), isConstant))) +        (cheapToScalarize(CI->getOperand(0), isConstant) || +         cheapToScalarize(CI->getOperand(1), isConstant)))        return true;    return false; @@ -82,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {    // and that it is a binary operation which is cheap to scalarize.    // otherwise return NULL.    if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || -      !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) +      !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))      return nullptr;    // Create a scalar PHI node that will replace the vector PHI node @@ -115,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {        Instruction *pos = dyn_cast<Instruction>(PHIInVal);        BasicBlock::iterator InsertPos;        if (pos && !isa<PHINode>(pos)) { -        InsertPos = pos; -        ++InsertPos; +        InsertPos = ++pos->getIterator();        } else {          InsertPos = inBB->getFirstInsertionPt();        } @@ -137,7 +136,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {    // If vector val is constant with all elements the same, replace EI with    // that element.  We handle a known element # below.    
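  // For example (a sketch with assumed IR):
  //   %e = extractelement <4 x i32> zeroinitializer, i32 %idx
  // every element is the same constant, so %e can be replaced by i32 0 no
  // matter what %idx turns out to be.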
if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) -    if (CheapToScalarize(C, false)) +    if (cheapToScalarize(C, false))        return ReplaceInstUsesWith(EI, C->getAggregateElement(0U));    // If extracting a specified index from the vector, see if we can recursively @@ -163,7 +162,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {        }      } -    // If the this extractelement is directly using a bitcast from a vector of +    // If this extractelement is directly using a bitcast from a vector of      // the same number of elements, see if we can find the source element from      // it.  In this case, we will end up needing to bitcast the scalars.      if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { @@ -184,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {    if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {      // Push extractelement into predecessor operation if legal and -    // profitable to do so +    // profitable to do so.      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {        if (I->hasOneUse() && -          CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { +          cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {          Value *newEI0 =            Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1),                                          EI.getName()+".lhs"); @@ -230,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {                                                             SrcIdx, false));        }      } else if (CastInst *CI = dyn_cast<CastInst>(I)) { -      // Canonicalize extractelement(cast) -> cast(extractelement) -      // bitcasts can change the number of vector elements and they cost nothing +      // Canonicalize extractelement(cast) -> cast(extractelement). +      // Bitcasts can change the number of vector elements, and they cost +      // nothing.        if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {          Value *EE = Builder->CreateExtractElement(CI->getOperand(0),                                                    EI.getIndexOperand()); @@ -245,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {          // fight the vectorizer.          // If we are extracting an element from a vector select or a select on -        // vectors, a select on the scalars extracted from the vector arguments. +        // vectors, create a select on the scalars extracted from the vector +        // arguments.          Value *TrueVal = SI->getTrueValue();          Value *FalseVal = SI->getFalseValue(); @@ -275,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {    return nullptr;  } -/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns -/// elements from either LHS or RHS, return the shuffle mask and true. -/// Otherwise, return false. -static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. 
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,                                           SmallVectorImpl<Constant*> &Mask) {    assert(LHS->getType() == RHS->getType() &&           "Invalid CollectSingleShuffleElements"); @@ -315,7 +315,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,      if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.        // We can handle this if the vector we are inserting into is        // transitively ok. -      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { +      if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {          // If so, update the mask to reflect the inserted undef.          Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));          return true; @@ -330,7 +330,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,          if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {            // We can handle this if the vector we are inserting into is            // transitively ok. -          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { +          if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {              // If so, update the mask to reflect the inserted value.              if (EI->getOperand(0) == LHS) {                Mask[InsertedIdx % NumElts] = @@ -352,6 +352,48 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,    return false;  } +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, +                                   ExtractElementInst *ExtElt, +                                   InstCombiner &IC) { +  VectorType *InsVecType = InsElt->getType(); +  VectorType *ExtVecType = ExtElt->getVectorOperandType(); +  unsigned NumInsElts = InsVecType->getVectorNumElements(); +  unsigned NumExtElts = ExtVecType->getVectorNumElements(); + +  // The inserted-to vector must be wider than the extracted-from vector. +  if (InsVecType->getElementType() != ExtVecType->getElementType() || +      NumExtElts >= NumInsElts) +    return; + +  // Create a shuffle mask to widen the extended-from vector using undefined +  // values. The mask selects all of the values of the original vector followed +  // by as many undefined values as needed to create a vector of the same length +  // as the inserted-to vector. +  SmallVector<Constant *, 16> ExtendMask; +  IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); +  for (unsigned i = 0; i < NumExtElts; ++i) +    ExtendMask.push_back(ConstantInt::get(IntType, i)); +  for (unsigned i = NumExtElts; i < NumInsElts; ++i) +    ExtendMask.push_back(UndefValue::get(IntType)); + +  Value *ExtVecOp = ExtElt->getVectorOperand(); +  auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), +                                        ConstantVector::get(ExtendMask)); + +  // Replace all extracts from the original narrow vector with extracts from +  // the new wide vector. 
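  // Sketch of the rewrite with assumed types: for a <2 x float> extract source
  // feeding a <4 x float> insert destination, the mask built above would be
  // <0, 1, undef, undef>, i.e.
  //   %wide = shufflevector <2 x float> %src, <2 x float> undef,
  //                         <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  // and the loop below then redirects each extract of %src to an extract of
  // %wide.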
+  WideVec->insertBefore(ExtElt); +  for (User *U : ExtVecOp->users()) { +    if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) { +      auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); +      NewExt->insertAfter(WideVec); +      IC.ReplaceInstUsesWith(*OldExt, NewExt); +    } +  } +}  /// We are building a shuffle to create V, which is a sequence of insertelement,  /// extractelement pairs. If PermittedRHS is set, then we must either use it or @@ -363,9 +405,10 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,  /// often been chosen carefully to be efficiently implementable on the target.  typedef std::pair<Value *, Value *> ShuffleOps; -static ShuffleOps CollectShuffleElements(Value *V, +static ShuffleOps collectShuffleElements(Value *V,                                           SmallVectorImpl<Constant *> &Mask, -                                         Value *PermittedRHS) { +                                         Value *PermittedRHS, +                                         InstCombiner &IC) {    assert(V->getType()->isVectorTy() && "Invalid shuffle!");    unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -396,10 +439,14 @@ static ShuffleOps CollectShuffleElements(Value *V,          // otherwise we'd end up with a shuffle of three inputs.          if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {            Value *RHS = EI->getOperand(0); -          ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); +          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);            assert(LR.second == nullptr || LR.second == RHS);            if (LR.first->getType() != RHS->getType()) { +            // Although we are giving up for now, see if we can create extracts +            // that match the inserts for another round of combining. +            replaceExtractElements(IEI, EI, IC); +              // We tried our best, but we can't find anything compatible with RHS              // further up the chain. Return a trivial shuffle.              for (unsigned i = 0; i < NumElts; ++i) @@ -429,14 +476,14 @@ static ShuffleOps CollectShuffleElements(Value *V,          // If this insertelement is a chain that comes from exactly these two          // vectors, return the vector and the effective shuffle.          if (EI->getOperand(0)->getType() == PermittedRHS->getType() && -            CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, +            collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,                                           Mask))            return std::make_pair(EI->getOperand(0), PermittedRHS);        }      }    } -  // Otherwise, can't do anything fancy.  Return an identity vector. +  // Otherwise, we can't do anything fancy. Return an identity vector.    for (unsigned i = 0; i != NumElts; ++i)      Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));    return std::make_pair(V, nullptr); @@ -512,7 +559,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {        // (and any insertelements it points to), into one big shuffle.        
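      // For instance (an assumed example): if %a and %b below were extracted
      // from the same source vector %src,
      //   %v0 = insertelement <2 x float> undef, float %a, i32 0
      //   %v1 = insertelement <2 x float> %v0, float %b, i32 1
      // the whole chain can usually be folded into a single shufflevector of
      // %src.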
if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {          SmallVector<Constant*, 16> Mask; -        ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); +        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);          // The proposed shuffle may be trivial, in which case we shouldn't          // perform the combine. @@ -588,8 +635,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,      case Instruction::FPTrunc:      case Instruction::FPExt:      case Instruction::GetElementPtr: { -      for (int i = 0, e = I->getNumOperands(); i != e; ++i) { -        if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) +      for (Value *Operand : I->operands()) { +        if (!CanEvaluateShuffled(Operand, Mask, Depth-1))            return false;        }        return true; @@ -617,7 +664,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,  /// Rebuild a new instruction just like 'I' but with the new operands given.  /// In the event of type mismatch, the type of the operands is correct. -static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {    // We don't want to use the IRBuilder here because we want the replacement    // instructions to appear next to 'I', not the builder's insertion point.    switch (I->getOpcode()) { @@ -760,7 +807,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {          NeedsRebuild |= (V != I->getOperand(i));        }        if (NeedsRebuild) { -        return BuildNew(I, NewOps); +        return buildNew(I, NewOps);        }        return I;      } @@ -792,7 +839,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {    llvm_unreachable("failed to reorder elements of vector instruction!");  } -static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask,                                    bool &isLHSID, bool &isRHSID) {    isLHSID = isRHSID = true; @@ -891,7 +938,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {    if (VWidth == LHSWidth) {      // Analyze the shuffle, are the LHS or RHS and identity shuffles?      bool isLHSID, isRHSID; -    RecognizeIdentityMask(Mask, isLHSID, isRHSID); +    recognizeIdentityMask(Mask, isLHSID, isRHSID);      // Eliminate identity shuffles.      if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -1177,7 +1224,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {    // If the result mask is an identity, replace uses of this instruction with    // corresponding argument.    
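  // E.g. (sketch) a shuffle such as
  //   %s = shufflevector <4 x i32> %x, <4 x i32> %y,
  //                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // merely reproduces %x, so %s can be replaced by %x outright.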
bool isLHSID, isRHSID; -  RecognizeIdentityMask(newMask, isLHSID, isRHSID); +  recognizeIdentityMask(newMask, isLHSID, isRHSID);    if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS);    if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fd34a244f271..7c46cfd28fc9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,8 +42,9 @@  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/TargetLibraryInfo.h" @@ -79,14 +80,12 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {    return llvm::EmitGEPOffset(Builder, DL, GEP);  } -/// ShouldChangeType - Return true if it is desirable to convert a computation -/// from 'From' to 'To'.  We don't want to convert from a legal to an illegal -/// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { -  assert(From->isIntegerTy() && To->isIntegerTy()); - -  unsigned FromWidth = From->getPrimitiveSizeInBits(); -  unsigned ToWidth = To->getPrimitiveSizeInBits(); +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(unsigned FromWidth, +                                    unsigned ToWidth) const {    bool FromLegal = DL.isLegalInteger(FromWidth);    bool ToLegal = DL.isLegalInteger(ToWidth); @@ -103,6 +102,17 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {    return true;  } +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { +  assert(From->isIntegerTy() && To->isIntegerTy()); + +  unsigned FromWidth = From->getPrimitiveSizeInBits(); +  unsigned ToWidth = To->getPrimitiveSizeInBits(); +  return ShouldChangeType(FromWidth, ToWidth); +} +  // Return true, if No Signed Wrap should be maintained for I.  // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",  // where both B and C should be ConstantInts, results in a constant that does @@ -156,27 +166,26 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {    I.setFastMathFlags(FMF);  } -/// SimplifyAssociativeOrCommutative - This performs a few simplifications for -/// operators which are associative or commutative: -// -//  Commutative operators: -// -//  1. Order operands such that they are listed from right (least complex) to -//     left (most complex).  This puts constants before unary operators before -//     binary operators. -// -//  Associative operators: -// -//  2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. -//  3. 
Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. -// -//  Associative and commutative operators: -// -//  4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. -//  5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. -//  6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" -//     if C1 and C2 are constants. -// +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +///  Commutative operators: +/// +///  1. Order operands such that they are listed from right (least complex) to +///     left (most complex).  This puts constants before unary operators before +///     binary operators. +/// +///  Associative operators: +/// +///  2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +///  3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +///  Associative and commutative operators: +/// +///  4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +///  5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +///  6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +///     if C1 and C2 are constants.  bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {    Instruction::BinaryOps Opcode = I.getOpcode();    bool Changed = false; @@ -322,7 +331,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {    } while (1);  } -/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to +/// Return whether "X LOp (Y ROp Z)" is always equal to  /// "(X LOp Y) ROp (X LOp Z)".  static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,                                       Instruction::BinaryOps ROp) { @@ -361,7 +370,7 @@ static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,    }  } -/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to +/// Return whether "(X LOp Y) ROp Z" is always equal to  /// "(X ROp Z) LOp (Y ROp Z)".  static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,                                       Instruction::BinaryOps ROp) { @@ -519,7 +528,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,            if (isa<OverflowingBinaryOperator>(Op1))              HasNSW &= Op1->hasNoSignedWrap(); -        // We can propogate 'nsw' if we know that +        // We can propagate 'nsw' if we know that          //  %Y = mul nsw i16 %X, C          //  %Z = add nsw i16 %Y, %X          // => @@ -537,11 +546,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,    return SimplifiedInst;  } -/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations -/// which some other binary operation distributes over either by factorizing -/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this -/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is -/// a win).  Returns the simplified value, or null if it didn't simplify. +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify.  
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);    BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); @@ -623,12 +632,38 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {        }    } +  // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) +  // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d))) +  if (auto *SI0 = dyn_cast<SelectInst>(LHS)) { +    if (auto *SI1 = dyn_cast<SelectInst>(RHS)) { +      if (SI0->getCondition() == SI1->getCondition()) { +        Value *SI = nullptr; +        if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(), +                                     SI1->getFalseValue(), DL, TLI, DT, AC)) +          SI = Builder->CreateSelect(SI0->getCondition(), +                                     Builder->CreateBinOp(TopLevelOpcode, +                                                          SI0->getTrueValue(), +                                                          SI1->getTrueValue()), +                                     V); +        if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(), +                                     SI1->getTrueValue(), DL, TLI, DT, AC)) +          SI = Builder->CreateSelect( +              SI0->getCondition(), V, +              Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(), +                                   SI1->getFalseValue())); +        if (SI) { +          SI->takeName(&I); +          return SI; +        } +      } +    } +  } +    return nullptr;  } -// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction -// if the LHS is a constant zero (which is the 'negate' form). -// +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form).  Value *InstCombiner::dyn_castNegVal(Value *V) const {    if (BinaryOperator::isNeg(V))      return BinaryOperator::getNegArgument(V); @@ -644,10 +679,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {    return nullptr;  } -// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the -// instruction if the LHS is a constant negative zero (which is the 'negate' -// form). -// +/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is +/// a constant negative zero (which is the 'negate' form).  Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {    if (BinaryOperator::isFNeg(V, IgnoreZeroSign))      return BinaryOperator::getFNegArgument(V); @@ -700,10 +733,10 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,    llvm_unreachable("Unknown binary instruction type!");  } -// FoldOpIntoSelect - Given an instruction with a select as one operand and a -// constant as the other operand, try to fold the binary operator into the -// select arguments.  This also works for Cast instructions, which obviously do -// not have a second operand. +/// Given an instruction with a select as one operand and a constant as the +/// other operand, try to fold the binary operator into the select arguments. +/// This also works for Cast instructions, which obviously do not have a second +/// operand.  
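/// A small illustrative case (assumed IR, not drawn from this change):
///   %s = select i1 %c, i32 1, i32 2
///   %r = add i32 %s, 8
/// can normally be folded into
///   %r = select i1 %c, i32 9, i32 10
/// because the add against each arm of the select constant-folds away.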
Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {    // Don't modify shared select instructions    if (!SI->hasOneUse()) return nullptr; @@ -752,10 +785,9 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {    return nullptr;  } -/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which -/// has a PHI node as operand #0, see if we can fold the instruction into the -/// PHI (which is only possible if all operands to the PHI are constants). -/// +/// Given a binary operator, cast instruction, or select which has a PHI node as +/// operand #0, see if we can fold the instruction into the PHI (which is only +/// possible if all operands to the PHI are constants).  Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {    PHINode *PN = cast<PHINode>(I.getOperand(0));    unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -819,7 +851,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {    NewPN->takeName(PN);    // If we are going to have to insert a new computation, do so right before the -  // predecessors terminator. +  // predecessor's terminator.    if (NonConstBB)      Builder->SetInsertPoint(NonConstBB->getTerminator()); @@ -893,10 +925,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {    return ReplaceInstUsesWith(I, NewPN);  } -/// FindElementAtOffset - Given a pointer type and a constant offset, determine -/// whether or not there is a sequence of GEP indices into the pointed type that -/// will land us at the specified offset.  If so, fill them into NewIndices and -/// return the resultant element type, otherwise return null. +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null.  Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,                                          SmallVectorImpl<Value *> &NewIndices) {    Type *Ty = PtrTy->getElementType(); @@ -965,8 +997,8 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {    return true;  } -/// Descale - Return a value X such that Val = X * Scale, or null if none.  If -/// the multiplication is known not to overflow then NoSignedWrap is set. +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set.  Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {    assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");    assert(cast<IntegerType>(Val->getType())->getBitWidth() == @@ -1008,11 +1040,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {    // 0'th operand of Val.    std::pair<Instruction*, unsigned> Parent; -  // RequireNoSignedWrap - Set if the transform requires a descaling at deeper -  // levels that doesn't overflow. +  // Set if the transform requires a descaling at deeper levels that doesn't +  // overflow.    bool RequireNoSignedWrap = false; -  // logScale - log base 2 of the scale.  Negative if not a power of 2. +  // Log base 2 of the scale. Negative if not a power of 2.    
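  // For example, a scale of 8 yields a logScale of 3, while a scale such as 12
  // is not a power of two and exactLogBase2() reports it as -1.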
int32_t logScale = Scale.exactLogBase2();    for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down @@ -1213,16 +1245,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {  /// specified one but with other operands.  static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,                                   InstCombiner::BuilderTy *B) { -  Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); -  if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { -    if (isa<OverflowingBinaryOperator>(NewBO)) { -      NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); -      NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); -    } -    if (isa<PossiblyExactOperator>(NewBO)) -      NewBO->setIsExact(Inst.isExact()); -  } -  return BORes; +  Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); +  // If LHS and RHS are constant, BO won't be a binary operator. +  if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO)) +    NewBO->copyIRFlags(&Inst); +  return BO;  }  /// \brief Makes transformation of binary operation specific for vector types. @@ -1256,9 +1283,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {          LShuf->getMask() == RShuf->getMask()) {        Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),            RShuf->getOperand(0), Builder); -      Value *Res = Builder->CreateShuffleVector(NewBO, +      return Builder->CreateShuffleVector(NewBO,            UndefValue::get(NewBO->getType()), LShuf->getMask()); -      return Res;      }    } @@ -1294,18 +1320,11 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {      }      if (MayChange) {        Constant *C2 = ConstantVector::get(C2M); -      Value *NewLHS, *NewRHS; -      if (isa<Constant>(LHS)) { -        NewLHS = C2; -        NewRHS = Shuffle->getOperand(0); -      } else { -        NewLHS = Shuffle->getOperand(0); -        NewRHS = C2; -      } +      Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0); +      Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2;        Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); -      Value *Res = Builder->CreateShuffleVector(NewBO, +      return Builder->CreateShuffleVector(NewBO,            UndefValue::get(Inst.getType()), Shuffle->getMask()); -      return Res;      }    } @@ -1323,7 +1342,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {    // Eliminate unneeded casts for indices, and replace indices which displace    // by multiples of a zero size type with zero.    bool MadeChange = false; -  Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); +  Type *IntPtrTy = +    DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType());    gep_type_iterator GTI = gep_type_begin(GEP);    for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; @@ -1333,21 +1353,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {      if (!SeqTy)        continue; +    // Index type should have the same width as IntPtr +    Type *IndexTy = (*I)->getType(); +    Type *NewIndexType = IndexTy->isVectorTy() ? +      VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy; +       // If the element type has zero size then any index over it is equivalent      // to an index of zero, so replace it with zero if it is not zero already.      
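    // A hedged example (types assumed for illustration): in
    //   getelementptr [8 x {}], [8 x {}]* %p, i64 0, i64 %i
    // the element type '{}' occupies no storage, so %i can be rewritten to 0
    // without changing the address the GEP computes.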
if (SeqTy->getElementType()->isSized() &&          DL.getTypeAllocSize(SeqTy->getElementType()) == 0)        if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { -        *I = Constant::getNullValue(IntPtrTy); +        *I = Constant::getNullValue(NewIndexType);          MadeChange = true;        } -    Type *IndexTy = (*I)->getType(); -    if (IndexTy != IntPtrTy) { +    if (IndexTy != NewIndexType) {        // If we are using a wider index than needed for this platform, shrink        // it to what we need.  If narrower, sign-extend it to what we need.        // This explicit cast can make subsequent optimizations more obvious. -      *I = Builder->CreateIntCast(*I, IntPtrTy, true); +      *I = Builder->CreateIntCast(*I, NewIndexType, true);        MadeChange = true;      }    } @@ -1421,8 +1445,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {        }      } -    GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); +    // If not all GEPs are identical we'll have to create a new PHI node. +    // Check that the old PHI node has only one use so that it will get +    // removed. +    if (DI != -1 && !PN->hasOneUse()) +      return nullptr; +    GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());      if (DI == -1) {        // All the GEPs feeding the PHI are identical. Clone one down into our        // BB so that it can be merged with the current GEP. @@ -1432,11 +1461,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {        // All the GEPs feeding the PHI differ at a single offset. Clone a GEP        // into the current block so it can be merged, and create a new PHI to        // set that index. -      Instruction *InsertPt = Builder->GetInsertPoint(); -      Builder->SetInsertPoint(PN); -      PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), -                                          PN->getNumOperands()); -      Builder->SetInsertPoint(InsertPt); +      PHINode *NewPN; +      { +        IRBuilderBase::InsertPointGuard Guard(*Builder); +        Builder->SetInsertPoint(PN); +        NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), +                                   PN->getNumOperands()); +      }        for (auto &I : PN->operands())          NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), @@ -1790,7 +1821,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {            if (Instruction *I = visitBitCast(*BCI)) {              if (I != BCI) {                I->takeName(BCI); -              BCI->getParent()->getInstList().insert(BCI, I); +              BCI->getParent()->getInstList().insert(BCI->getIterator(), I);                ReplaceInstUsesWith(*BCI, I);              }              return &GEP; @@ -1931,7 +1962,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {      if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {        // Replace invoke with a NOP intrinsic to maintain the original CFG -      Module *M = II->getParent()->getParent()->getParent(); +      Module *M = II->getModule();        Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);        InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),                           None, "", II->getParent()); @@ -2280,9 +2311,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {    }    if (LoadInst *L = dyn_cast<LoadInst>(Agg))      // If the (non-volatile) load only has one use, we can rewrite this to a -    
// load from a GEP. This reduces the size of the load. -    // FIXME: If a load is used only by extractvalue instructions then this -    //        could be done regardless of having multiple uses. +    // load from a GEP. This reduces the size of the load. If a load is used +    // only by extractvalue instructions then this either must have been +    // optimized before, or it is a struct with padding, in which case we +    // don't want to do the transformation as it loses padding knowledge.      if (L->isSimple() && L->hasOneUse()) {        // extractvalue has integer indices, getelementptr has Value*s. Convert.        SmallVector<Value*, 4> Indices; @@ -2294,7 +2326,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {        // We need to insert these at the location of the old load, not at that of        // the extractvalue. -      Builder->SetInsertPoint(L->getParent(), L); +      Builder->SetInsertPoint(L);        Value *GEP = Builder->CreateInBoundsGEP(L->getType(),                                                L->getPointerOperand(), Indices);        // Returning the load directly will cause the main loop to insert it in @@ -2312,7 +2344,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {    return nullptr;  } -/// isCatchAll - Return 'true' if the given typeinfo will match anything. +/// Return 'true' if the given typeinfo will match anything.  static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {    switch (Personality) {    case EHPersonality::GNU_C: @@ -2330,6 +2362,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {    case EHPersonality::MSVC_X86SEH:    case EHPersonality::MSVC_Win64SEH:    case EHPersonality::MSVC_CXX: +  case EHPersonality::CoreCLR:      return TypeInfo->isNullValue();    }    llvm_unreachable("invalid enum"); @@ -2441,10 +2474,24 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {              SawCatchAll = true;              break;            } -          if (AlreadyCaught.count(TypeInfo)) -            // Already caught by an earlier clause, so having it in the filter -            // is pointless. -            continue; + +          // Even if we've seen a type in a catch clause, we don't want to +          // remove it from the filter.  An unexpected type handler may be +          // set up for a call site which throws an exception of the same +          // type caught.  In order for the exception thrown by the unexpected +          // handler to propogate correctly, the filter must be correctly +          // described for the call site. +          // +          // Example: +          // +          // void unexpected() { throw 1;} +          // void foo() throw (int) { +          //   std::set_unexpected(unexpected); +          //   try { +          //     throw 2.0; +          //   } catch (int i) {} +          // } +            // There is no point in having multiple copies of the same typeinfo in            // a filter, so only add it if we didn't already.            if (SeenInFilter.insert(TypeInfo).second) @@ -2637,15 +2684,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {    return nullptr;  } -/// TryToSinkInstruction - Try to move the specified instruction from its -/// current block into the beginning of DestBlock, which can only happen if it's -/// safe to move the instruction past all of the instructions between it and the -/// end of its block. 
+/// Try to move the specified instruction from its current block into the +/// beginning of DestBlock, which can only happen if it's safe to move the +/// instruction past all of the instructions between it and the end of its +/// block.  static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {    assert(I->hasOneUse() && "Invariants didn't hold!");    // Cannot move control-flow-involving, volatile loads, vaarg, etc. -  if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() || +  if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||        isa<TerminatorInst>(I))      return false; @@ -2654,17 +2701,24 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {          &DestBlock->getParent()->getEntryBlock())      return false; +  // Do not sink convergent call instructions. +  if (auto *CI = dyn_cast<CallInst>(I)) { +    if (CI->isConvergent()) +      return false; +  } +    // We can only sink load instructions if there is nothing between the load and    // the end of block that could change the value.    if (I->mayReadFromMemory()) { -    for (BasicBlock::iterator Scan = I, E = I->getParent()->end(); +    for (BasicBlock::iterator Scan = I->getIterator(), +                              E = I->getParent()->end();           Scan != E; ++Scan)        if (Scan->mayWriteToMemory())          return false;    }    BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); -  I->moveBefore(InsertPos); +  I->moveBefore(&*InsertPos);    ++NumSunkInst;    return true;  } @@ -2698,6 +2752,27 @@ bool InstCombiner::run() {        }      } +    // In general, it is possible for computeKnownBits to determine all bits in a +    // value even when the operands are not all constants. +    if (!I->use_empty() && I->getType()->isIntegerTy()) { +      unsigned BitWidth = I->getType()->getScalarSizeInBits(); +      APInt KnownZero(BitWidth, 0); +      APInt KnownOne(BitWidth, 0); +      computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I); +      if ((KnownZero | KnownOne).isAllOnesValue()) { +        Constant *C = ConstantInt::get(I->getContext(), KnownOne); +        DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << +                        " from: " << *I << '\n'); + +        // Add operands to the worklist. +        ReplaceInstUsesWith(*I, C); +        ++NumConstProp; +        EraseInstFromFunction(*I); +        MadeIRChange = true; +        continue; +      } +    } +      // See if we can trivially sink this instruction to a successor basic block.      if (I->hasOneUse()) {        BasicBlock *BB = I->getParent(); @@ -2738,7 +2813,7 @@ bool InstCombiner::run() {      }      // Now that we have an instruction, try combining it to simplify it. -    Builder->SetInsertPoint(I->getParent(), I); +    Builder->SetInsertPoint(I);      Builder->SetCurrentDebugLocation(I->getDebugLoc());  #ifndef NDEBUG @@ -2768,7 +2843,7 @@ bool InstCombiner::run() {          // Insert the new instruction into the basic block...          BasicBlock *InstParent = I->getParent(); -        BasicBlock::iterator InsertPos = I; +        BasicBlock::iterator InsertPos = I->getIterator();          // If we replace a PHI with something that isn't a PHI, fix up the          // insertion point. @@ -2801,8 +2876,8 @@ bool InstCombiner::run() {    return MadeIRChange;  } -/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding -/// all reachable code to the worklist. 
+/// Walk the function in depth-first order, adding all reachable code to the +/// worklist.  ///  /// This has a couple of tricks to make the code faster and more powerful.  In  /// particular, we constant fold and DCE instructions as we go, to avoid adding @@ -2829,7 +2904,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,        continue;      for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { -      Instruction *Inst = BBI++; +      Instruction *Inst = &*BBI++;        // DCE instruction if trivially dead.        if (isInstructionTriviallyDead(Inst, TLI)) { @@ -2900,8 +2975,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,        }      } -    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) -      Worklist.push_back(TI->getSuccessor(i)); +    for (BasicBlock *SuccBB : TI->successors()) +      Worklist.push_back(SuccBB);    } while (!Worklist.empty());    // Once we've found all of the instructions to add to instcombine's worklist, @@ -2909,8 +2984,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,    // of the function down.  This jives well with the way that it adds all uses    // of instructions to the worklist after doing a transformation, thus avoiding    // some N^2 behavior in pathological cases. -  ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], -                             InstrsForInstCombineWorklist.size()); +  ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist);    return MadeIRChange;  } @@ -2930,13 +3004,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,    // track of which blocks we visit.    SmallPtrSet<BasicBlock *, 64> Visited;    MadeIRChange |= -      AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI); +      AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI);    // Do a quick scan over the function.  If we find any blocks that are    // unreachable, remove any instructions inside of them.  This prevents    // the instcombine code from having to deal with some bad special cases.    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { -    if (Visited.count(BB)) +    if (Visited.count(&*BB))        continue;      // Delete the instructions backwards, as it has a reduced likelihood of @@ -2944,11 +3018,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,      Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.      while (EndInst != BB->begin()) {        // Delete the next to last instruction. 
-      BasicBlock::iterator I = EndInst; -      Instruction *Inst = --I; -      if (!Inst->use_empty()) +      Instruction *Inst = &*--EndInst->getIterator(); +      if (!Inst->use_empty() && !Inst->getType()->isTokenTy())          Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); -      if (isa<LandingPadInst>(Inst)) { +      if (Inst->isEHPad()) {          EndInst = Inst;          continue;        } @@ -2956,7 +3029,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,          ++NumDeadInst;          MadeIRChange = true;        } -      Inst->eraseFromParent(); +      if (!Inst->getType()->isTokenTy()) +        Inst->eraseFromParent();      }    } @@ -2968,8 +3042,6 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,                                  AliasAnalysis *AA, AssumptionCache &AC,                                  TargetLibraryInfo &TLI, DominatorTree &DT,                                  LoopInfo *LI = nullptr) { -  // Minimizing size? -  bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);    auto &DL = F.getParent()->getDataLayout();    /// Builder - This is an IRBuilder that automatically inserts new @@ -2992,7 +3064,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,      if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist))        Changed = true; -    InstCombiner IC(Worklist, &Builder, MinimizeSize, +    InstCombiner IC(Worklist, &Builder, F.optForMinSize(),                      AA, &AC, &TLI, &DT, DL, LI);      if (IC.run())        Changed = true; @@ -3046,11 +3118,12 @@ public:  void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {    AU.setPreservesCFG(); -  AU.addRequired<AliasAnalysis>(); +  AU.addRequired<AAResultsWrapperPass>();    AU.addRequired<AssumptionCacheTracker>();    AU.addRequired<TargetLibraryInfoWrapperPass>();    AU.addRequired<DominatorTreeWrapperPass>();    AU.addPreserved<DominatorTreeWrapperPass>(); +  AU.addPreserved<GlobalsAAWrapperPass>();  }  bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3058,7 +3131,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {      return false;    // Required analyses. 
-  auto AA = &getAnalysis<AliasAnalysis>(); +  auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3076,7 +3149,8 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",                      "Combine redundant instructions", false, false) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index e7ef9f96edc2..a9df5e5898ae 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -18,6 +18,7 @@  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/DenseSet.h"  #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/SmallString.h"  #include "llvm/ADT/SmallVector.h" @@ -90,7 +91,9 @@ static const char *const kAsanUnregisterGlobalsName =      "__asan_unregister_globals";  static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";  static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v5"; +static const char *const kAsanInitName = "__asan_init"; +static const char *const kAsanVersionCheckName = +    "__asan_version_mismatch_check_v6";  static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";  static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";  static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; @@ -119,6 +122,10 @@ static const unsigned kAllocaRzSize = 32;  static cl::opt<bool> ClEnableKasan(      "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),      cl::Hidden, cl::init(false)); +static cl::opt<bool> ClRecover( +    "asan-recover", +    cl::desc("Enable recovery mode (continue-after-error)."), +    cl::Hidden, cl::init(false));  // This flag may need to be replaced with -f[no-]asan-reads.  
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", @@ -177,7 +184,7 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix(      cl::init("__asan_"));  static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas",                                           cl::desc("instrument dynamic allocas"), -                                         cl::Hidden, cl::init(false)); +                                         cl::Hidden, cl::init(true));  static cl::opt<bool> ClSkipPromotableAllocas(      "asan-skip-promotable-allocas",      cl::desc("Do not instrument promotable allocas"), cl::Hidden, @@ -273,6 +280,11 @@ class GlobalsMetadata {    GlobalsMetadata() : inited_(false) {} +  void reset() { +    inited_ = false; +    Entries.clear(); +  } +    void init(Module &M) {      assert(!inited_);      inited_ = true; @@ -321,7 +333,7 @@ struct ShadowMapping {  static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,                                        bool IsKasan) { -  bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; +  bool IsAndroid = TargetTriple.isAndroid();    bool IsIOS = TargetTriple.isiOS();    bool IsFreeBSD = TargetTriple.isOSFreeBSD();    bool IsLinux = TargetTriple.isOSLinux(); @@ -338,6 +350,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,    ShadowMapping Mapping;    if (LongSize == 32) { +    // Android is always PIE, which means that the beginning of the address +    // space is always available.      if (IsAndroid)        Mapping.Offset = 0;      else if (IsMIPS32) @@ -376,7 +390,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,    // OR-ing shadow offset if more efficient (at least on x86) if the offset    // is a power of two, but on ppc64 we have to use add since the shadow    // offset is not necessary 1/8-th of the address space. -  Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1)); +  Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 +                           && !(Mapping.Offset & (Mapping.Offset - 1));    return Mapping;  } @@ -389,8 +404,9 @@ static size_t RedzoneSizeForScale(int MappingScale) {  /// AddressSanitizer: instrument the code in module to find memory bugs.  struct AddressSanitizer : public FunctionPass { -  explicit AddressSanitizer(bool CompileKernel = false) -      : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan) { +  explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false) +      : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan), +        Recover(Recover || ClRecover) {      initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());    }    const char *getPassName() const override { @@ -437,7 +453,9 @@ struct AddressSanitizer : public FunctionPass {    Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);    bool runOnFunction(Function &F) override;    bool maybeInsertAsanInitAtFunctionEntry(Function &F); +  void markEscapedLocalAllocas(Function &F);    bool doInitialization(Module &M) override; +  bool doFinalization(Module &M) override;    static char ID;  // Pass identification, replacement for typeid    DominatorTree &getDominatorTree() const { return *DT; } @@ -450,10 +468,21 @@ struct AddressSanitizer : public FunctionPass {    bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,                      uint64_t TypeSize) const; +  /// Helper to cleanup per-function state. 
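  /// The struct below follows the usual RAII shape; presumably it is
  /// instantiated on the stack near the top of runOnFunction, e.g.
  ///   FunctionStateRAII CleanupObj(this);
  /// so that the ProcessedAllocas cache is cleared on every exit path (an
  /// assumed usage sketch, not a line quoted from the patch).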
+  struct FunctionStateRAII { +    AddressSanitizer *Pass; +    FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) { +      assert(Pass->ProcessedAllocas.empty() && +             "last pass forgot to clear cache"); +    } +    ~FunctionStateRAII() { Pass->ProcessedAllocas.clear(); } +  }; +    LLVMContext *C;    Triple TargetTriple;    int LongSize;    bool CompileKernel; +  bool Recover;    Type *IntptrTy;    ShadowMapping Mapping;    DominatorTree *DT; @@ -477,8 +506,10 @@ struct AddressSanitizer : public FunctionPass {  class AddressSanitizerModule : public ModulePass {   public: -  explicit AddressSanitizerModule(bool CompileKernel = false) -      : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan) {} +  explicit AddressSanitizerModule(bool CompileKernel = false, +                                  bool Recover = false) +      : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), +        Recover(Recover || ClRecover) {}    bool runOnModule(Module &M) override;    static char ID;  // Pass identification, replacement for typeid    const char *getPassName() const override { return "AddressSanitizerModule"; } @@ -496,6 +527,7 @@ class AddressSanitizerModule : public ModulePass {    GlobalsMetadata GlobalsMD;    bool CompileKernel; +  bool Recover;    Type *IntptrTy;    LLVMContext *C;    Triple TargetTriple; @@ -525,6 +557,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    ShadowMapping Mapping;    SmallVector<AllocaInst *, 16> AllocaVec; +  SmallSetVector<AllocaInst *, 16> NonInstrumentedStaticAllocaVec;    SmallVector<Instruction *, 8> RetVec;    unsigned StackAlignment; @@ -545,12 +578,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    SmallVector<AllocaInst *, 1> DynamicAllocaVec;    SmallVector<IntrinsicInst *, 1> StackRestoreVec;    AllocaInst *DynamicAllocaLayout = nullptr; +  IntrinsicInst *LocalEscapeCall = nullptr;    // Maps Value to an AllocaInst from which the Value is originated.    typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy;    AllocaForValueMapTy AllocaForValue; -  bool HasNonEmptyInlineAsm; +  bool HasNonEmptyInlineAsm = false; +  bool HasReturnsTwiceCall = false;    std::unique_ptr<CallInst> EmptyInlineAsm;    FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) @@ -562,7 +597,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {          IntptrPtrTy(PointerType::get(IntptrTy, 0)),          Mapping(ASan.Mapping),          StackAlignment(1 << Mapping.Scale), -        HasNonEmptyInlineAsm(false),          EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}    bool runOnFunction() { @@ -596,9 +630,24 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,                                          Value *SavedStack) {      IRBuilder<> IRB(InstBefore); +    Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); +    // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we +    // need to adjust extracted SP to compute the address of the most recent +    // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for +    // this purpose. 
+    if (!isa<ReturnInst>(InstBefore)) { +      Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( +          InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, +          {IntptrTy}); + +      Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + +      DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), +                                     DynamicAreaOffset); +    } +      IRB.CreateCall(AsanAllocasUnpoisonFunc, -                   {IRB.CreateLoad(DynamicAllocaLayout), -                    IRB.CreatePtrToInt(SavedStack, IntptrTy)}); +                   {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr});    }    // Unpoison dynamic allocas redzones. @@ -625,7 +674,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    /// \brief Collect Alloca instructions we want (and can) handle.    void visitAllocaInst(AllocaInst &AI) { -    if (!ASan.isInterestingAlloca(AI)) return; +    if (!ASan.isInterestingAlloca(AI)) { +      if (AI.isStaticAlloca()) NonInstrumentedStaticAllocaVec.insert(&AI); +      return; +    }      StackAlignment = std::max(StackAlignment, AI.getAlignment());      if (ASan.isDynamicAlloca(AI)) @@ -639,6 +691,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {    void visitIntrinsicInst(IntrinsicInst &II) {      Intrinsic::ID ID = II.getIntrinsicID();      if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); +    if (ID == Intrinsic::localescape) LocalEscapeCall = &II;      if (!ClCheckLifetime) return;      if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)        return; @@ -660,9 +713,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {      AllocaPoisonCallVec.push_back(APC);    } -  void visitCallInst(CallInst &CI) { -    HasNonEmptyInlineAsm |= -        CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); +  void visitCallSite(CallSite CS) { +    Instruction *I = CS.getInstruction(); +    if (CallInst *CI = dyn_cast<CallInst>(I)) { +      HasNonEmptyInlineAsm |= +          CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get()); +      HasReturnsTwiceCall |= CI->canReturnTwice(); +    }    }    // ---------------------- Helpers. @@ -689,7 +746,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {                       Instruction *ThenTerm, Value *ValueIfFalse);  }; -}  // namespace +} // anonymous namespace  char AddressSanitizer::ID = 0;  INITIALIZE_PASS_BEGIN( @@ -697,12 +754,15 @@ INITIALIZE_PASS_BEGIN(      "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,      false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(      AddressSanitizer, "asan",      "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,      false) -FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel) { -  return new AddressSanitizer(CompileKernel); +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, +                                                       bool Recover) { +  assert(!CompileKernel || Recover); +  return new AddressSanitizer(CompileKernel, Recover);  }  char AddressSanitizerModule::ID = 0; @@ -711,8 +771,10 @@ INITIALIZE_PASS(      "AddressSanitizer: detects use-after-free and out-of-bounds bugs."      
"ModulePass",      false, false) -ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel) { -  return new AddressSanitizerModule(CompileKernel); +ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, +                                                   bool Recover) { +  assert(!CompileKernel || Recover); +  return new AddressSanitizerModule(CompileKernel, Recover);  }  static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -799,8 +861,10 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) {         getAllocaSizeInBytes(&AI) > 0 &&         // We are only interested in allocas not promotable to registers.         // Promotable allocas are common under -O0. -       (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || -        isDynamicAlloca(AI))); +       (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && +       // inalloca allocas are not treated as static, and we don't want +       // dynamic alloca instrumentation for them as well. +       !AI.isUsedWithInAlloca());    ProcessedAllocas[&AI] = IsInteresting;    return IsInteresting; @@ -868,10 +932,8 @@ static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) {    } else {      return false;    } -  if (!isPointerOperand(I->getOperand(0)) || -      !isPointerOperand(I->getOperand(1))) -    return false; -  return true; +  return isPointerOperand(I->getOperand(0)) && +         isPointerOperand(I->getOperand(1));  }  bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { @@ -919,7 +981,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,      // If initialization order checking is disabled, a simple access to a      // dynamically initialized global is always valid.      GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); -    if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && +    if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&          isSafeAccess(ObjSizeVis, Addr, TypeSize)) {        NumOptimizedAccessesToGlobalVar++;        return; @@ -1041,13 +1103,17 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,      BasicBlock *NextBB = CheckTerm->getSuccessor(0);      IRB.SetInsertPoint(CheckTerm);      Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); -    BasicBlock *CrashBlock = +    if (Recover) { +      CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false); +    } else { +      BasicBlock *CrashBlock =          BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); -    CrashTerm = new UnreachableInst(*C, CrashBlock); -    BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); -    ReplaceInstWithInst(CheckTerm, NewTerm); +      CrashTerm = new UnreachableInst(*C, CrashBlock); +      BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); +      ReplaceInstWithInst(CheckTerm, NewTerm); +    }    } else { -    CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); +    CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);    }    Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, @@ -1084,7 +1150,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment(  void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,                                                    GlobalValue *ModuleName) {    // Set up the arguments to our poison/unpoison functions. 
-  IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); +  IRBuilder<> IRB(&GlobalInit.front(), +                  GlobalInit.front().getFirstInsertionPt());    // Add a call to poison all external globals before the given function starts.    Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); @@ -1147,6 +1214,14 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {      // Do not instrument globals from special LLVM sections.      if (Section.find("__llvm") != StringRef::npos) return false; +    // Do not instrument function pointers to initialization and termination +    // routines: dynamic linker will not properly handle redzones. +    if (Section.startswith(".preinit_array") || +        Section.startswith(".init_array") || +        Section.startswith(".fini_array")) { +      return false; +    } +      // Callbacks put into the CRT initializer/terminator sections      // should not be instrumented.      // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 @@ -1162,10 +1237,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {        bool TAAParsed;        std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(            Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); -      if (!ErrorCode.empty()) { -        assert(false && "Invalid section specifier."); -        return false; -      } +      assert(ErrorCode.empty() && "Invalid section specifier.");        // Ignore the globals from the __OBJC section. The ObjC runtime assumes        // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to @@ -1383,13 +1455,11 @@ void AddressSanitizer::initializeCallbacks(Module &M) {        const std::string TypeStr = AccessIsWrite ? "store" : "load";        const std::string ExpStr = Exp ? "exp_" : "";        const std::string SuffixStr = CompileKernel ? "N" : "_n"; -      const std::string EndingStr = CompileKernel ? "_noabort" : ""; -      const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr; -      // TODO(glider): for KASan builds add _noabort to error reporting -      // functions and make them actually noabort (remove the UnreachableInst). +      const std::string EndingStr = Recover ? "_noabort" : ""; +      Type *ExpType = Exp ? 
Type::getInt32Ty(*C) : nullptr;        AsanErrorCallbackSized[AccessIsWrite][Exp] =            checkSanitizerInterfaceFunction(M.getOrInsertFunction( -              kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr, +              kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr,                IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));        AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =            checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1400,7 +1470,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) {          const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex);          AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =              checkSanitizerInterfaceFunction(M.getOrInsertFunction( -                kAsanReportErrorTemplate + ExpStr + Suffix, +                kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,                  IRB.getVoidTy(), IntptrTy, ExpType, nullptr));          AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =              checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1448,15 +1518,20 @@ bool AddressSanitizer::doInitialization(Module &M) {    if (!CompileKernel) {      std::tie(AsanCtorFunction, AsanInitFunction) = -        createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, -                                            /*InitArgTypes=*/{}, -                                            /*InitArgs=*/{}); +        createSanitizerCtorAndInitFunctions( +            M, kAsanModuleCtorName, kAsanInitName, +            /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName);      appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);    }    Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);    return true;  } +bool AddressSanitizer::doFinalization(Module &M) { +  GlobalsMD.reset(); +  return false; +} +  bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {    // For each NSObject descendant having a +load method, this method is invoked    // by the ObjC runtime before any of the static constructors is called. @@ -1466,13 +1541,41 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {    // We cannot just ignore these methods, because they may call other    // instrumented functions.    if (F.getName().find(" load]") != std::string::npos) { -    IRBuilder<> IRB(F.begin()->begin()); +    IRBuilder<> IRB(&F.front(), F.front().begin());      IRB.CreateCall(AsanInitFunction, {});      return true;    }    return false;  } +void AddressSanitizer::markEscapedLocalAllocas(Function &F) { +  // Find the one possible call to llvm.localescape and pre-mark allocas passed +  // to it as uninteresting. This assumes we haven't started processing allocas +  // yet. This check is done up front because iterating the use list in +  // isInterestingAlloca would be algorithmically slower. +  assert(ProcessedAllocas.empty() && "must process localescape before allocas"); + +  // Try to get the declaration of llvm.localescape. If it's not in the module, +  // we can exit early. +  if (!F.getParent()->getFunction("llvm.localescape")) return; + +  // Look for a call to llvm.localescape call in the entry block. It can't be in +  // any other block. +  for (Instruction &I : F.getEntryBlock()) { +    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); +    if (II && II->getIntrinsicID() == Intrinsic::localescape) { +      // We found a call. 
Mark all the allocas passed in as uninteresting. +      for (Value *Arg : II->arg_operands()) { +        AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts()); +        assert(AI && AI->isStaticAlloca() && +               "non-static alloca arg to localescape"); +        ProcessedAllocas[AI] = false; +      } +      break; +    } +  } +} +  bool AddressSanitizer::runOnFunction(Function &F) {    if (&F == AsanCtorFunction) return false;    if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; @@ -1488,6 +1591,12 @@ bool AddressSanitizer::runOnFunction(Function &F) {    if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; +  FunctionStateRAII CleanupObj(this); + +  // We can't instrument allocas used with llvm.localescape. Only static allocas +  // can be passed to that intrinsic. +  markEscapedLocalAllocas(F); +    // We want to instrument every address only once per basic block (unless there    // are calls between uses).    SmallSet<Value *, 16> TempsToInstrument; @@ -1715,6 +1824,16 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() {  void FunctionStackPoisoner::poisonStack() {    assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); +  // Insert poison calls for lifetime intrinsics for alloca. +  bool HavePoisonedAllocas = false; +  for (const auto &APC : AllocaPoisonCallVec) { +    assert(APC.InsBefore); +    assert(APC.AI); +    IRBuilder<> IRB(APC.InsBefore); +    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); +    HavePoisonedAllocas |= APC.DoPoison; +  } +    if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) {      // Handle dynamic allocas.      createDynamicAllocasInitStorage(); @@ -1723,7 +1842,7 @@ void FunctionStackPoisoner::poisonStack() {      unpoisonDynamicAllocas();    } -  if (AllocaVec.size() == 0) return; +  if (AllocaVec.empty()) return;    int StackMallocIdx = -1;    DebugLoc EntryDebugLocation; @@ -1734,6 +1853,19 @@ void FunctionStackPoisoner::poisonStack() {    IRBuilder<> IRB(InsBefore);    IRB.SetCurrentDebugLocation(EntryDebugLocation); +  // Make sure non-instrumented allocas stay in the entry block. Otherwise, +  // debug info is broken, because only entry-block allocas are treated as +  // regular stack slots. +  auto InsBeforeB = InsBefore->getParent(); +  assert(InsBeforeB == &F.getEntryBlock()); +  for (BasicBlock::iterator I(InsBefore); I != InsBeforeB->end(); ++I) +    if (auto *AI = dyn_cast<AllocaInst>(I)) +      if (NonInstrumentedStaticAllocaVec.count(AI) > 0) +        AI->moveBefore(InsBefore); + +  // If we have a call to llvm.localescape, keep it in the entry block. +  if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore); +    SmallVector<ASanStackVariableDescription, 16> SVD;    SVD.reserve(AllocaVec.size());    for (AllocaInst *AI : AllocaVec) { @@ -1751,10 +1883,15 @@ void FunctionStackPoisoner::poisonStack() {    uint64_t LocalStackSize = L.FrameSize;    bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&                         LocalStackSize <= kMaxStackMallocSize; -  // Don't do dynamic alloca or stack malloc in presence of inline asm: -  // too often it makes assumptions on which registers are available. -  bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; -  DoStackMalloc &= !HasNonEmptyInlineAsm; +  bool DoDynamicAlloca = ClDynamicAllocaStack; +  // Don't do dynamic alloca or stack malloc if: +  // 1) There is inline asm: too often it makes assumptions on which registers +  //    are available. 
+  // 2) There is a returns_twice call (typically setjmp), which is +  //    optimization-hostile, and doesn't play well with introduced indirect +  //    register-relative calculation of local variable addresses. +  DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; +  DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;    Value *StaticAlloca =        DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); @@ -1804,16 +1941,6 @@ void FunctionStackPoisoner::poisonStack() {          DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;    } -  // Insert poison calls for lifetime intrinsics for alloca. -  bool HavePoisonedAllocas = false; -  for (const auto &APC : AllocaPoisonCallVec) { -    assert(APC.InsBefore); -    assert(APC.AI); -    IRBuilder<> IRB(APC.InsBefore); -    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); -    HavePoisonedAllocas |= APC.DoPoison; -  } -    // Replace Alloca instructions with base+offset.    for (const auto &Desc : SVD) {      AllocaInst *AI = Desc.AI; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index f6858034d79e..fd3dfd9af033 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -106,7 +106,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) {    }    ++ChecksAdded; -  Instruction *Inst = Builder->GetInsertPoint(); +  BasicBlock::iterator Inst = Builder->GetInsertPoint();    BasicBlock *OldBB = Inst->getParent();    BasicBlock *Cont = OldBB->splitBasicBlock(Inst);    OldBB->getTerminator()->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h new file mode 100644 index 000000000000..c47fdbf68996 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -0,0 +1,217 @@ +//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===// +// +//                      The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Union-find algorithm to compute Minimum Spanning Tree +// for a given CFG. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +namespace llvm { + +#define DEBUG_TYPE "cfgmst" + +/// \brief An union-find based Minimum Spanning Tree for CFG +/// +/// Implements a Union-find algorithm to compute Minimum Spanning Tree +/// for a given CFG. +template <class Edge, class BBInfo> class CFGMST { +public: +  Function &F; + +  // Store all the edges in CFG. It may contain some stale edges +  // when Removed is set. +  std::vector<std::unique_ptr<Edge>> AllEdges; + +  // This map records the auxiliary information for each BB. 
+  DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos; + +  // Find the root group of the G and compress the path from G to the root. +  BBInfo *findAndCompressGroup(BBInfo *G) { +    if (G->Group != G) +      G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group)); +    return static_cast<BBInfo *>(G->Group); +  } + +  // Union BB1 and BB2 into the same group and return true. +  // Returns false if BB1 and BB2 are already in the same group. +  bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) { +    BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1)); +    BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2)); + +    if (BB1G == BB2G) +      return false; + +    // Make the smaller rank tree a direct child or the root of high rank tree. +    if (BB1G->Rank < BB2G->Rank) +      BB1G->Group = BB2G; +    else { +      BB2G->Group = BB1G; +      // If the ranks are the same, increment root of one tree by one. +      if (BB1G->Rank == BB2G->Rank) +        BB1G->Rank++; +    } +    return true; +  } + +  // Give BB, return the auxiliary information. +  BBInfo &getBBInfo(const BasicBlock *BB) const { +    auto It = BBInfos.find(BB); +    assert(It->second.get() != nullptr); +    return *It->second.get(); +  } + +  // Traverse the CFG using a stack. Find all the edges and assign the weight. +  // Edges with large weight will be put into MST first so they are less likely +  // to be instrumented. +  void buildEdges() { +    DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n"); + +    const BasicBlock *BB = &(F.getEntryBlock()); +    uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2); +    // Add a fake edge to the entry. +    addEdge(nullptr, BB, EntryWeight); + +    // Special handling for single BB functions. +    if (succ_empty(BB)) { +      addEdge(BB, nullptr, EntryWeight); +      return; +    } + +    static const uint32_t CriticalEdgeMultiplier = 1000; + +    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { +      TerminatorInst *TI = BB->getTerminator(); +      uint64_t BBWeight = +          (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2); +      uint64_t Weight = 2; +      if (int successors = TI->getNumSuccessors()) { +        for (int i = 0; i != successors; ++i) { +          BasicBlock *TargetBB = TI->getSuccessor(i); +          bool Critical = isCriticalEdge(TI, i); +          uint64_t scaleFactor = BBWeight; +          if (Critical) { +            if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier) +              scaleFactor *= CriticalEdgeMultiplier; +            else +              scaleFactor = UINT64_MAX; +          } +          if (BPI != nullptr) +            Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor); +          addEdge(&*BB, TargetBB, Weight).IsCritical = Critical; +          DEBUG(dbgs() << "  Edge: from " << BB->getName() << " to " +                       << TargetBB->getName() << "  w=" << Weight << "\n"); +        } +      } else { +        addEdge(&*BB, nullptr, BBWeight); +        DEBUG(dbgs() << "  Edge: from " << BB->getName() << " to exit" +                     << " w = " << BBWeight << "\n"); +      } +    } +  } + +  // Sort CFG edges based on its weight. 
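The group bookkeeping above (findAndCompressGroup/unionGroups) is the classic disjoint-set structure with path compression and union by rank, keyed by basic block. A self-contained sketch of the same idea outside LLVM follows; all names in it are invented for the example.

    #include <cstdio>
    #include <numeric>
    #include <utility>
    #include <vector>

    struct DisjointSet {
      std::vector<int> Parent, Rank;
      explicit DisjointSet(int N) : Parent(N), Rank(N, 0) {
        std::iota(Parent.begin(), Parent.end(), 0); // each node is its own group
      }
      int find(int X) {                             // path compression
        if (Parent[X] != X)
          Parent[X] = find(Parent[X]);
        return Parent[X];
      }
      bool unite(int A, int B) {                    // false if already joined
        A = find(A); B = find(B);
        if (A == B) return false;
        if (Rank[A] < Rank[B]) std::swap(A, B);     // union by rank
        Parent[B] = A;
        if (Rank[A] == Rank[B]) ++Rank[A];
        return true;
      }
    };

    int main() {
      // Kruskal-style selection: an edge enters the spanning tree only when it
      // joins two previously separate groups, as in computeMinimumSpanningTree.
      DisjointSet DS(4);
      std::printf("%d\n", DS.unite(0, 1)); // 1: joins two groups
      std::printf("%d\n", DS.unite(1, 0)); // 0: already in the same group
    }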
+  void sortEdgesByWeight() { +    std::stable_sort(AllEdges.begin(), AllEdges.end(), +                     [](const std::unique_ptr<Edge> &Edge1, +                        const std::unique_ptr<Edge> &Edge2) { +                       return Edge1->Weight > Edge2->Weight; +                     }); +  } + +  // Traverse all the edges and compute the Minimum Weight Spanning Tree +  // using union-find algorithm. +  void computeMinimumSpanningTree() { +    // First, put all the critical edge with landing-pad as the Dest to MST. +    // This works around the insufficient support of critical edges split +    // when destination BB is a landing pad. +    for (auto &Ei : AllEdges) { +      if (Ei->Removed) +        continue; +      if (Ei->IsCritical) { +        if (Ei->DestBB && Ei->DestBB->isLandingPad()) { +          if (unionGroups(Ei->SrcBB, Ei->DestBB)) +            Ei->InMST = true; +        } +      } +    } + +    for (auto &Ei : AllEdges) { +      if (Ei->Removed) +        continue; +      if (unionGroups(Ei->SrcBB, Ei->DestBB)) +        Ei->InMST = true; +    } +  } + +  // Dump the Debug information about the instrumentation. +  void dumpEdges(raw_ostream &OS, const Twine &Message) const { +    if (!Message.str().empty()) +      OS << Message << "\n"; +    OS << "  Number of Basic Blocks: " << BBInfos.size() << "\n"; +    for (auto &BI : BBInfos) { +      const BasicBlock *BB = BI.first; +      OS << "  BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << "  " +         << BI.second->infoString() << "\n"; +    } + +    OS << "  Number of Edges: " << AllEdges.size() +       << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; +    uint32_t Count = 0; +    for (auto &EI : AllEdges) +      OS << "  Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" +         << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; +  } + +  // Add an edge to AllEdges with weight W. +  Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { +    uint32_t Index = BBInfos.size(); +    auto Iter = BBInfos.end(); +    bool Inserted; +    std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); +    if (Inserted) { +      // Newly inserted, update the real info. +      Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); +      Index++; +    } +    std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); +    if (Inserted) +      // Newly inserted, update the real info. +      Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); +    AllEdges.emplace_back(new Edge(Src, Dest, W)); +    return *AllEdges.back(); +  } + +  BranchProbabilityInfo *BPI; +  BlockFrequencyInfo *BFI; + +public: +  CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, +         BlockFrequencyInfo *BFI_ = nullptr) +      : F(Func), BPI(BPI_), BFI(BFI_) { +    buildEdges(); +    sortEdgesByWeight(); +    computeMinimumSpanningTree(); +  } +}; + +#undef DEBUG_TYPE // "cfgmst" +} // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2de6e1afaba9..d459fc50d136 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -72,6 +72,11 @@  using namespace llvm; +// External symbol to be used when generating the shadow address for +// architectures with multiple VMAs. 
Instead of using a constant integer +// the runtime will set the external mask based on the VMA range. +static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask"; +  // The -dfsan-preserve-alignment flag controls whether this pass assumes that  // alignment requirements provided by the input IR are correct.  For example,  // if the input IR contains a load with alignment 8, this flag will cause @@ -124,6 +129,7 @@ static cl::opt<bool> ClDebugNonzeroLabels(               "load or return with a nonzero label"),      cl::Hidden); +  namespace {  StringRef GetGlobalTypeString(const GlobalValue &G) { @@ -231,6 +237,7 @@ class DataFlowSanitizer : public ModulePass {    void *(*GetRetvalTLSPtr)();    Constant *GetArgTLS;    Constant *GetRetvalTLS; +  Constant *ExternalShadowMask;    FunctionType *DFSanUnionFnTy;    FunctionType *DFSanUnionLoadFnTy;    FunctionType *DFSanUnimplementedFnTy; @@ -248,7 +255,7 @@ class DataFlowSanitizer : public ModulePass {    DFSanABIList ABIList;    DenseMap<Value *, Function *> UnwrappedFnMap;    AttributeSet ReadOnlyNoneAttrs; -  DenseMap<const Function *, DISubprogram *> FunctionDIs; +  bool DFSanRuntimeShadowMask;    Value *getShadowAddress(Value *Addr, Instruction *Pos);    bool isInstrumented(const Function *F); @@ -362,7 +369,8 @@ llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles,  DataFlowSanitizer::DataFlowSanitizer(      const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(),      void *(*getRetValTLS)()) -    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { +    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), +      DFSanRuntimeShadowMask(false) {    std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));    AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(),                           ClABIListFiles.end()); @@ -420,6 +428,8 @@ bool DataFlowSanitizer::doInitialization(Module &M) {    bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;    bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||                    TargetTriple.getArch() == llvm::Triple::mips64el; +  bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 || +                   TargetTriple.getArch() == llvm::Triple::aarch64_be;    const DataLayout &DL = M.getDataLayout(); @@ -434,6 +444,9 @@ bool DataFlowSanitizer::doInitialization(Module &M) {      ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);    else if (IsMIPS64)      ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); +  // AArch64 supports multiple VMAs and the shadow mask is set at runtime. 
+  else if (IsAArch64) +    DFSanRuntimeShadowMask = true;    else      report_fatal_error("unsupported triple"); @@ -578,7 +591,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,      DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);      Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;      for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) -      DFSF.ValShadowMap[ValAI] = ShadowAI; +      DFSF.ValShadowMap[&*ValAI] = &*ShadowAI;      DFSanVisitor(DFSF).visitCallInst(*CI);      if (!FT->getReturnType()->isVoidTy())        new StoreInst(DFSF.getShadow(RI->getReturnValue()), @@ -592,8 +605,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) {    if (ABIList.isIn(M, "skip"))      return false; -  FunctionDIs = makeSubprogramMap(M); -    if (!GetArgTLSPtr) {      Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);      ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); @@ -606,6 +617,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) {        G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);    } +  ExternalShadowMask = +      Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy); +    DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);    if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {      F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); @@ -643,16 +657,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) {    std::vector<Function *> FnsToInstrument;    llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; -  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { -    if (!i->isIntrinsic() && -        i != DFSanUnionFn && -        i != DFSanCheckedUnionFn && -        i != DFSanUnionLoadFn && -        i != DFSanUnimplementedFn && -        i != DFSanSetLabelFn && -        i != DFSanNonzeroLabelFn && -        i != DFSanVarargWrapperFn) -      FnsToInstrument.push_back(&*i); +  for (Function &i : M) { +    if (!i.isIntrinsic() && +        &i != DFSanUnionFn && +        &i != DFSanCheckedUnionFn && +        &i != DFSanUnionLoadFn && +        &i != DFSanUnimplementedFn && +        &i != DFSanSetLabelFn && +        &i != DFSanNonzeroLabelFn && +        &i != DFSanVarargWrapperFn) +      FnsToInstrument.push_back(&i);    }    // Give function aliases prefixes when necessary, and build wrappers where the @@ -710,7 +724,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {                                      NewFArg = NewF->arg_begin(),                                      FArgEnd = F.arg_end();               FArg != FArgEnd; ++FArg, ++NewFArg) { -          FArg->replaceAllUsesWith(NewFArg); +          FArg->replaceAllUsesWith(&*NewFArg);          }          NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); @@ -750,11 +764,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) {            ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));        F.replaceAllUsesWith(WrappedFnCst); -      // Patch the pointer to LLVM function in debug info descriptor. 
-      auto DI = FunctionDIs.find(&F); -      if (DI != FunctionDIs.end()) -        DI->second->replaceFunction(&F); -        UnwrappedFnMap[WrappedFnCst] = &F;        *i = NewF; @@ -842,7 +851,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {          if (Instruction *I = dyn_cast<Instruction>(V))            Pos = I->getNextNode();          else -          Pos = DFSF.F->getEntryBlock().begin(); +          Pos = &DFSF.F->getEntryBlock().front();          while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))            Pos = Pos->getNextNode();          IRBuilder<> IRB(Pos); @@ -864,7 +873,7 @@ Value *DFSanFunction::getArgTLSPtr() {    if (DFS.ArgTLS)      return ArgTLSPtr = DFS.ArgTLS; -  IRBuilder<> IRB(F->getEntryBlock().begin()); +  IRBuilder<> IRB(&F->getEntryBlock().front());    return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {});  } @@ -874,7 +883,7 @@ Value *DFSanFunction::getRetvalTLS() {    if (DFS.RetvalTLS)      return RetvalTLSPtr = DFS.RetvalTLS; -  IRBuilder<> IRB(F->getEntryBlock().begin()); +  IRBuilder<> IRB(&F->getEntryBlock().front());    return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {});  } @@ -906,7 +915,7 @@ Value *DFSanFunction::getShadow(Value *V) {          Function::arg_iterator i = F->arg_begin();          while (ArgIdx--)            ++i; -        Shadow = i; +        Shadow = &*i;          assert(Shadow->getType() == DFS.ShadowTy);          break;        } @@ -928,9 +937,15 @@ void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {  Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {    assert(Addr != RetvalTLS && "Reinstrumenting?");    IRBuilder<> IRB(Pos); +  Value *ShadowPtrMaskValue; +  if (DFSanRuntimeShadowMask) +    ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask); +  else +    ShadowPtrMaskValue = ShadowPtrMask;    return IRB.CreateIntToPtr(        IRB.CreateMul( -          IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), +          IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), +                        IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),            ShadowPtrMul),        ShadowPtrTy);  } @@ -991,7 +1006,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {      Call->addAttribute(2, Attribute::ZExt);      BasicBlock *Tail = BI->getSuccessor(0); -    PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); +    PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());      Phi->addIncoming(Call, Call->getParent());      Phi->addIncoming(V1, Head); @@ -1105,7 +1120,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,      Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);      BasicBlock *Head = Pos->getParent(); -    BasicBlock *Tail = Head->splitBasicBlock(Pos); +    BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());      if (DomTreeNode *OldNode = DT.getNode(Head)) {        std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); @@ -1475,8 +1490,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {          if (FT->isVarArg()) {            auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,                                             CS.arg_size() - FT->getNumParams()); -          auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", -                                               DFSF.F->getEntryBlock().begin()); +          auto *LabelVAAlloca = new AllocaInst( +              LabelVATy, "labelva", &DFSF.F->getEntryBlock().front());            
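As a rough sketch of what getShadowAddress() above computes, shadow = (app_addr & mask) * multiplier: the x86-64 mask matches the constant visible earlier in this file's hunks, while the multiplier of 2 is assumed for illustration rather than shown in this diff. On AArch64 the mask is instead loaded at run time from __dfsan_shadow_ptr_mask, which is the DFSanRuntimeShadowMask path added here.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t ShadowPtrMask = ~0x700000000000ULL; // x86-64 mask from the diff
      const uint64_t ShadowPtrMul  = 2;                  // assumed label-width factor

      uint64_t AppAddr    = 0x712345678000ULL;
      uint64_t ShadowAddr = (AppAddr & ShadowPtrMask) * ShadowPtrMul;
      std::printf("app 0x%llx -> shadow 0x%llx\n",
                  (unsigned long long)AppAddr, (unsigned long long)ShadowAddr);
      return 0;
    }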
for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {              auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); @@ -1490,7 +1505,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {            if (!DFSF.LabelReturnAlloca) {              DFSF.LabelReturnAlloca =                  new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", -                               DFSF.F->getEntryBlock().begin()); +                               &DFSF.F->getEntryBlock().front());            }            Args.push_back(DFSF.LabelReturnAlloca);          } @@ -1529,13 +1544,14 @@ void DFSanVisitor::visitCallSite(CallSite CS) {    if (!CS.getType()->isVoidTy()) {      if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {        if (II->getNormalDest()->getSinglePredecessor()) { -        Next = II->getNormalDest()->begin(); +        Next = &II->getNormalDest()->front();        } else {          BasicBlock *NewBB =              SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); -        Next = NewBB->begin(); +        Next = &NewBB->front();        }      } else { +      assert(CS->getIterator() != CS->getParent()->end());        Next = CS->getNextNode();      } @@ -1568,7 +1584,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {        unsigned VarArgSize = CS.arg_size() - FT->getNumParams();        ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);        AllocaInst *VarArgShadow = -          new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); +          new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front());        Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));        for (unsigned n = 0; i != e; ++i, ++n) {          IRB.CreateStore( diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9a3ed5c04efc..fa939aee252a 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -138,6 +138,7 @@ namespace {      Module *M;      LLVMContext *Ctx;      SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; +    DenseMap<DISubprogram *, Function *> FnMap;    };  } @@ -309,13 +310,12 @@ namespace {    // object users can construct, the blocks and lines will be rooted here.    class GCOVFunction : public GCOVRecord {     public: -     GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, -                  bool UseCfgChecksum, bool ExitBlockBeforeBody) +     GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os, +                  uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody)           : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),             ReturnBlock(1, os) {        this->os = os; -      Function *F = SP->getFunction();        DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");        uint32_t i = 0; @@ -347,8 +347,8 @@ namespace {        std::string EdgeDestinations;        raw_string_ostream EDOS(EdgeDestinations);        Function *F = Blocks.begin()->first->getParent(); -      for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { -        GCOVBlock &Block = getBlock(I); +      for (BasicBlock &I : *F) { +        GCOVBlock &Block = getBlock(&I);          for (int i = 0, e = Block.OutEdges.size(); i != e; ++i)            EDOS << Block.OutEdges[i]->Number;        } @@ -389,8 +389,8 @@ namespace {        // Emit edges between blocks.      
  if (Blocks.empty()) return;        Function *F = Blocks.begin()->first->getParent(); -      for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { -        GCOVBlock &Block = getBlock(I); +      for (BasicBlock &I : *F) { +        GCOVBlock &Block = getBlock(&I);          if (Block.OutEdges.empty()) continue;          writeBytes(EdgeTag, 4); @@ -405,9 +405,8 @@ namespace {        }        // Emit lines for each block. -      for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { -        getBlock(I).writeOut(); -      } +      for (BasicBlock &I : *F) +        getBlock(&I).writeOut();      }     private: @@ -451,6 +450,12 @@ bool GCOVProfiler::runOnModule(Module &M) {    this->M = &M;    Ctx = &M.getContext(); +  FnMap.clear(); +  for (Function &F : M) { +    if (DISubprogram *SP = F.getSubprogram()) +      FnMap[SP] = &F; +  } +    if (Options.EmitNotes) emitProfileNotes();    if (Options.EmitData) return emitProfileArcs();    return false; @@ -495,7 +500,7 @@ void GCOVProfiler::emitProfileNotes() {      unsigned FunctionIdent = 0;      for (auto *SP : CU->getSubprograms()) { -      Function *F = SP->getFunction(); +      Function *F = FnMap[SP];        if (!F) continue;        if (!functionHasLines(F)) continue; @@ -507,13 +512,13 @@ void GCOVProfiler::emitProfileNotes() {          ++It;        EntryBlock.splitBasicBlock(It); -      Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, +      Funcs.push_back(make_unique<GCOVFunction>(SP, F, &out, FunctionIdent++,                                                  Options.UseCfgChecksum,                                                  Options.ExitBlockBeforeBody));        GCOVFunction &Func = *Funcs.back();        for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { -        GCOVBlock &Block = Func.getBlock(BB); +        GCOVBlock &Block = Func.getBlock(&*BB);          TerminatorInst *TI = BB->getTerminator();          if (int successors = TI->getNumSuccessors()) {            for (int i = 0; i != successors; ++i) { @@ -574,7 +579,7 @@ bool GCOVProfiler::emitProfileArcs() {      auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i));      SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;      for (auto *SP : CU->getSubprograms()) { -      Function *F = SP->getFunction(); +      Function *F = FnMap[SP];        if (!F) continue;        if (!functionHasLines(F)) continue;        if (!Result) Result = true; @@ -605,7 +610,7 @@ bool GCOVProfiler::emitProfileArcs() {          int Successors = isa<ReturnInst>(TI) ? 
1 : TI->getNumSuccessors();          if (Successors) {            if (Successors == 1) { -            IRBuilder<> Builder(BB->getFirstInsertionPt()); +            IRBuilder<> Builder(&*BB->getFirstInsertionPt());              Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,                                                                  Edge);              Value *Count = Builder.CreateLoad(Counter); @@ -625,7 +630,7 @@ bool GCOVProfiler::emitProfileArcs() {              Count = Builder.CreateAdd(Count, Builder.getInt64(1));              Builder.CreateStore(Count, Counter);            } else { -            ComplexEdgePreds.insert(BB); +            ComplexEdgePreds.insert(&*BB);              for (int i = 0; i != Successors; ++i)                ComplexEdgeSuccs.insert(TI->getSuccessor(i));            } @@ -641,13 +646,13 @@ bool GCOVProfiler::emitProfileArcs() {          GlobalVariable *EdgeState = getEdgeStateValue();          for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { -          IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); +          IRBuilder<> Builder(&*ComplexEdgePreds[i + 1]->getFirstInsertionPt());            Builder.CreateStore(Builder.getInt32(i), EdgeState);          }          for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {            // Call runtime to perform increment. -          IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); +          IRBuilder<> Builder(&*ComplexEdgeSuccs[i + 1]->getFirstInsertionPt());            Value *CounterPtrArray =              Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,                                                 i * ComplexEdgePreds.size()); @@ -731,8 +736,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(          IRBuilder<> Builder(Succ);          Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,                                                              Edge + i); -        EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + -                  (Preds.idFor(BB)-1)] = cast<Constant>(Counter); +        EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) + +                  (Preds.idFor(&*BB) - 1)] = cast<Constant>(Counter);        }      }      Edge += Successors; @@ -901,7 +906,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() {    // uint32_t pred = *predecessor;    // if (pred == 0xffffffff) return; -  Argument *Arg = Fn->arg_begin(); +  Argument *Arg = &*Fn->arg_begin();    Arg->setName("predecessor");    Value *Pred = Builder.CreateLoad(Arg, "pred");    Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff)); @@ -912,7 +917,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() {    // uint64_t *counter = counters[pred];    // if (!counter) return;    Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); -  Arg = std::next(Fn->arg_begin()); +  Arg = &*std::next(Fn->arg_begin());    Arg->setName("counters");    Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred);    Value *Counter = Builder.CreateLoad(GEP, "counter"); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 712bf8edc7ea..92e41ee27c09 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -7,18 +7,18 @@  //  //===----------------------------------------------------------------------===//  // -// This pass lowers 
instrprof_increment intrinsics emitted by a frontend for -// profiling. It also builds the data structures and initialization code needed -// for updating execution counts and emitting the profile at runtime. +// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// It also builds the data structures and initialization code needed for +// updating execution counts and emitting the profile at runtime.  //  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Instrumentation.h" -  #include "llvm/ADT/Triple.h"  #include "llvm/IR/IRBuilder.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/Instrumentation.h"  #include "llvm/Transforms/Utils/ModuleUtils.h"  using namespace llvm; @@ -49,7 +49,15 @@ public:  private:    InstrProfOptions Options;    Module *M; -  DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; +  typedef struct PerFunctionProfileData { +    uint32_t NumValueSites[IPVK_Last+1]; +    GlobalVariable* RegionCounters; +    GlobalVariable* DataVar; +    PerFunctionProfileData() : RegionCounters(nullptr), DataVar(nullptr) { +      memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last+1)); +    } +  } PerFunctionProfileData; +  DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap;    std::vector<Value *> UsedVars;    bool isMachO() const { @@ -58,24 +66,30 @@ private:    /// Get the section name for the counter variables.    StringRef getCountersSection() const { -    return isMachO() ? "__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; +    return getInstrProfCountersSectionName(isMachO());    }    /// Get the section name for the name variables.    StringRef getNameSection() const { -    return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; +    return getInstrProfNameSectionName(isMachO());    }    /// Get the section name for the profile data variables.    StringRef getDataSection() const { -    return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; +    return getInstrProfDataSectionName(isMachO());    }    /// Get the section name for the coverage mapping data.    StringRef getCoverageSection() const { -    return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; +    return getInstrProfCoverageSectionName(isMachO());    } +  /// Count the number of instrumented value sites for the function. +  void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins); + +  /// Replace instrprof_value_profile with a call to runtime library. +  void lowerValueProfileInst(InstrProfValueProfileInst *Ins); +    /// Replace instrprof_increment with an increment of the appropriate value.    void lowerIncrement(InstrProfIncrementInst *Inc); @@ -117,20 +131,37 @@ bool InstrProfiling::runOnModule(Module &M) {    bool MadeChange = false;    this->M = &M; -  RegionCounters.clear(); +  ProfileDataMap.clear();    UsedVars.clear(); +  // We did not know how many value sites there would be inside +  // the instrumented function. This is counting the number of instrumented +  // target value sites to enter it as field in the profile data variable.    
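The NumValueSites counts gathered here are later used to flatten a (value kind, per-kind site index) pair into the single index handed to the value-profiling runtime call, as lowerValueProfileInst does further down. A small sketch of that flattening follows; the number of kinds is chosen arbitrarily for the example.

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t kNumValueKinds = 2; // assumed, e.g. indirect calls + one more kind

    uint32_t flattenSiteIndex(const uint32_t NumValueSites[kNumValueKinds],
                              uint32_t ValueKind, uint32_t IndexWithinKind) {
      uint32_t Index = IndexWithinKind;
      for (uint32_t Kind = 0; Kind < ValueKind; ++Kind)
        Index += NumValueSites[Kind]; // skip over all sites of earlier kinds
      return Index;
    }

    int main() {
      uint32_t NumValueSites[kNumValueKinds] = {3, 2}; // 3 sites of kind 0, 2 of kind 1
      // Second site (index 1) of kind 1 lands after the three kind-0 sites: 4.
      std::printf("%u\n", flattenSiteIndex(NumValueSites, 1, 1));
    }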
for (Function &F : M)      for (BasicBlock &BB : F)        for (auto I = BB.begin(), E = BB.end(); I != E;) -        if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { +        if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I++)) +          computeNumValueSiteCounts(Ind); + +  for (Function &F : M) +    for (BasicBlock &BB : F) +      for (auto I = BB.begin(), E = BB.end(); I != E;) { +        auto Instr = I++; +        if (auto *Inc = dyn_cast<InstrProfIncrementInst>(Instr)) {            lowerIncrement(Inc);            MadeChange = true; +        } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { +          lowerValueProfileInst(Ind); +          MadeChange = true;          } -  if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { +      } + +  if (GlobalVariable *Coverage = +          M.getNamedGlobal(getCoverageMappingVarName())) {      lowerCoverageData(Coverage);      MadeChange = true;    } +    if (!MadeChange)      return false; @@ -141,10 +172,59 @@ bool InstrProfiling::runOnModule(Module &M) {    return true;  } +static Constant *getOrInsertValueProfilingCall(Module &M) { +  LLVMContext &Ctx = M.getContext(); +  auto *ReturnTy = Type::getVoidTy(M.getContext()); +  Type *ParamTypes[] = { +#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType +#include "llvm/ProfileData/InstrProfData.inc" +  }; +  auto *ValueProfilingCallTy = +      FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false); +  return M.getOrInsertFunction(getInstrProfValueProfFuncName(), +                               ValueProfilingCallTy); +} + +void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { + +  GlobalVariable *Name = Ind->getName(); +  uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); +  uint64_t Index = Ind->getIndex()->getZExtValue(); +  auto It = ProfileDataMap.find(Name); +  if (It == ProfileDataMap.end()) { +    PerFunctionProfileData PD; +    PD.NumValueSites[ValueKind] = Index + 1; +    ProfileDataMap[Name] = PD; +  } else if (It->second.NumValueSites[ValueKind] <= Index) +    It->second.NumValueSites[ValueKind] = Index + 1; +} + +void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + +  GlobalVariable *Name = Ind->getName(); +  auto It = ProfileDataMap.find(Name); +  assert(It != ProfileDataMap.end() && It->second.DataVar && +    "value profiling detected in function with no counter incerement"); + +  GlobalVariable *DataVar = It->second.DataVar; +  uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); +  uint64_t Index = Ind->getIndex()->getZExtValue(); +  for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind) +    Index += It->second.NumValueSites[Kind]; + +  IRBuilder<> Builder(Ind); +  Value* Args[3] = {Ind->getTargetValue(), +      Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), +      Builder.getInt32(Index)}; +  Ind->replaceAllUsesWith( +      Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args)); +  Ind->eraseFromParent(); +} +  void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {    GlobalVariable *Counters = getOrCreateRegionCounters(Inc); -  IRBuilder<> Builder(Inc->getParent(), *Inc); +  IRBuilder<> Builder(Inc);    uint64_t Index = Inc->getIndex()->getZExtValue();    Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index);    Value *Count = Builder.CreateLoad(Addr, "pgocount"); @@ -172,9 +252,10 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) {      GlobalVariable *Name = 
cast<GlobalVariable>(V);      // If we have region counters for this name, we've already handled it. -    auto It = RegionCounters.find(Name); -    if (It != RegionCounters.end()) -      continue; +    auto It = ProfileDataMap.find(Name); +    if (It != ProfileDataMap.end()) +      if (It->second.RegionCounters) +        continue;      // Move the name variable to the right section.      Name->setSection(getNameSection()); @@ -183,69 +264,108 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) {  }  /// Get the name of a profiling variable for a particular function. -static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) { -  auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer()); -  StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString(); -  return ("__llvm_profile_" + VarName + "_" + Name).str(); +static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { +  StringRef NamePrefix = getInstrProfNameVarPrefix(); +  StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); +  return (Prefix + Name).str(); +} + +static inline bool shouldRecordFunctionAddr(Function *F) { +  // Check the linkage +  if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && +      !F->hasAvailableExternallyLinkage()) +    return true; +  // Check uses of this function for other than direct calls or invokes to it. +  return F->hasAddressTaken(); +} + +static inline Comdat *getOrCreateProfileComdat(Module &M, +                                               InstrProfIncrementInst *Inc) { +  // COFF format requires a COMDAT section to have a key symbol with the same +  // name. The linker targeting COFF also requires that the COMDAT section +  // a section is associated to must precede the associating section. For this +  // reason, we must choose the name var's name as the name of the comdat. +  StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF() +                                ? getInstrProfNameVarPrefix() +                                : getInstrProfComdatPrefix()); +  return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix)));  }  GlobalVariable *  InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { -  GlobalVariable *Name = Inc->getName(); -  auto It = RegionCounters.find(Name); -  if (It != RegionCounters.end()) -    return It->second; - -  // Move the name variable to the right section. Make sure it is placed in the -  // same comdat as its associated function. Otherwise, we may get multiple -  // counters for the same function in certain cases. +  GlobalVariable *NamePtr = Inc->getName(); +  auto It = ProfileDataMap.find(NamePtr); +  PerFunctionProfileData PD; +  if (It != ProfileDataMap.end()) { +    if (It->second.RegionCounters) +      return It->second.RegionCounters; +    PD = It->second; +  } + +  // Move the name variable to the right section. Place them in a COMDAT group +  // if the associated function is a COMDAT. This will make sure that +  // only one copy of counters of the COMDAT function will be emitted after +  // linking.    
Function *Fn = Inc->getParent()->getParent(); -  Name->setSection(getNameSection()); -  Name->setAlignment(1); -  Name->setComdat(Fn->getComdat()); +  Comdat *ProfileVarsComdat = nullptr; +  if (Fn->hasComdat()) +    ProfileVarsComdat = getOrCreateProfileComdat(*M, Inc); +  NamePtr->setSection(getNameSection()); +  NamePtr->setAlignment(1); +  NamePtr->setComdat(ProfileVarsComdat);    uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();    LLVMContext &Ctx = M->getContext();    ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);    // Create the counters variable. -  auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), -                                      Constant::getNullValue(CounterTy), -                                      getVarName(Inc, "counters")); -  Counters->setVisibility(Name->getVisibility()); -  Counters->setSection(getCountersSection()); -  Counters->setAlignment(8); -  Counters->setComdat(Fn->getComdat()); - -  RegionCounters[Inc->getName()] = Counters; +  auto *CounterPtr = +      new GlobalVariable(*M, CounterTy, false, NamePtr->getLinkage(), +                         Constant::getNullValue(CounterTy), +                         getVarName(Inc, getInstrProfCountersVarPrefix())); +  CounterPtr->setVisibility(NamePtr->getVisibility()); +  CounterPtr->setSection(getCountersSection()); +  CounterPtr->setAlignment(8); +  CounterPtr->setComdat(ProfileVarsComdat);    // Create data variable. -  auto *NameArrayTy = Name->getType()->getPointerElementType(); -  auto *Int32Ty = Type::getInt32Ty(Ctx); -  auto *Int64Ty = Type::getInt64Ty(Ctx);    auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); -  auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); - -  Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; +  auto *Int16Ty = Type::getInt16Ty(Ctx); +  auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last+1); +  Type *DataTypes[] = { +    #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, +    #include "llvm/ProfileData/InstrProfData.inc" +  };    auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + +  Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) ? 
+                           ConstantExpr::getBitCast(Fn, Int8PtrTy) : +                           ConstantPointerNull::get(Int8PtrTy); + +  Constant *Int16ArrayVals[IPVK_Last+1]; +  for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) +    Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); +    Constant *DataVals[] = { -      ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), -      ConstantInt::get(Int32Ty, NumCounters), -      ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), -      ConstantExpr::getBitCast(Name, Int8PtrTy), -      ConstantExpr::getBitCast(Counters, Int64PtrTy)}; -  auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), +    #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, +    #include "llvm/ProfileData/InstrProfData.inc" +  }; +  auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(),                                    ConstantStruct::get(DataTy, DataVals), -                                  getVarName(Inc, "data")); -  Data->setVisibility(Name->getVisibility()); +                                  getVarName(Inc, getInstrProfDataVarPrefix())); +  Data->setVisibility(NamePtr->getVisibility());    Data->setSection(getDataSection()); -  Data->setAlignment(8); -  Data->setComdat(Fn->getComdat()); +  Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); +  Data->setComdat(ProfileVarsComdat); + +  PD.RegionCounters = CounterPtr; +  PD.DataVar = Data; +  ProfileDataMap[NamePtr] = PD;    // Mark the data variable as used so that it isn't stripped out.    UsedVars.push_back(Data); -  return Counters; +  return CounterPtr;  }  void InstrProfiling::emitRegistration() { @@ -253,20 +373,24 @@ void InstrProfiling::emitRegistration() {    if (Triple(M->getTargetTriple()).isOSDarwin())      return; +  // Use linker script magic to get data/cnts/name start/end. +  if (Triple(M->getTargetTriple()).isOSLinux() || +      Triple(M->getTargetTriple()).isOSFreeBSD()) +    return; +    // Construct the function.    auto *VoidTy = Type::getVoidTy(M->getContext());    auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());    auto *RegisterFTy = FunctionType::get(VoidTy, false);    auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, -                                     "__llvm_profile_register_functions", M); +                                     getInstrProfRegFuncsName(), M);    RegisterF->setUnnamedAddr(true); -  if (Options.NoRedZone) -    RegisterF->addFnAttr(Attribute::NoRedZone); +  if (Options.NoRedZone) RegisterF->addFnAttr(Attribute::NoRedZone);    auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);    auto *RuntimeRegisterF =        Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, -                       "__llvm_profile_register_function", M); +                       getInstrProfRegFuncName(), M);    IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));    for (Value *Data : UsedVars) @@ -275,26 +399,27 @@ void InstrProfiling::emitRegistration() {  }  void InstrProfiling::emitRuntimeHook() { -  const char *const RuntimeVarName = "__llvm_profile_runtime"; -  const char *const RuntimeUserName = "__llvm_profile_runtime_user"; -  // If the module's provided its own runtime, we don't need to do anything. -  if (M->getGlobalVariable(RuntimeVarName)) +  // We expect the linker to be invoked with -u<hook_var> flag for linux, +  // for which case there is no need to emit the user function. 
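
The "linker script magic" mentioned above refers to linker-provided section bounds on ELF targets: the runtime can walk __start_<section>/__stop_<section> instead of being handed each variable through a registration function. A standalone sketch of that mechanism follows; the section and struct names are placeholders, not the real profile runtime ABI.

    #include <cstdio>

    struct Record { unsigned long long Hash; };

    // Two records dropped into a custom section. On ELF, the linker synthesizes
    // __start_/__stop_ symbols bounding every section whose name is a valid C
    // identifier, so no per-variable registration call is required.
    __attribute__((section("demo_prof_data"), used)) static Record A = {1};
    __attribute__((section("demo_prof_data"), used)) static Record B = {2};

    extern "C" {
    extern Record __start_demo_prof_data[];
    extern Record __stop_demo_prof_data[];
    }

    int main() {
      for (Record *R = __start_demo_prof_data; R != __stop_demo_prof_data; ++R)
        std::printf("hash=%llu\n", R->Hash);
      return 0;
    }
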
+  if (Triple(M->getTargetTriple()).isOSLinux())      return; +  // If the module's provided its own runtime, we don't need to do anything. +  if (M->getGlobalVariable(getInstrProfRuntimeHookVarName())) return; +    // Declare an external variable that will pull in the runtime initialization.    auto *Int32Ty = Type::getInt32Ty(M->getContext());    auto *Var =        new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, -                         nullptr, RuntimeVarName); +                         nullptr, getInstrProfRuntimeHookVarName());    // Make a function that uses it. -  auto *User = -      Function::Create(FunctionType::get(Int32Ty, false), -                       GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); +  auto *User = Function::Create(FunctionType::get(Int32Ty, false), +                                GlobalValue::LinkOnceODRLinkage, +                                getInstrProfRuntimeHookVarUseFuncName(), M);    User->addFnAttr(Attribute::NoInline); -  if (Options.NoRedZone) -    User->addFnAttr(Attribute::NoRedZone); +  if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone);    User->setVisibility(GlobalValue::HiddenVisibility);    IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); @@ -330,26 +455,23 @@ void InstrProfiling::emitUses() {    LLVMUsed =        new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,                           ConstantArray::get(ATy, MergedVars), "llvm.used"); -    LLVMUsed->setSection("llvm.metadata");  }  void InstrProfiling::emitInitialization() {    std::string InstrProfileOutput = Options.InstrProfileOutput; -  Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); -  if (!RegisterF && InstrProfileOutput.empty()) -    return; +  Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName()); +  if (!RegisterF && InstrProfileOutput.empty()) return;    // Create the initialization function.    auto *VoidTy = Type::getVoidTy(M->getContext()); -  auto *F = -      Function::Create(FunctionType::get(VoidTy, false), -                       GlobalValue::InternalLinkage, "__llvm_profile_init", M); +  auto *F = Function::Create(FunctionType::get(VoidTy, false), +                             GlobalValue::InternalLinkage, +                             getInstrProfInitFuncName(), M);    F->setUnnamedAddr(true);    F->addFnAttr(Attribute::NoInline); -  if (Options.NoRedZone) -    F->addFnAttr(Attribute::NoRedZone); +  if (Options.NoRedZone) F->addFnAttr(Attribute::NoRedZone);    // Add the basic block and the necessary calls.    IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); @@ -358,9 +480,8 @@ void InstrProfiling::emitInitialization() {    if (!InstrProfileOutput.empty()) {      auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext());      auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); -    auto *SetNameF = -        Function::Create(SetNameTy, GlobalValue::ExternalLinkage, -                         "__llvm_profile_override_default_filename", M); +    auto *SetNameF = Function::Create(SetNameTy, GlobalValue::ExternalLinkage, +                                      getInstrProfFileOverriderFuncName(), M);      // Create variable for profile name.      
Constant *ProfileNameConst = diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index 27505859100b..a05a5fa09f9a 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -12,12 +12,47 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Instrumentation.h"  #include "llvm-c/Initialization.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h"  #include "llvm/PassRegistry.h"  using namespace llvm; +/// Moves I before IP. Returns new insert point. +static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) { +  // If I is IP, move the insert point down. +  if (I == IP) +    return ++IP; +  // Otherwise, move I before IP and return IP. +  I->moveBefore(&*IP); +  return IP; +} + +/// Instrumentation passes often insert conditional checks into entry blocks. +/// Call this function before splitting the entry block to move instructions +/// that must remain in the entry block up before the split point. Static +/// allocas and llvm.localescape calls, for example, must remain in the entry +/// block. +BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, +                                                    BasicBlock::iterator IP) { +  assert(&BB.getParent()->getEntryBlock() == &BB); +  for (auto I = IP, E = BB.end(); I != E; ++I) { +    bool KeepInEntry = false; +    if (auto *AI = dyn_cast<AllocaInst>(I)) { +      if (AI->isStaticAlloca()) +        KeepInEntry = true; +    } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { +      if (II->getIntrinsicID() == llvm::Intrinsic::localescape) +        KeepInEntry = true; +    } +    if (KeepInEntry) +      IP = moveBeforeInsertPoint(I, IP); +  } +  return IP; +} +  /// initializeInstrumentation - Initialize all passes in the TransformUtils  /// library.  
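
PrepareToSplitEntryBlock above hoists instructions that must stay in the entry block (static allocas, llvm.localescape) in front of the proposed split point, and slides the split point down when it is itself such an instruction. A standalone sketch of the same reordering, modelling the block as a list of opcode names rather than real IR:

    #include <iostream>
    #include <list>
    #include <string>

    int main() {
      std::list<std::string> Block = {"static-alloca", "cmp", "localescape", "call"};
      auto IP = Block.begin();                 // proposed split point
      for (auto I = IP, E = Block.end(); I != E; ++I) {
        bool KeepInEntry = (*I == "static-alloca" || *I == "localescape");
        if (!KeepInEntry)
          continue;
        if (I == IP) {                         // moveBeforeInsertPoint: I == IP,
          ++IP;                                // so just move the insert point down.
          continue;
        }
        Block.splice(IP, Block, I);            // otherwise move I before IP.
      }
      for (auto It = Block.begin(); It != Block.end(); ++It)
        std::cout << (It == IP ? "<split here> " : "") << *It << "\n";
      return 0;
    }
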
void llvm::initializeInstrumentation(PassRegistry &Registry) { @@ -25,6 +60,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {    initializeAddressSanitizerModulePass(Registry);    initializeBoundsCheckingPass(Registry);    initializeGCOVProfilerPass(Registry); +  initializePGOInstrumentationGenPass(Registry); +  initializePGOInstrumentationUsePass(Registry);    initializeInstrProfilingPass(Registry);    initializeMemorySanitizerPass(Registry);    initializeThreadSanitizerPass(Registry); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 286a56330248..5a7bce5a5413 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -148,7 +148,7 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",         cl::desc("poison uninitialized stack variables with a call"),         cl::Hidden, cl::init(false));  static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern", -       cl::desc("poison uninitialized stack variables with the given patter"), +       cl::desc("poison uninitialized stack variables with the given pattern"),         cl::Hidden, cl::init(0xff));  static cl::opt<bool> ClPoisonUndef("msan-poison-undef",         cl::desc("poison undef temps"), @@ -222,10 +222,17 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = {  // x86_64 Linux  static const MemoryMapParams Linux_X86_64_MemoryMapParams = { +#ifdef MSAN_LINUX_X86_64_OLD_MAPPING    0x400000000000,  // AndMask    0,               // XorMask (not used)    0,               // ShadowBase (not used)    0x200000000000,  // OriginBase +#else +  0,               // AndMask (not used) +  0x500000000000,  // XorMask +  0,               // ShadowBase (not used) +  0x100000000000,  // OriginBase +#endif  };  // mips64 Linux @@ -244,6 +251,14 @@ static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {    0x1C0000000000,  // OriginBase  }; +// aarch64 Linux +static const MemoryMapParams Linux_AArch64_MemoryMapParams = { +  0,               // AndMask (not used) +  0x06000000000,   // XorMask +  0,               // ShadowBase (not used) +  0x01000000000,   // OriginBase +}; +  // i386 FreeBSD  static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {    0x000180000000,  // AndMask @@ -266,15 +281,20 @@ static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {  };  static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { -  NULL, +  nullptr,    &Linux_MIPS64_MemoryMapParams,  };  static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = { -  NULL, +  nullptr,    &Linux_PowerPC64_MemoryMapParams,  }; +static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = { +  nullptr, +  &Linux_AArch64_MemoryMapParams, +}; +  static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {    &FreeBSD_I386_MemoryMapParams,    &FreeBSD_X86_64_MemoryMapParams, @@ -353,8 +373,9 @@ class MemorySanitizer : public FunctionPass {    friend struct MemorySanitizerVisitor;    friend struct VarArgAMD64Helper;    friend struct VarArgMIPS64Helper; +  friend struct VarArgAArch64Helper;  }; -}  // namespace +} // anonymous namespace  char MemorySanitizer::ID = 0;  INITIALIZE_PASS(MemorySanitizer, "msan", @@ -377,7 +398,6 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,                              GlobalValue::PrivateLinkage, StrConst, "");  } -  /// \brief 
Insert extern declaration of runtime-provided functions and globals.  void MemorySanitizer::initializeCallbacks(Module &M) {    // Only do this once. @@ -496,6 +516,10 @@ bool MemorySanitizer::doInitialization(Module &M) {          case Triple::ppc64le:            MapParams = Linux_PowerPC_MemoryMapParams.bits64;            break; +        case Triple::aarch64: +        case Triple::aarch64_be: +          MapParams = Linux_ARM_MemoryMapParams.bits64; +          break;          default:            report_fatal_error("unsupported architecture");        } @@ -697,7 +721,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {          Value *Cmp = IRB.CreateICmpNE(              ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp");          Instruction *CheckTerm = SplitBlockAndInsertIfThen( -            Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); +            Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights);          IRBuilder<> IRBNew(CheckTerm);          paintOrigin(IRBNew, updateOrigin(Origin, IRBNew),                      getOriginPtr(Addr, IRBNew, Alignment), StoreSize, @@ -893,16 +917,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {    ///    /// Offset = (Addr & ~AndMask) ^ XorMask    Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) { +    Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy); +      uint64_t AndMask = MS.MapParams->AndMask; -    assert(AndMask != 0 && "AndMask shall be specified"); -    Value *OffsetLong = -      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), -                    ConstantInt::get(MS.IntptrTy, ~AndMask)); +    if (AndMask) +      OffsetLong = +          IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));      uint64_t XorMask = MS.MapParams->XorMask; -    if (XorMask != 0) -      OffsetLong = IRB.CreateXor(OffsetLong, -                                 ConstantInt::get(MS.IntptrTy, XorMask)); +    if (XorMask) +      OffsetLong = +          IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));      return OffsetLong;    } @@ -1339,6 +1364,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {    }    void visitBitCastInst(BitCastInst &I) { +    // Special case: if this is the bitcast (there is exactly 1 allowed) between +    // a musttail call and a ret, don't instrument. New instructions are not +    // allowed after a musttail call. 
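
The reworked mapping above reduces to pure integer arithmetic. Here is a standalone sketch using the new Linux/x86_64 parameters from this hunk (AndMask unused, XorMask 0x500000000000, OriginBase 0x100000000000); only the offset formula is taken from the patch, and the origin computation is simplified (the real code also aligns), so treat this as an illustration rather than the complete MSan address scheme.

    #include <cstdint>
    #include <cstdio>

    struct MemoryMapParams {
      uint64_t AndMask, XorMask, ShadowBase, OriginBase;
    };

    // New Linux/x86_64 parameters from the hunk above.
    static const MemoryMapParams LinuxX86_64 = {0, 0x500000000000ULL, 0,
                                                0x100000000000ULL};

    // Offset = (Addr & ~AndMask) ^ XorMask, each mask applied only if nonzero.
    static uint64_t shadowOffset(uint64_t Addr, const MemoryMapParams &P) {
      uint64_t Off = Addr;
      if (P.AndMask)
        Off &= ~P.AndMask;
      if (P.XorMask)
        Off ^= P.XorMask;
      return Off;
    }

    int main() {
      uint64_t App = 0x7fff12345678ULL;                  // some application address
      uint64_t Shadow = shadowOffset(App, LinuxX86_64);  // ShadowBase is unused (0)
      uint64_t Origin = Shadow + LinuxX86_64.OriginBase;
      std::printf("app=%#llx shadow=%#llx origin=%#llx\n",
                  (unsigned long long)App, (unsigned long long)Shadow,
                  (unsigned long long)Origin);
      return 0;
    }
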
+    if (auto *CI = dyn_cast<CallInst>(I.getOperand(0))) +      if (CI->isMustTailCall()) +        return;      IRBuilder<> IRB(&I);      setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));      setOrigin(&I, getOrigin(&I, 0)); @@ -1570,18 +1601,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        Type *EltTy = Ty->getSequentialElementType();        SmallVector<Constant *, 16> Elements;        for (unsigned Idx = 0; Idx < NumElements; ++Idx) { -        ConstantInt *Elt = -            dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); -        APInt V = Elt->getValue(); -        APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); -        Elements.push_back(ConstantInt::get(EltTy, V2)); +        if (ConstantInt *Elt = +                dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) { +          APInt V = Elt->getValue(); +          APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); +          Elements.push_back(ConstantInt::get(EltTy, V2)); +        } else { +          Elements.push_back(ConstantInt::get(EltTy, 1)); +        }        }        ShadowMul = ConstantVector::get(Elements);      } else { -      ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); -      APInt V = Elt->getValue(); -      APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); -      ShadowMul = ConstantInt::get(Elt->getType(), V2); +      if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) { +        APInt V = Elt->getValue(); +        APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); +        ShadowMul = ConstantInt::get(Ty, V2); +      } else { +        ShadowMul = ConstantInt::get(Ty, 1); +      }      }      IRBuilder<> IRB(&I); @@ -1730,25 +1767,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {    /// \brief Instrument signed relational comparisons.    /// -  /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by -  /// propagating the highest bit of the shadow. Everything else is delegated -  /// to handleShadowOr(). +  /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest +  /// bit of the shadow. Everything else is delegated to handleShadowOr().    
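
A quick standalone check of the claim in the comment above: for a signed integer, the outcomes of x<0, x>=0, x<=-1 and x>-1 depend only on the sign bit, which is why only the most significant bit of the operand's shadow needs to be propagated.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int32_t Vals[] = {-7, -1, 0, 1, 42};
      for (int32_t X : Vals) {
        int SignBit = (static_cast<uint32_t>(X) >> 31) & 1;
        // Each predicate equals SignBit or its negation; the low 31 bits of X
        // (and hence of its shadow) cannot change the result.
        std::printf("x=%3d sign=%d  x<0:%d x>=0:%d x<=-1:%d x>-1:%d\n", X, SignBit,
                    X < 0, X >= 0, X <= -1, X > -1);
      }
      return 0;
    }
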
void handleSignedRelationalComparison(ICmpInst &I) { -    Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); -    Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); -    Value* op = nullptr; -    CmpInst::Predicate pre = I.getPredicate(); -    if (constOp0 && constOp0->isNullValue() && -        (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { -      op = I.getOperand(1); -    } else if (constOp1 && constOp1->isNullValue() && -               (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) { +    Constant *constOp; +    Value *op = nullptr; +    CmpInst::Predicate pre; +    if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {        op = I.getOperand(0); +      pre = I.getPredicate(); +    } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) { +      op = I.getOperand(1); +      pre = I.getSwappedPredicate(); +    } else { +      handleShadowOr(I); +      return;      } -    if (op) { + +    if ((constOp->isNullValue() && +         (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) || +        (constOp->isAllOnesValue() && +         (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {        IRBuilder<> IRB(&I); -      Value* Shadow = -        IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt"); +      Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), +                                        "_msprop_icmp_s");        setShadow(&I, Shadow);        setOrigin(&I, getOrigin(op));      } else { @@ -1860,25 +1902,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      VAHelper->visitVACopyInst(I);    } -  enum IntrinsicKind { -    IK_DoesNotAccessMemory, -    IK_OnlyReadsMemory, -    IK_WritesMemory -  }; - -  static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) { -    const int DoesNotAccessMemory = IK_DoesNotAccessMemory; -    const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory; -    const int OnlyReadsMemory = IK_OnlyReadsMemory; -    const int OnlyAccessesArgumentPointees = IK_WritesMemory; -    const int UnknownModRefBehavior = IK_WritesMemory; -#define GET_INTRINSIC_MODREF_BEHAVIOR -#define ModRefBehavior IntrinsicKind -#include "llvm/IR/Intrinsics.gen" -#undef ModRefBehavior -#undef GET_INTRINSIC_MODREF_BEHAVIOR -  } -    /// \brief Handle vector store-like intrinsics.    ///    /// Instrument intrinsics that look like a simple SIMD store: writes memory, @@ -1978,17 +2001,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      if (NumArgOperands == 0)        return false; -    Intrinsic::ID iid = I.getIntrinsicID(); -    IntrinsicKind IK = getIntrinsicKind(iid); -    bool OnlyReadsMemory = IK == IK_OnlyReadsMemory; -    bool WritesMemory = IK == IK_WritesMemory; -    assert(!(OnlyReadsMemory && WritesMemory)); -      if (NumArgOperands == 2 &&          I.getArgOperand(0)->getType()->isPointerTy() &&          I.getArgOperand(1)->getType()->isVectorTy() &&          I.getType()->isVoidTy() && -        WritesMemory) { +        !I.onlyReadsMemory()) {        // This looks like a vector store.        return handleVectorStoreIntrinsic(I);      } @@ -1996,12 +2013,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      if (NumArgOperands == 1 &&          I.getArgOperand(0)->getType()->isPointerTy() &&          I.getType()->isVectorTy() && -        OnlyReadsMemory) { +        I.onlyReadsMemory()) {        // This looks like a vector load.        
return handleVectorLoadIntrinsic(I);      } -    if (!OnlyReadsMemory && !WritesMemory) +    if (I.doesNotAccessMemory())        if (maybeHandleSimpleNomemIntrinsic(I))          return true; @@ -2493,13 +2510,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      // Now, get the shadow for the RetVal.      if (!I.getType()->isSized()) return; +    // Don't emit the epilogue for musttail call returns. +    if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return;      IRBuilder<> IRBBefore(&I);      // Until we have full dynamic coverage, make sure the retval shadow is 0.      Value *Base = getShadowPtrForRetval(&I, IRBBefore);      IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); -    Instruction *NextInsn = nullptr; +    BasicBlock::iterator NextInsn;      if (CS.isCall()) { -      NextInsn = I.getNextNode(); +      NextInsn = ++I.getIterator(); +      assert(NextInsn != I.getParent()->end());      } else {        BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();        if (!NormalDest->getSinglePredecessor()) { @@ -2511,10 +2531,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {          return;        }        NextInsn = NormalDest->getFirstInsertionPt(); -      assert(NextInsn && +      assert(NextInsn != NormalDest->end() &&               "Could not find insertion point for retval shadow load");      } -    IRBuilder<> IRBAfter(NextInsn); +    IRBuilder<> IRBAfter(&*NextInsn);      Value *RetvalShadow =        IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter),                                   kShadowTLSAlignment, "_msret"); @@ -2523,10 +2543,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));    } +  bool isAMustTailRetVal(Value *RetVal) { +    if (auto *I = dyn_cast<BitCastInst>(RetVal)) { +      RetVal = I->getOperand(0); +    } +    if (auto *I = dyn_cast<CallInst>(RetVal)) { +      return I->isMustTailCall(); +    } +    return false; +  } +    void visitReturnInst(ReturnInst &I) {      IRBuilder<> IRB(&I);      Value *RetVal = I.getReturnValue();      if (!RetVal) return; +    // Don't emit the epilogue for musttail call returns. +    if (isAMustTailRetVal(RetVal)) return;      Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);      if (CheckReturnValue) {        insertShadowCheck(RetVal, &I); @@ -2653,6 +2685,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      setOrigin(&I, getCleanOrigin());    } +  void visitCatchSwitchInst(CatchSwitchInst &I) { +    setShadow(&I, getCleanShadow(&I)); +    setOrigin(&I, getCleanOrigin()); +  } + +  void visitFuncletPadInst(FuncletPadInst &I) { +    setShadow(&I, getCleanShadow(&I)); +    setOrigin(&I, getCleanOrigin()); +  } +    void visitGetElementPtrInst(GetElementPtrInst &I) {      handleShadowOr(I);    } @@ -2696,6 +2738,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      // Nothing to do here.    } +  void visitCleanupReturnInst(CleanupReturnInst &CRI) { +    DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n"); +    // Nothing to do here. +  } + +  void visitCatchReturnInst(CatchReturnInst &CRI) { +    DEBUG(dbgs() << "CatchReturn: " << CRI << "\n"); +    // Nothing to do here. +  } +    void visitInstruction(Instruction &I) {      // Everything else: stop propagating and check for poisoned shadow.      
if (ClDumpStrictInstructions) @@ -2808,6 +2860,8 @@ struct VarArgAMD64Helper : public VarArgHelper {    }    void visitVAStartInst(VAStartInst &I) override { +    if (F.getCallingConv() == CallingConv::X86_64_Win64) +      return;      IRBuilder<> IRB(&I);      VAStartInstrumentationList.push_back(&I);      Value *VAListTag = I.getArgOperand(0); @@ -2820,6 +2874,8 @@ struct VarArgAMD64Helper : public VarArgHelper {    }    void visitVACopyInst(VACopyInst &I) override { +    if (F.getCallingConv() == CallingConv::X86_64_Win64) +      return;      IRBuilder<> IRB(&I);      Value *VAListTag = I.getArgOperand(0);      Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); @@ -2979,6 +3035,242 @@ struct VarArgMIPS64Helper : public VarArgHelper {    }  }; + +/// \brief AArch64-specific implementation of VarArgHelper. +struct VarArgAArch64Helper : public VarArgHelper { +  static const unsigned kAArch64GrArgSize = 56; +  static const unsigned kAArch64VrArgSize = 128; + +  static const unsigned AArch64GrBegOffset = 0; +  static const unsigned AArch64GrEndOffset = kAArch64GrArgSize; +  // Make VR space aligned to 16 bytes. +  static const unsigned AArch64VrBegOffset = AArch64GrEndOffset + 8; +  static const unsigned AArch64VrEndOffset = AArch64VrBegOffset +                                             + kAArch64VrArgSize; +  static const unsigned AArch64VAEndOffset = AArch64VrEndOffset; + +  Function &F; +  MemorySanitizer &MS; +  MemorySanitizerVisitor &MSV; +  Value *VAArgTLSCopy; +  Value *VAArgOverflowSize; + +  SmallVector<CallInst*, 16> VAStartInstrumentationList; + +  VarArgAArch64Helper(Function &F, MemorySanitizer &MS, +                    MemorySanitizerVisitor &MSV) +    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), +      VAArgOverflowSize(nullptr) {} + +  enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + +  ArgKind classifyArgument(Value* arg) { +    Type *T = arg->getType(); +    if (T->isFPOrFPVectorTy()) +      return AK_FloatingPoint; +    if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) +        || (T->isPointerTy())) +      return AK_GeneralPurpose; +    return AK_Memory; +  } + +  // The instrumentation stores the argument shadow in a non ABI-specific +  // format because it does not know which argument is named (since Clang, +  // like x86_64 case, lowers the va_args in the frontend and this pass only +  // sees the low level code that deals with va_list internals). +  // The first seven GR registers are saved in the first 56 bytes of the +  // va_arg tls arra, followers by the first 8 FP/SIMD registers, and then +  // the remaining arguments. +  // Using constant offset within the va_arg TLS array allows fast copy +  // in the finalize instrumentation. 
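
The layout described in the comment above boils down to three fixed regions inside the per-call va_arg TLS area. A standalone sketch of the slot assignment using the kAArch64* constants from this hunk: 8-byte general-purpose slots in [0, 56), 16-byte FP/SIMD slots in [64, 192), and an overflow area from 192 on.

    #include <cstdio>

    enum ArgKind { GeneralPurpose, FloatingPoint, Memory };

    int main() {
      const unsigned GrEnd = 56;            // kAArch64GrArgSize
      const unsigned VrBeg = GrEnd + 8;     // VR area aligned to 16 bytes
      const unsigned VrEnd = VrBeg + 128;   // kAArch64VrArgSize
      unsigned Gr = 0, Vr = VrBeg, Overflow = VrEnd;

      const ArgKind Args[] = {GeneralPurpose, FloatingPoint, GeneralPurpose, Memory};
      for (ArgKind K : Args) {
        unsigned Slot;
        if (K == GeneralPurpose && Gr < GrEnd) {
          Slot = Gr; Gr += 8;
        } else if (K == FloatingPoint && Vr < VrEnd) {
          Slot = Vr; Vr += 16;
        } else {                            // demoted to memory / overflow area
          Slot = Overflow; Overflow += 8;
        }
        std::printf("argument shadow stored at va_arg TLS offset %u\n", Slot);
      }
      return 0;
    }
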
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { +    unsigned GrOffset = AArch64GrBegOffset; +    unsigned VrOffset = AArch64VrBegOffset; +    unsigned OverflowOffset = AArch64VAEndOffset; + +    const DataLayout &DL = F.getParent()->getDataLayout(); +    for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end(); +         ArgIt != End; ++ArgIt) { +      Value *A = *ArgIt; +      ArgKind AK = classifyArgument(A); +      if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset) +        AK = AK_Memory; +      if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset) +        AK = AK_Memory; +      Value *Base; +      switch (AK) { +        case AK_GeneralPurpose: +          Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset); +          GrOffset += 8; +          break; +        case AK_FloatingPoint: +          Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset); +          VrOffset += 16; +          break; +        case AK_Memory: +          uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); +          Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); +          OverflowOffset += RoundUpToAlignment(ArgSize, 8); +          break; +      } +      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); +    } +    Constant *OverflowSize = +      ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); +    IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); +  } + +  /// Compute the shadow address for a given va_arg. +  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, +                                   int ArgOffset) { +    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); +    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); +    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), +                              "_msarg"); +  } + +  void visitVAStartInst(VAStartInst &I) override { +    IRBuilder<> IRB(&I); +    VAStartInstrumentationList.push_back(&I); +    Value *VAListTag = I.getArgOperand(0); +    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); +    // Unpoison the whole __va_list_tag. +    // FIXME: magic ABI constants (size of va_list). +    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), +                     /* size */32, /* alignment */8, false); +  } + +  void visitVACopyInst(VACopyInst &I) override { +    IRBuilder<> IRB(&I); +    Value *VAListTag = I.getArgOperand(0); +    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); +    // Unpoison the whole __va_list_tag. +    // FIXME: magic ABI constants (size of va_list). +    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), +                     /* size */32, /* alignment */8, false); +  } + +  // Retrieve a va_list field of 'void*' size. +  Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { +    Value *SaveAreaPtrPtr = +      IRB.CreateIntToPtr( +        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), +                      ConstantInt::get(MS.IntptrTy, offset)), +        Type::getInt64PtrTy(*MS.C)); +    return IRB.CreateLoad(SaveAreaPtrPtr); +  } + +  // Retrieve a va_list field of 'int' size. 
+  Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { +    Value *SaveAreaPtr = +      IRB.CreateIntToPtr( +        IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), +                      ConstantInt::get(MS.IntptrTy, offset)), +        Type::getInt32PtrTy(*MS.C)); +    Value *SaveArea32 = IRB.CreateLoad(SaveAreaPtr); +    return IRB.CreateSExt(SaveArea32, MS.IntptrTy); +  } + +  void finalizeInstrumentation() override { +    assert(!VAArgOverflowSize && !VAArgTLSCopy && +           "finalizeInstrumentation called twice"); +    if (!VAStartInstrumentationList.empty()) { +      // If there is a va_start in this function, make a backup copy of +      // va_arg_tls somewhere in the function entry block. +      IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); +      VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); +      Value *CopySize = +        IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), +                      VAArgOverflowSize); +      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); +      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); +    } + +    Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); +    Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize); + +    // Instrument va_start, copy va_list shadow from the backup copy of +    // the TLS contents. +    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { +      CallInst *OrigInst = VAStartInstrumentationList[i]; +      IRBuilder<> IRB(OrigInst->getNextNode()); + +      Value *VAListTag = OrigInst->getArgOperand(0); + +      // The variadic ABI for AArch64 creates two areas to save the incoming +      // argument registers (one for 64-bit general register xn-x7 and another +      // for 128-bit FP/SIMD vn-v7). +      // We need then to propagate the shadow arguments on both regions +      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'. +      // The remaning arguments are saved on shadow for 'va::stack'. +      // One caveat is it requires only to propagate the non-named arguments, +      // however on the call site instrumentation 'all' the arguments are +      // saved. So to copy the shadow values from the va_arg TLS array +      // we need to adjust the offset for both GR and VR fields based on +      // the __{gr,vr}_offs value (since they are stores based on incoming +      // named arguments). + +      // Read the stack pointer from the va_list. +      Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0); + +      // Read both the __gr_top and __gr_off and add them up. +      Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8); +      Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24); + +      Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea); + +      // Read both the __vr_top and __vr_off and add them up. +      Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16); +      Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28); + +      Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea); + +      // It does not know how many named arguments is being used and, on the +      // callsite all the arguments were saved.  Since __gr_off is defined as +      // '0 - ((8 - named_gr) * 8)', the idea is to just propagate the variadic +      // argument by ignoring the bytes of shadow from named arguments. 
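
The __gr_offs adjustment described in the comment above is plain arithmetic: since __gr_offs is -(8 - named_gr) * 8, adding it to the general-purpose save-area size skips exactly the shadow bytes of the named arguments. A standalone sketch, assuming kAArch64GrArgSize == 56 as defined in this hunk:

    #include <cstdio>

    int main() {
      const long long GrArgSize = 56;                  // kAArch64GrArgSize
      for (int NamedGr = 1; NamedGr <= 7; ++NamedGr) {
        long long GrOffs = -(8 - NamedGr) * 8;         // value held in __gr_offs
        long long CopyFrom = GrArgSize + GrOffs;       // offset into the TLS copy
        long long CopyBytes = GrArgSize - CopyFrom;    // variadic shadow to copy
        std::printf("named_gr=%d  __gr_offs=%lld  copy %lld bytes from offset %lld\n",
                    NamedGr, GrOffs, CopyBytes, CopyFrom);
      }
      return 0;
    }
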
+      Value *GrRegSaveAreaShadowPtrOff = +        IRB.CreateAdd(GrArgSize, GrOffSaveArea); + +      Value *GrRegSaveAreaShadowPtr = +        MSV.getShadowPtr(GrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + +      Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, +                                              GrRegSaveAreaShadowPtrOff); +      Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff); + +      IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8); + +      // Again, but for FP/SIMD values. +      Value *VrRegSaveAreaShadowPtrOff = +          IRB.CreateAdd(VrArgSize, VrOffSaveArea); + +      Value *VrRegSaveAreaShadowPtr = +        MSV.getShadowPtr(VrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + +      Value *VrSrcPtr = IRB.CreateInBoundsGEP( +        IRB.getInt8Ty(), +        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, +                              IRB.getInt32(AArch64VrBegOffset)), +        VrRegSaveAreaShadowPtrOff); +      Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); + +      IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8); + +      // And finally for remaining arguments. +      Value *StackSaveAreaShadowPtr = +        MSV.getShadowPtr(StackSaveAreaPtr, IRB.getInt8Ty(), IRB); + +      Value *StackSrcPtr = +        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, +                              IRB.getInt32(AArch64VAEndOffset)); + +      IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr, +                       VAArgOverflowSize, 16); +    } +  } +}; +  /// \brief A no-op implementation of VarArgHelper.  struct VarArgNoOpHelper : public VarArgHelper {    VarArgNoOpHelper(Function &F, MemorySanitizer &MS, @@ -3003,11 +3295,13 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,    else if (TargetTriple.getArch() == llvm::Triple::mips64 ||             TargetTriple.getArch() == llvm::Triple::mips64el)      return new VarArgMIPS64Helper(Func, Msan, Visitor); +  else if (TargetTriple.getArch() == llvm::Triple::aarch64) +    return new VarArgAArch64Helper(Func, Msan, Visitor);    else      return new VarArgNoOpHelper(Func, Msan, Visitor);  } -}  // namespace +} // anonymous namespace  bool MemorySanitizer::runOnFunction(Function &F) {    if (&F == MsanCtorFunction) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp new file mode 100644 index 000000000000..4b59b93b325f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -0,0 +1,718 @@ +//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===// +// +//                      The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements PGO instrumentation using a minimum spanning tree based +// on the following paper: +//   [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points +//   for program frequency counts. BIT Numerical Mathematics 1973, Volume 13, +//   Issue 3, pp 313-322 +// The idea of the algorithm based on the fact that for each node (except for +// the entry and exit), the sum of incoming edge counts equals the sum of +// outgoing edge counts. 
The count of edge on spanning tree can be derived from +// those edges not on the spanning tree. Knuth proves this method instruments +// the minimum number of edges. +// +// The minimal spanning tree here is actually a maximum weight tree -- on-tree +// edges have higher frequencies (more likely to execute). The idea is to +// instrument those less frequently executed edges to reduce the runtime +// overhead of instrumented binaries. +// +// This file contains two passes: +// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge +// count profile, and +// (2) Pass PGOInstrumentationUse which reads the edge count profile and +// annotates the branch weights. +// To get the precise counter information, These two passes need to invoke at +// the same compilation point (so they see the same IR). For pass +// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For +// pass PGOInstrumentationUse, the real work in done in class PGOUseFunc and +// the profile is opened in module level and passed to each PGOUseFunc instance. +// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put +// in class FuncPGOInstrumentation. +// +// Class PGOEdge represents a CFG edge and some auxiliary information. Class +// BBInfo contains auxiliary information for each BB. These two classes are used +// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived +// class of PGOEdge and BBInfo, respectively. They contains extra data structure +// used in populating profile counters. +// The MST implementation is in Class CFGMST (CFGMST.h). +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "CFGMST.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProfReader.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/JamCRC.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "pgo-instrumentation" + +STATISTIC(NumOfPGOInstrument, "Number of edges instrumented."); +STATISTIC(NumOfPGOEdge, "Number of edges."); +STATISTIC(NumOfPGOBB, "Number of basic-blocks."); +STATISTIC(NumOfPGOSplit, "Number of critical edge splits."); +STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts."); +STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile."); +STATISTIC(NumOfPGOMissing, "Number of functions without profile."); + +// Command line option to specify the file to read profile from. This is +// mainly used for testing. +static cl::opt<std::string> +    PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, +                       cl::value_desc("filename"), +                       cl::desc("Specify the path of profile data file. 
This is" +                                "mainly for test purpose.")); + +namespace { +class PGOInstrumentationGen : public ModulePass { +public: +  static char ID; + +  PGOInstrumentationGen() : ModulePass(ID) { +    initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry()); +  } + +  const char *getPassName() const override { +    return "PGOInstrumentationGenPass"; +  } + +private: +  bool runOnModule(Module &M) override; + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<BlockFrequencyInfoWrapperPass>(); +  } +}; + +class PGOInstrumentationUse : public ModulePass { +public: +  static char ID; + +  // Provide the profile filename as the parameter. +  PGOInstrumentationUse(std::string Filename = "") +      : ModulePass(ID), ProfileFileName(Filename) { +    if (!PGOTestProfileFile.empty()) +      ProfileFileName = PGOTestProfileFile; +    initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry()); +  } + +  const char *getPassName() const override { +    return "PGOInstrumentationUsePass"; +  } + +private: +  std::string ProfileFileName; +  std::unique_ptr<IndexedInstrProfReader> PGOReader; +  bool runOnModule(Module &M) override; + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<BlockFrequencyInfoWrapperPass>(); +  } +}; +} // end anonymous namespace + +char PGOInstrumentationGen::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen", +                      "PGO instrumentation.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen", +                    "PGO instrumentation.", false, false) + +ModulePass *llvm::createPGOInstrumentationGenPass() { +  return new PGOInstrumentationGen(); +} + +char PGOInstrumentationUse::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use", +                      "Read PGO instrumentation profile.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use", +                    "Read PGO instrumentation profile.", false, false) + +ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) { +  return new PGOInstrumentationUse(Filename.str()); +} + +namespace { +/// \brief An MST based instrumentation for PGO +/// +/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO +/// in the function level. +struct PGOEdge { +  // This class implements the CFG edges. Note the CFG can be a multi-graph. +  // So there might be multiple edges with same SrcBB and DestBB. +  const BasicBlock *SrcBB; +  const BasicBlock *DestBB; +  uint64_t Weight; +  bool InMST; +  bool Removed; +  bool IsCritical; +  PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) +      : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false), +        IsCritical(false) {} +  // Return the information string of an edge. +  const std::string infoString() const { +    return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") + +            (IsCritical ? "c" : " ") + "  W=" + Twine(Weight)).str(); +  } +}; + +// This class stores the auxiliary information for each BB. 
+struct BBInfo { +  BBInfo *Group; +  uint32_t Index; +  uint32_t Rank; + +  BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {} + +  // Return the information string of this object. +  const std::string infoString() const { +    return (Twine("Index=") + Twine(Index)).str(); +  } +}; + +// This class implements the CFG edges. Note the CFG can be a multi-graph. +template <class Edge, class BBInfo> class FuncPGOInstrumentation { +private: +  Function &F; +  void computeCFGHash(); + +public: +  std::string FuncName; +  GlobalVariable *FuncNameVar; +  // CFG hash value for this function. +  uint64_t FunctionHash; + +  // The Minimum Spanning Tree of function CFG. +  CFGMST<Edge, BBInfo> MST; + +  // Give an edge, find the BB that will be instrumented. +  // Return nullptr if there is no BB to be instrumented. +  BasicBlock *getInstrBB(Edge *E); + +  // Return the auxiliary BB information. +  BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); } + +  // Dump edges and BB information. +  void dumpInfo(std::string Str = "") const { +    MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " + +                          Twine(FunctionHash) + "\t" + Str); +  } + +  FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false, +                         BranchProbabilityInfo *BPI = nullptr, +                         BlockFrequencyInfo *BFI = nullptr) +      : F(Func), FunctionHash(0), MST(F, BPI, BFI) { +    FuncName = getPGOFuncName(F); +    computeCFGHash(); +    DEBUG(dumpInfo("after CFGMST")); + +    NumOfPGOBB += MST.BBInfos.size(); +    for (auto &E : MST.AllEdges) { +      if (E->Removed) +        continue; +      NumOfPGOEdge++; +      if (!E->InMST) +        NumOfPGOInstrument++; +    } + +    if (CreateGlobalVar) +      FuncNameVar = createPGOFuncNameVar(F, FuncName); +  }; +}; + +// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index +// value of each BB in the CFG. The higher 32 bits record the number of edges. +template <class Edge, class BBInfo> +void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { +  std::vector<char> Indexes; +  JamCRC JC; +  for (auto &BB : F) { +    const TerminatorInst *TI = BB.getTerminator(); +    for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { +      BasicBlock *Succ = TI->getSuccessor(I); +      uint32_t Index = getBBInfo(Succ).Index; +      for (int J = 0; J < 4; J++) +        Indexes.push_back((char)(Index >> (J * 8))); +    } +  } +  JC.update(Indexes); +  FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); +} + +// Given a CFG E to be instrumented, find which BB to place the instrumented +// code. The function will split the critical edge if necessary. +template <class Edge, class BBInfo> +BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) { +  if (E->InMST || E->Removed) +    return nullptr; + +  BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); +  BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); +  // For a fake edge, instrument the real BB. +  if (SrcBB == nullptr) +    return DestBB; +  if (DestBB == nullptr) +    return SrcBB; + +  // Instrument the SrcBB if it has a single successor, +  // otherwise, the DestBB if this is not a critical edge. +  TerminatorInst *TI = SrcBB->getTerminator(); +  if (TI->getNumSuccessors() <= 1) +    return SrcBB; +  if (!E->IsCritical) +    return DestBB; + +  // For a critical edge, we have to split. Instrument the newly +  // created BB. 
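
The placement rule spelled out in the comments above amounts to a small decision: counting on either endpoint of a critical edge would also count executions of other paths, hence the split. A standalone sketch of that decision, using the usual definition of a critical edge (multiple successors on the source and multiple predecessors on the destination):

    #include <cstdio>

    // Where should the counter increment for edge Src->Dest live?
    static const char *placeCounter(unsigned SrcSuccessors,
                                    unsigned DestPredecessors) {
      if (SrcSuccessors <= 1)
        return "source block";               // the edge is Src's only way out
      if (DestPredecessors <= 1)
        return "destination block";          // not a critical edge
      return "new block from splitting the critical edge";
    }

    int main() {
      std::printf("%s\n", placeCounter(1, 3));
      std::printf("%s\n", placeCounter(2, 1));
      std::printf("%s\n", placeCounter(2, 2));
      return 0;
    }
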
+  NumOfPGOSplit++; +  DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> " +               << getBBInfo(DestBB).Index << "\n"); +  unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); +  BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); +  assert(InstrBB && "Critical edge is not split"); + +  E->Removed = true; +  return InstrBB; +} + +// Visit all edge and instrument the edges not in MST. +// Critical edges will be split. +static void instrumentOneFunc(Function &F, Module *M, +                              BranchProbabilityInfo *BPI, +                              BlockFrequencyInfo *BFI) { +  unsigned NumCounters = 0; +  FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, true, BPI, BFI); +  for (auto &E : FuncInfo.MST.AllEdges) { +    if (!E->InMST && !E->Removed) +      NumCounters++; +  } + +  uint32_t I = 0; +  for (auto &E : FuncInfo.MST.AllEdges) { +    BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get()); +    if (!InstrBB) +      continue; + +    IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt()); +    assert(Builder.GetInsertPoint() != InstrBB->end() && +           "Cannot get the Instrumentation point"); +    Type *I8PtrTy = Type::getInt8PtrTy(M->getContext()); +    Builder.CreateCall( +        Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), +        {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), +         Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters), +         Builder.getInt32(I++)}); +  } +} + +// This class represents a CFG edge in profile use compilation. +struct PGOUseEdge : public PGOEdge { +  bool CountValid; +  uint64_t CountValue; +  PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) +      : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {} + +  // Set edge count value +  void setEdgeCount(uint64_t Value) { +    CountValue = Value; +    CountValid = true; +  } + +  // Return the information string for this object. +  const std::string infoString() const { +    if (!CountValid) +      return PGOEdge::infoString(); +    return (Twine(PGOEdge::infoString()) + "  Count=" + Twine(CountValue)).str(); +  } +}; + +typedef SmallVector<PGOUseEdge *, 2> DirectEdges; + +// This class stores the auxiliary information for each BB. +struct UseBBInfo : public BBInfo { +  uint64_t CountValue; +  bool CountValid; +  int32_t UnknownCountInEdge; +  int32_t UnknownCountOutEdge; +  DirectEdges InEdges; +  DirectEdges OutEdges; +  UseBBInfo(unsigned IX) +      : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0), +        UnknownCountOutEdge(0) {} +  UseBBInfo(unsigned IX, uint64_t C) +      : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0), +        UnknownCountOutEdge(0) {} + +  // Set the profile count value for this BB. +  void setBBInfoCount(uint64_t Value) { +    CountValue = Value; +    CountValid = true; +  } + +  // Return the information string of this object. +  const std::string infoString() const { +    if (!CountValid) +      return BBInfo::infoString(); +    return (Twine(BBInfo::infoString()) + "  Count=" + Twine(CountValue)).str(); +  } +}; + +// Sum up the count values for all the edges. 
+static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) { +  uint64_t Total = 0; +  for (auto &E : Edges) { +    if (E->Removed) +      continue; +    Total += E->CountValue; +  } +  return Total; +} + +class PGOUseFunc { +private: +  Function &F; +  Module *M; +  // This member stores the shared information with class PGOGenFunc. +  FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; + +  // Return the auxiliary BB information. +  UseBBInfo &getBBInfo(const BasicBlock *BB) const { +    return FuncInfo.getBBInfo(BB); +  } + +  // The maximum count value in the profile. This is only used in PGO use +  // compilation. +  uint64_t ProgramMaxCount; + +  // Find the Instrumented BB and set the value. +  void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile); + +  // Set the edge counter value for the unknown edge -- there should be only +  // one unknown edge. +  void setEdgeCount(DirectEdges &Edges, uint64_t Value); + +  // Return FuncName string; +  const std::string getFuncName() const { return FuncInfo.FuncName; } + +  // Set the hot/cold inline hints based on the count values. +  // FIXME: This function should be removed once the functionality in +  // the inliner is implemented. +  void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { +    if (ProgramMaxCount == 0) +      return; +    // Threshold of the hot functions. +    const BranchProbability HotFunctionThreshold(1, 100); +    // Threshold of the cold functions. +    const BranchProbability ColdFunctionThreshold(2, 10000); +    if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) +      F.addFnAttr(llvm::Attribute::InlineHint); +    else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) +      F.addFnAttr(llvm::Attribute::Cold); +  } + +public: +  PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr, +             BlockFrequencyInfo *BFI = nullptr) +      : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {} + +  // Read counts for the instrumented BB from profile. +  bool readCounters(IndexedInstrProfReader *PGOReader); + +  // Populate the counts for all BBs. +  void populateCounters(); + +  // Set the branch weights based on the count values. +  void setBranchWeights(); +}; + +// Visit all the edges and assign the count value for the instrumented +// edges and the BB. +void PGOUseFunc::setInstrumentedCounts( +    const std::vector<uint64_t> &CountFromProfile) { + +  // Use a worklist as we will update the vector during the iteration. +  std::vector<PGOUseEdge *> WorkList; +  for (auto &E : FuncInfo.MST.AllEdges) +    WorkList.push_back(E.get()); + +  uint32_t I = 0; +  for (auto &E : WorkList) { +    BasicBlock *InstrBB = FuncInfo.getInstrBB(E); +    if (!InstrBB) +      continue; +    uint64_t CountValue = CountFromProfile[I++]; +    if (!E->Removed) { +      getBBInfo(InstrBB).setBBInfoCount(CountValue); +      E->setEdgeCount(CountValue); +      continue; +    } + +    // Need to add two new edges. +    BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); +    BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); +    // Add new edge of SrcBB->InstrBB. +    PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0); +    NewEdge.setEdgeCount(CountValue); +    // Add new edge of InstrBB->DestBB. 
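
applyFunctionAttributes above classifies a function relative to the program-wide maximum count: inline-hint when its entry count reaches 1/100 of it, cold when even its hottest block stays at or below 2/10000 of it. A standalone sketch of that classification; BranchProbability::scale rounding (and overflow) details are glossed over here.

    #include <cstdint>
    #include <cstdio>

    static const char *classify(uint64_t EntryCount, uint64_t MaxCount,
                                uint64_t ProgramMaxCount) {
      if (ProgramMaxCount == 0)
        return "unclassified";
      if (EntryCount >= ProgramMaxCount / 100)        // HotFunctionThreshold 1/100
        return "hot -> InlineHint";
      if (MaxCount <= ProgramMaxCount * 2 / 10000)    // ColdFunctionThreshold 2/10000
        return "cold -> Cold";
      return "neither";
    }

    int main() {
      const uint64_t ProgramMax = 1000000;
      std::printf("%s\n", classify(20000, 50000, ProgramMax));  // hot
      std::printf("%s\n", classify(10, 150, ProgramMax));       // cold
      std::printf("%s\n", classify(10, 5000, ProgramMax));      // neither
      return 0;
    }
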
+    PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0); +    NewEdge1.setEdgeCount(CountValue); +    NewEdge1.InMST = true; +    getBBInfo(InstrBB).setBBInfoCount(CountValue); +  } +} + +// Set the count value for the unknown edge. There should be one and only one +// unknown edge in Edges vector. +void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { +  for (auto &E : Edges) { +    if (E->CountValid) +      continue; +    E->setEdgeCount(Value); + +    getBBInfo(E->SrcBB).UnknownCountOutEdge--; +    getBBInfo(E->DestBB).UnknownCountInEdge--; +    return; +  } +  llvm_unreachable("Cannot find the unknown count edge"); +} + +// Read the profile from ProfileFileName and assign the value to the +// instrumented BB and the edges. This function also updates ProgramMaxCount. +// Return true if the profile are successfully read, and false on errors. +bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) { +  auto &Ctx = M->getContext(); +  ErrorOr<InstrProfRecord> Result = +      PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); +  if (std::error_code EC = Result.getError()) { +    if (EC == instrprof_error::unknown_function) +      NumOfPGOMissing++; +    else if (EC == instrprof_error::hash_mismatch || +             EC == llvm::instrprof_error::malformed) +      NumOfPGOMismatch++; + +    std::string Msg = EC.message() + std::string(" ") + F.getName().str(); +    Ctx.diagnose( +        DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); +    return false; +  } +  std::vector<uint64_t> &CountFromProfile = Result.get().Counts; + +  NumOfPGOFunc++; +  DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); +  uint64_t ValueSum = 0; +  for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { +    DEBUG(dbgs() << "  " << I << ": " << CountFromProfile[I] << "\n"); +    ValueSum += CountFromProfile[I]; +  } + +  DEBUG(dbgs() << "SUM =  " << ValueSum << "\n"); + +  getBBInfo(nullptr).UnknownCountOutEdge = 2; +  getBBInfo(nullptr).UnknownCountInEdge = 2; + +  setInstrumentedCounts(CountFromProfile); +  ProgramMaxCount = PGOReader->getMaximumFunctionCount(); +  return true; +} + +// Populate the counters from instrumented BBs to all BBs. +// In the end of this operation, all BBs should have a valid count value. +void PGOUseFunc::populateCounters() { +  // First set up Count variable for all BBs. +  for (auto &E : FuncInfo.MST.AllEdges) { +    if (E->Removed) +      continue; + +    const BasicBlock *SrcBB = E->SrcBB; +    const BasicBlock *DestBB = E->DestBB; +    UseBBInfo &SrcInfo = getBBInfo(SrcBB); +    UseBBInfo &DestInfo = getBBInfo(DestBB); +    SrcInfo.OutEdges.push_back(E.get()); +    DestInfo.InEdges.push_back(E.get()); +    SrcInfo.UnknownCountOutEdge++; +    DestInfo.UnknownCountInEdge++; + +    if (!E->CountValid) +      continue; +    DestInfo.UnknownCountInEdge--; +    SrcInfo.UnknownCountOutEdge--; +  } + +  bool Changes = true; +  unsigned NumPasses = 0; +  while (Changes) { +    NumPasses++; +    Changes = false; + +    // For efficient traversal, it's better to start from the end as most +    // of the instrumented edges are at the end. 
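
populateCounters relies on the flow-conservation property stated in the file header: each block's incoming counts equal its outgoing counts, so counts on non-instrumented (spanning-tree) edges can be derived rather than measured. A tiny standalone illustration on a diamond CFG, where instrumenting only the two edges out of the entry block determines everything else:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // entry -> {then, else} -> exit; only the two entry edges are instrumented.
      uint64_t EntryToThen = 90, EntryToElse = 10;
      uint64_t Entry = EntryToThen + EntryToElse;  // out-sum of the entry block
      uint64_t ThenToExit = EntryToThen;           // single in-edge == single out-edge
      uint64_t ElseToExit = EntryToElse;
      uint64_t Exit = ThenToExit + ElseToExit;     // in-sum of the exit block
      std::printf("entry=%llu then->exit=%llu else->exit=%llu exit=%llu\n",
                  (unsigned long long)Entry, (unsigned long long)ThenToExit,
                  (unsigned long long)ElseToExit, (unsigned long long)Exit);
      return 0;
    }
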
+    for (auto &BB : reverse(F)) { +      UseBBInfo &Count = getBBInfo(&BB); +      if (!Count.CountValid) { +        if (Count.UnknownCountOutEdge == 0) { +          Count.CountValue = sumEdgeCount(Count.OutEdges); +          Count.CountValid = true; +          Changes = true; +        } else if (Count.UnknownCountInEdge == 0) { +          Count.CountValue = sumEdgeCount(Count.InEdges); +          Count.CountValid = true; +          Changes = true; +        } +      } +      if (Count.CountValid) { +        if (Count.UnknownCountOutEdge == 1) { +          uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges); +          setEdgeCount(Count.OutEdges, Total); +          Changes = true; +        } +        if (Count.UnknownCountInEdge == 1) { +          uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges); +          setEdgeCount(Count.InEdges, Total); +          Changes = true; +        } +      } +    } +  } + +  DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n"); +  // Assert every BB has a valid counter. +  uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; +  uint64_t FuncMaxCount = FuncEntryCount; +  for (auto &BB : F) { +    assert(getBBInfo(&BB).CountValid && "BB count is not valid"); +    uint64_t Count = getBBInfo(&BB).CountValue; +    if (Count > FuncMaxCount) +      FuncMaxCount = Count; +  } +  applyFunctionAttributes(FuncEntryCount, FuncMaxCount); + +  DEBUG(FuncInfo.dumpInfo("after reading profile.")); +} + +// Assign the scaled count values to the BB with multiple out edges. +void PGOUseFunc::setBranchWeights() { +  // Generate MD_prof metadata for every branch instruction. +  DEBUG(dbgs() << "\nSetting branch weights.\n"); +  MDBuilder MDB(M->getContext()); +  for (auto &BB : F) { +    TerminatorInst *TI = BB.getTerminator(); +    if (TI->getNumSuccessors() < 2) +      continue; +    if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) +      continue; +    if (getBBInfo(&BB).CountValue == 0) +      continue; + +    // We have a non-zero Branch BB. 
+    const UseBBInfo &BBCountInfo = getBBInfo(&BB); +    unsigned Size = BBCountInfo.OutEdges.size(); +    SmallVector<unsigned, 2> EdgeCounts(Size, 0); +    uint64_t MaxCount = 0; +    for (unsigned s = 0; s < Size; s++) { +      const PGOUseEdge *E = BBCountInfo.OutEdges[s]; +      const BasicBlock *SrcBB = E->SrcBB; +      const BasicBlock *DestBB = E->DestBB; +      if (DestBB == 0) +        continue; +      unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); +      uint64_t EdgeCount = E->CountValue; +      if (EdgeCount > MaxCount) +        MaxCount = EdgeCount; +      EdgeCounts[SuccNum] = EdgeCount; +    } +    assert(MaxCount > 0 && "Bad max count"); +    uint64_t Scale = calculateCountScale(MaxCount); +    SmallVector<unsigned, 4> Weights; +    for (const auto &ECI : EdgeCounts) +      Weights.push_back(scaleBranchCount(ECI, Scale)); + +    TI->setMetadata(llvm::LLVMContext::MD_prof, +                    MDB.createBranchWeights(Weights)); +    DEBUG(dbgs() << "Weight is: "; +          for (const auto &W : Weights) { dbgs() << W << " "; } +          dbgs() << "\n";); +  } +} +} // end anonymous namespace + +bool PGOInstrumentationGen::runOnModule(Module &M) { +  for (auto &F : M) { +    if (F.isDeclaration()) +      continue; +    BranchProbabilityInfo *BPI = +        &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); +    BlockFrequencyInfo *BFI = +        &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); +    instrumentOneFunc(F, &M, BPI, BFI); +  } +  return true; +} + +static void setPGOCountOnFunc(PGOUseFunc &Func, +                              IndexedInstrProfReader *PGOReader) { +  if (Func.readCounters(PGOReader)) { +    Func.populateCounters(); +    Func.setBranchWeights(); +  } +} + +bool PGOInstrumentationUse::runOnModule(Module &M) { +  DEBUG(dbgs() << "Read in profile counters: "); +  auto &Ctx = M.getContext(); +  // Read the counter array from file. 
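The weight computation in setBranchWeights above divides every outgoing edge count by a common scale so that the largest one fits into the 32-bit MD_prof weights. A self-contained sketch of that idea; the scale formula is an assumption mirroring what calculateCountScale/scaleBranchCount are used for here, not a copy of them:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Scale 64-bit edge counts down to 32-bit branch weights, preserving ratios.
static std::vector<uint32_t> countsToWeights(const std::vector<uint64_t> &Counts) {
  uint64_t MaxCount = *std::max_element(Counts.begin(), Counts.end());
  uint64_t Limit = std::numeric_limits<uint32_t>::max();
  uint64_t Scale = MaxCount <= Limit ? 1 : MaxCount / Limit + 1;
  std::vector<uint32_t> Weights;
  for (uint64_t C : Counts)
    Weights.push_back(static_cast<uint32_t>(C / Scale));
  return Weights;
}

int main() {
  // 6e9 exceeds uint32_t, so all counts get divided by a scale of 2.
  for (uint32_t W : countsToWeights({6000000000ULL, 1500000000ULL, 0}))
    std::printf("%u ", W);
  std::printf("\n");
  return 0;
}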
+  auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); +  if (std::error_code EC = ReaderOrErr.getError()) { +    Ctx.diagnose( +        DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message())); +    return false; +  } + +  PGOReader = std::move(ReaderOrErr.get()); +  if (!PGOReader) { +    Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(), +                                          "Cannot get PGOReader")); +    return false; +  } + +  for (auto &F : M) { +    if (F.isDeclaration()) +      continue; +    BranchProbabilityInfo *BPI = +        &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); +    BlockFrequencyInfo *BFI = +        &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); +    PGOUseFunc Func(F, &M, BPI, BFI); +    setPGOCountOnFunc(Func, PGOReader.get()); +  } +  return true; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp index 6b185a2b127b..abed465f102d 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp @@ -18,8 +18,9 @@  #include "llvm/Transforms/Instrumentation.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/Passes.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DerivedTypes.h" @@ -37,6 +38,8 @@  #include "llvm/Support/Format.h"  #include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_os_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h"  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -44,6 +47,17 @@ using namespace llvm;  #define DEBUG_TYPE "safestack" +enum UnsafeStackPtrStorageVal { ThreadLocalUSP, SingleThreadUSP }; + +static cl::opt<UnsafeStackPtrStorageVal> USPStorage("safe-stack-usp-storage", +    cl::Hidden, cl::init(ThreadLocalUSP), +    cl::desc("Type of storage for the unsafe stack pointer"), +    cl::values(clEnumValN(ThreadLocalUSP, "thread-local", +                          "Thread-local storage"), +               clEnumValN(SingleThreadUSP, "single-thread", +                          "Non-thread-local storage"), +               clEnumValEnd)); +  namespace llvm {  STATISTIC(NumFunctions, "Total number of functions"); @@ -54,118 +68,48 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions,  STATISTIC(NumAllocas, "Total number of allocas");  STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas");  STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");  STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");  } // namespace llvm  namespace { -/// Check whether a given alloca instruction (AI) should be put on the safe -/// stack or not. The function analyzes all uses of AI and checks whether it is -/// only accessed in a memory safe way (as decided statically). -bool IsSafeStackAlloca(const AllocaInst *AI) { -  // Go through all uses of this alloca and check whether all accesses to the -  // allocated object are statically known to be memory safe and, hence, the -  // object can be placed on the safe stack. 
- -  SmallPtrSet<const Value *, 16> Visited; -  SmallVector<const Instruction *, 8> WorkList; -  WorkList.push_back(AI); +/// Rewrite an SCEV expression for a memory access address to an expression that +/// represents offset from the given alloca. +/// +/// The implementation simply replaces all mentions of the alloca with zero. +class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> { +  const Value *AllocaPtr; -  // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. -  while (!WorkList.empty()) { -    const Instruction *V = WorkList.pop_back_val(); -    for (const Use &UI : V->uses()) { -      auto I = cast<const Instruction>(UI.getUser()); -      assert(V == UI.get()); - -      switch (I->getOpcode()) { -      case Instruction::Load: -        // Loading from a pointer is safe. -        break; -      case Instruction::VAArg: -        // "va-arg" from a pointer is safe. -        break; -      case Instruction::Store: -        if (V == I->getOperand(0)) -          // Stored the pointer - conservatively assume it may be unsafe. -          return false; -        // Storing to the pointee is safe. -        break; - -      case Instruction::GetElementPtr: -        if (!cast<const GetElementPtrInst>(I)->hasAllConstantIndices()) -          // GEP with non-constant indices can lead to memory errors. -          // This also applies to inbounds GEPs, as the inbounds attribute -          // represents an assumption that the address is in bounds, rather than -          // an assertion that it is. -          return false; - -        // We assume that GEP on static alloca with constant indices is safe, -        // otherwise a compiler would detect it and warn during compilation. - -        if (!isa<const ConstantInt>(AI->getArraySize())) -          // However, if the array size itself is not constant, the access -          // might still be unsafe at runtime. -          return false; - -      /* fallthrough */ - -      case Instruction::BitCast: -      case Instruction::IntToPtr: -      case Instruction::PHI: -      case Instruction::PtrToInt: -      case Instruction::Select: -        // The object can be safe or not, depending on how the result of the -        // instruction is used. -        if (Visited.insert(I).second) -          WorkList.push_back(cast<const Instruction>(I)); -        break; - -      case Instruction::Call: -      case Instruction::Invoke: { -        // FIXME: add support for memset and memcpy intrinsics. -        ImmutableCallSite CS(I); - -        // LLVM 'nocapture' attribute is only set for arguments whose address -        // is not stored, passed around, or used in any other non-trivial way. -        // We assume that passing a pointer to an object as a 'nocapture' -        // argument is safe. -        // FIXME: a more precise solution would require an interprocedural -        // analysis here, which would look at all uses of an argument inside -        // the function being called. -        ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); -        for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) -          if (A->get() == V && !CS.doesNotCapture(A - B)) -            // The parameter is not marked 'nocapture' - unsafe. -            return false; -        continue; -      } +public: +  AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) +      : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} -      default: -        // The object is unsafe if it is used in any other way. 
-        return false; -      } -    } +  const SCEV *visitUnknown(const SCEVUnknown *Expr) { +    if (Expr->getValue() == AllocaPtr) +      return SE.getZero(Expr->getType()); +    return Expr;    } +}; -  // All uses of the alloca are safe, we can place it on the safe stack. -  return true; -} - -/// The SafeStack pass splits the stack of each function into the -/// safe stack, which is only accessed through memory safe dereferences -/// (as determined statically), and the unsafe stack, which contains all -/// local variables that are accessed in unsafe ways. +/// The SafeStack pass splits the stack of each function into the safe +/// stack, which is only accessed through memory safe dereferences (as +/// determined statically), and the unsafe stack, which contains all +/// local variables that are accessed in ways that we can't prove to +/// be safe.  class SafeStack : public FunctionPass { +  const TargetMachine *TM; +  const TargetLoweringBase *TL;    const DataLayout *DL; +  ScalarEvolution *SE;    Type *StackPtrTy;    Type *IntPtrTy;    Type *Int32Ty;    Type *Int8Ty; -  Constant *UnsafeStackPtr = nullptr; +  Value *UnsafeStackPtr = nullptr;    /// Unsafe stack alignment. Each stack frame must ensure that the stack is    /// aligned to this value. We need to re-align the unsafe stack if the @@ -175,26 +119,31 @@ class SafeStack : public FunctionPass {    /// might expect to appear on the stack on most common targets.    enum { StackAlignment = 16 }; -  /// \brief Build a constant representing a pointer to the unsafe stack -  /// pointer. -  Constant *getOrCreateUnsafeStackPtr(Module &M); +  /// \brief Build a value representing a pointer to the unsafe stack pointer. +  Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F);    /// \brief Find all static allocas, dynamic allocas, return instructions and    /// stack restore points (exception unwind blocks and setjmp calls) in the    /// given function and append them to the respective vectors.    void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas,                   SmallVectorImpl<AllocaInst *> &DynamicAllocas, +                 SmallVectorImpl<Argument *> &ByValArguments,                   SmallVectorImpl<ReturnInst *> &Returns,                   SmallVectorImpl<Instruction *> &StackRestorePoints); +  /// \brief Calculate the allocation size of a given alloca. Returns 0 if the +  /// size can not be statically determined. +  uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); +    /// \brief Allocate space for all static allocas in \p StaticAllocas,    /// replace allocas with pointers into the unsafe stack and generate code to    /// restore the stack pointer before all return instructions in \p Returns.    ///    /// \returns A pointer to the top of the unsafe stack after all unsafe static    /// allocas are allocated. -  Value *moveStaticAllocasToUnsafeStack(Function &F, +  Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,                                          ArrayRef<AllocaInst *> StaticAllocas, +                                        ArrayRef<Argument *> ByValArguments,                                          ArrayRef<ReturnInst *> Returns);    /// \brief Generate code to restore the stack after all stack restore points @@ -203,7 +152,7 @@ class SafeStack : public FunctionPass {    /// \returns A local variable in which to maintain the dynamic top of the    /// unsafe stack if needed.    
AllocaInst * -  createStackRestorePoints(Function &F, +  createStackRestorePoints(IRBuilder<> &IRB, Function &F,                             ArrayRef<Instruction *> StackRestorePoints,                             Value *StaticTop, bool NeedDynamicTop); @@ -214,17 +163,26 @@ class SafeStack : public FunctionPass {                                         AllocaInst *DynamicTop,                                         ArrayRef<AllocaInst *> DynamicAllocas); +  bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); + +  bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, +                          const Value *AllocaPtr, uint64_t AllocaSize); +  bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, +                    uint64_t AllocaSize); +  public:    static char ID; // Pass identification, replacement for typeid. -  SafeStack() : FunctionPass(ID), DL(nullptr) { +  SafeStack(const TargetMachine *TM) +      : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {      initializeSafeStackPass(*PassRegistry::getPassRegistry());    } +  SafeStack() : SafeStack(nullptr) {} -  virtual void getAnalysisUsage(AnalysisUsage &AU) const { -    AU.addRequired<AliasAnalysis>(); +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<ScalarEvolutionWrapperPass>();    } -  virtual bool doInitialization(Module &M) { +  bool doInitialization(Module &M) override {      DL = &M.getDataLayout();      StackPtrTy = Type::getInt8PtrTy(M.getContext()); @@ -235,51 +193,203 @@ public:      return false;    } -  bool runOnFunction(Function &F); - +  bool runOnFunction(Function &F) override;  }; // class SafeStack -Constant *SafeStack::getOrCreateUnsafeStackPtr(Module &M) { -  // The unsafe stack pointer is stored in a global variable with a magic name. -  const char *kUnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { +  uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); +  if (AI->isArrayAllocation()) { +    auto C = dyn_cast<ConstantInt>(AI->getArraySize()); +    if (!C) +      return 0; +    Size *= C->getZExtValue(); +  } +  return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, +                             const Value *AllocaPtr, uint64_t AllocaSize) { +  AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); +  const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + +  uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); +  ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); +  ConstantRange SizeRange = +      ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); +  ConstantRange AccessRange = AccessStartRange.add(SizeRange); +  ConstantRange AllocaRange = +      ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); +  bool Safe = AllocaRange.contains(AccessRange); + +  DEBUG(dbgs() << "[SafeStack] " +               << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ") +               << *AllocaPtr << "\n" +               << "            Access " << *Addr << "\n" +               << "            SCEV " << *Expr +               << " U: " << SE->getUnsignedRange(Expr) +               << ", S: " << SE->getSignedRange(Expr) << "\n" +               << "            Range " << AccessRange << "\n" +               << "            AllocaRange " << AllocaRange << "\n" +               << "            " << (Safe ? 
"safe" : "unsafe") << "\n"); + +  return Safe; +} + +bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, +                                   const Value *AllocaPtr, +                                   uint64_t AllocaSize) { +  // All MemIntrinsics have destination address in Arg0 and size in Arg2. +  if (MI->getRawDest() != U) return true; +  const auto *Len = dyn_cast<ConstantInt>(MI->getLength()); +  // Non-constant size => unsafe. FIXME: try SCEV getRange. +  if (!Len) return false; +  return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); +} + +/// Check whether a given allocation must be put on the safe +/// stack or not. The function analyzes all uses of AI and checks whether it is +/// only accessed in a memory safe way (as decided statically). +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { +  // Go through all uses of this alloca and check whether all accesses to the +  // allocated object are statically known to be memory safe and, hence, the +  // object can be placed on the safe stack. +  SmallPtrSet<const Value *, 16> Visited; +  SmallVector<const Value *, 8> WorkList; +  WorkList.push_back(AllocaPtr); + +  // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. +  while (!WorkList.empty()) { +    const Value *V = WorkList.pop_back_val(); +    for (const Use &UI : V->uses()) { +      auto I = cast<const Instruction>(UI.getUser()); +      assert(V == UI.get()); + +      switch (I->getOpcode()) { +      case Instruction::Load: { +        if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, +                          AllocaSize)) +          return false; +        break; +      } +      case Instruction::VAArg: +        // "va-arg" from a pointer is safe. +        break; +      case Instruction::Store: { +        if (V == I->getOperand(0)) { +          // Stored the pointer - conservatively assume it may be unsafe. +          DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr +                       << "\n            store of address: " << *I << "\n"); +          return false; +        } + +        if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), +                          AllocaPtr, AllocaSize)) +          return false; +        break; +      } +      case Instruction::Ret: { +        // Information leak. +        return false; +      } + +      case Instruction::Call: +      case Instruction::Invoke: { +        ImmutableCallSite CS(I); + +        if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { +          if (II->getIntrinsicID() == Intrinsic::lifetime_start || +              II->getIntrinsicID() == Intrinsic::lifetime_end) +            continue; +        } + +        if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { +          if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { +            DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr +                         << "\n            unsafe memintrinsic: " << *I +                         << "\n"); +            return false; +          } +          continue; +        } +        // LLVM 'nocapture' attribute is only set for arguments whose address +        // is not stored, passed around, or used in any other non-trivial way. +        // We assume that passing a pointer to an object as a 'nocapture +        // readnone' argument is safe. 
+        // FIXME: a more precise solution would require an interprocedural +        // analysis here, which would look at all uses of an argument inside +        // the function being called. +        ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); +        for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) +          if (A->get() == V) +            if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || +                                               CS.doesNotAccessMemory()))) { +              DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr +                           << "\n            unsafe call: " << *I << "\n"); +              return false; +            } +        continue; +      } + +      default: +        if (Visited.insert(I).second) +          WorkList.push_back(cast<const Instruction>(I)); +      } +    } +  } + +  // All uses of the alloca are safe, we can place it on the safe stack. +  return true; +} + +Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { +  // Check if there is a target-specific location for the unsafe stack pointer. +  if (TL) +    if (Value *V = TL->getSafeStackPointerLocation(IRB)) +      return V; + +  // Otherwise, assume the target links with compiler-rt, which provides a +  // thread-local variable with a magic name. +  Module &M = *F.getParent(); +  const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr";    auto UnsafeStackPtr = -      dyn_cast_or_null<GlobalVariable>(M.getNamedValue(kUnsafeStackPtrVar)); +      dyn_cast_or_null<GlobalVariable>(M.getNamedValue(UnsafeStackPtrVar)); + +  bool UseTLS = USPStorage == ThreadLocalUSP;    if (!UnsafeStackPtr) { +    auto TLSModel = UseTLS ? +        GlobalValue::InitialExecTLSModel : +        GlobalValue::NotThreadLocal;      // The global variable is not defined yet, define it ourselves. -    // We use the initial-exec TLS model because we do not support the variable -    // living anywhere other than in the main executable. +    // We use the initial-exec TLS model because we do not support the +    // variable living anywhere other than in the main executable.      UnsafeStackPtr = new GlobalVariable( -        /*Module=*/M, /*Type=*/StackPtrTy, -        /*isConstant=*/false, /*Linkage=*/GlobalValue::ExternalLinkage, -        /*Initializer=*/0, /*Name=*/kUnsafeStackPtrVar, -        /*InsertBefore=*/nullptr, -        /*ThreadLocalMode=*/GlobalValue::InitialExecTLSModel); +        M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr, +        UnsafeStackPtrVar, nullptr, TLSModel);    } else {      // The variable exists, check its type and attributes. -    if (UnsafeStackPtr->getValueType() != StackPtrTy) { -      report_fatal_error(Twine(kUnsafeStackPtrVar) + " must have void* type"); -    } - -    if (!UnsafeStackPtr->isThreadLocal()) { -      report_fatal_error(Twine(kUnsafeStackPtrVar) + " must be thread-local"); -    } +    if (UnsafeStackPtr->getValueType() != StackPtrTy) +      report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type"); +    if (UseTLS != UnsafeStackPtr->isThreadLocal()) +      report_fatal_error(Twine(UnsafeStackPtrVar) + " must " + +                         (UseTLS ? 
"" : "not ") + "be thread-local");    } -    return UnsafeStackPtr;  }  void SafeStack::findInsts(Function &F,                            SmallVectorImpl<AllocaInst *> &StaticAllocas,                            SmallVectorImpl<AllocaInst *> &DynamicAllocas, +                          SmallVectorImpl<Argument *> &ByValArguments,                            SmallVectorImpl<ReturnInst *> &Returns,                            SmallVectorImpl<Instruction *> &StackRestorePoints) { -  for (Instruction &I : inst_range(&F)) { +  for (Instruction &I : instructions(&F)) {      if (auto AI = dyn_cast<AllocaInst>(&I)) {        ++NumAllocas; -      if (IsSafeStackAlloca(AI)) +      uint64_t Size = getStaticAllocaAllocationSize(AI); +      if (IsSafeStackAlloca(AI, Size))          continue;        if (AI->isStaticAlloca()) { @@ -304,19 +414,26 @@ void SafeStack::findInsts(Function &F,              "gcroot intrinsic not compatible with safestack attribute");      }    } +  for (Argument &Arg : F.args()) { +    if (!Arg.hasByValAttr()) +      continue; +    uint64_t Size = +        DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); +    if (IsSafeStackAlloca(&Arg, Size)) +      continue; + +    ++NumUnsafeByValArguments; +    ByValArguments.push_back(&Arg); +  }  }  AllocaInst * -SafeStack::createStackRestorePoints(Function &F, +SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,                                      ArrayRef<Instruction *> StackRestorePoints,                                      Value *StaticTop, bool NeedDynamicTop) {    if (StackRestorePoints.empty())      return nullptr; -  IRBuilder<> IRB(StaticTop -                      ? cast<Instruction>(StaticTop)->getNextNode() -                      : (Instruction *)F.getEntryBlock().getFirstInsertionPt()); -    // We need the current value of the shadow stack pointer to restore    // after longjmp or exception catching. @@ -342,7 +459,7 @@ SafeStack::createStackRestorePoints(Function &F,    for (Instruction *I : StackRestorePoints) {      ++NumUnsafeStackRestorePoints; -    IRB.SetInsertPoint(cast<Instruction>(I->getNextNode())); +    IRB.SetInsertPoint(I->getNextNode());      Value *CurrentTop = DynamicTop ? IRB.CreateLoad(DynamicTop) : StaticTop;      IRB.CreateStore(CurrentTop, UnsafeStackPtr);    } @@ -350,14 +467,12 @@ SafeStack::createStackRestorePoints(Function &F,    return DynamicTop;  } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(Function &F, -                                          ArrayRef<AllocaInst *> StaticAllocas, -                                          ArrayRef<ReturnInst *> Returns) { -  if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( +    IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, +    ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) { +  if (StaticAllocas.empty() && ByValArguments.empty())      return nullptr; -  IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt());    DIBuilder DIB(*F.getParent());    // We explicitly compute and set the unsafe stack layout for all unsafe @@ -377,6 +492,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F,    // Compute maximum alignment among static objects on the unsafe stack.    
unsigned MaxAlignment = 0; +  for (Argument *Arg : ByValArguments) { +    Type *Ty = Arg->getType()->getPointerElementType(); +    unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), +                              Arg->getParamAlignment()); +    if (Align > MaxAlignment) +      MaxAlignment = Align; +  }    for (AllocaInst *AI : StaticAllocas) {      Type *Ty = AI->getAllocatedType();      unsigned Align = @@ -388,22 +510,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F,    if (MaxAlignment > StackAlignment) {      // Re-align the base pointer according to the max requested alignment.      assert(isPowerOf2_32(MaxAlignment)); -    IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); +    IRB.SetInsertPoint(BasePointer->getNextNode());      BasePointer = cast<Instruction>(IRB.CreateIntToPtr(          IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy),                        ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))),          StackPtrTy));    } -  // Allocate space for every unsafe static AllocaInst on the unsafe stack.    int64_t StaticOffset = 0; // Current stack top. +  IRB.SetInsertPoint(BasePointer->getNextNode()); + +  for (Argument *Arg : ByValArguments) { +    Type *Ty = Arg->getType()->getPointerElementType(); + +    uint64_t Size = DL->getTypeStoreSize(Ty); +    if (Size == 0) +      Size = 1; // Don't create zero-sized stack objects. + +    // Ensure the object is properly aligned. +    unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), +                              Arg->getParamAlignment()); + +    // Add alignment. +    // NOTE: we ensure that BasePointer itself is aligned to >= Align. +    StaticOffset += Size; +    StaticOffset = RoundUpToAlignment(StaticOffset, Align); + +    Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* +                               ConstantInt::get(Int32Ty, -StaticOffset)); +    Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), +                                     Arg->getName() + ".unsafe-byval"); + +    // Replace alloc with the new location. +    replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, +                      /*Deref=*/true, -StaticOffset); +    Arg->replaceAllUsesWith(NewArg); +    IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode()); +    IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); +  } + +  // Allocate space for every unsafe static AllocaInst on the unsafe stack.    for (AllocaInst *AI : StaticAllocas) {      IRB.SetInsertPoint(AI); -    auto CArraySize = cast<ConstantInt>(AI->getArraySize());      Type *Ty = AI->getAllocatedType(); - -    uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); +    uint64_t Size = getStaticAllocaAllocationSize(AI);      if (Size == 0)        Size = 1; // Don't create zero-sized stack objects. @@ -423,7 +574,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F,        cast<Instruction>(NewAI)->takeName(AI);      // Replace alloc with the new location. -    replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); +    replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -StaticOffset);      AI->replaceAllUsesWith(NewAI);      AI->eraseFromParent();    } @@ -434,7 +585,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F,    StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment);    // Update shadow stack pointer in the function epilogue. 
-  IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); +  IRB.SetInsertPoint(BasePointer->getNextNode());    Value *StaticTop =        IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -478,7 +629,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(      if (DynamicTop)        IRB.CreateStore(NewTop, DynamicTop); -    Value *NewAI = IRB.CreateIntToPtr(SP, AI->getType()); +    Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType());      if (AI->hasName() && isa<Instruction>(NewAI))        NewAI->takeName(AI); @@ -513,8 +664,6 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(  }  bool SafeStack::runOnFunction(Function &F) { -  auto AA = &getAnalysis<AliasAnalysis>(); -    DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");    if (!F.hasFnAttribute(Attribute::SafeStack)) { @@ -529,6 +678,9 @@ bool SafeStack::runOnFunction(Function &F) {      return false;    } +  TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr; +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +    {      // Make sure the regular stack protector won't run on this function      // (safestack attribute takes precedence). @@ -541,16 +693,11 @@ bool SafeStack::runOnFunction(Function &F) {          AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B));    } -  if (AA->onlyReadsMemory(&F)) { -    // XXX: we don't protect against information leak attacks for now. -    DEBUG(dbgs() << "[SafeStack]     function only reads memory\n"); -    return false; -  } -    ++NumFunctions;    SmallVector<AllocaInst *, 16> StaticAllocas;    SmallVector<AllocaInst *, 4> DynamicAllocas; +  SmallVector<Argument *, 4> ByValArguments;    SmallVector<ReturnInst *, 4> Returns;    // Collect all points where stack gets unwound and needs to be restored @@ -562,23 +709,26 @@ bool SafeStack::runOnFunction(Function &F) {    // Find all static and dynamic alloca instructions that must be moved to the    // unsafe stack, all return instructions and stack restore points. -  findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); +  findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, +            StackRestorePoints);    if (StaticAllocas.empty() && DynamicAllocas.empty() && -      StackRestorePoints.empty()) +      ByValArguments.empty() && StackRestorePoints.empty())      return false; // Nothing to do in this function. -  if (!StaticAllocas.empty() || !DynamicAllocas.empty()) +  if (!StaticAllocas.empty() || !DynamicAllocas.empty() || +      !ByValArguments.empty())      ++NumUnsafeStackFunctions; // This function has the unsafe stack.    if (!StackRestorePoints.empty())      ++NumUnsafeStackRestorePointsFunctions; -  if (!UnsafeStackPtr) -    UnsafeStackPtr = getOrCreateUnsafeStackPtr(*F.getParent()); +  IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt()); +  UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F);    // The top of the unsafe stack after all unsafe static allocas are allocated. -  Value *StaticTop = moveStaticAllocasToUnsafeStack(F, StaticAllocas, Returns); +  Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, +                                                    ByValArguments, Returns);    // Safe stack object that stores the current unsafe stack top. It is updated    // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. 
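The layout that moveStaticAllocasToUnsafeStack computes in the hunk above is a downward-growing offset from BasePointer: each object's size is added, the running offset is rounded up to the object's alignment, and the object lives at BasePointer - offset. A small standalone illustration of that arithmetic, using made-up sizes and alignments:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // (size, alignment) of each unsafe-stack object, byval arguments first.
  std::vector<std::pair<uint64_t, uint64_t>> Objects = {{13, 8}, {4, 4}, {256, 16}};
  uint64_t Offset = 0; // distance of the current stack top below BasePointer
  for (const auto &O : Objects) {
    Offset += O.first;                             // reserve the object
    Offset = roundUpToAlignment(Offset, O.second); // keep it aligned
    std::printf("object lives at BasePointer - %llu\n",
                (unsigned long long)Offset);
  }
  // The whole frame is finally padded to the 16-byte unsafe-stack alignment.
  Offset = roundUpToAlignment(Offset, 16);
  std::printf("unsafe frame size: %llu bytes\n", (unsigned long long)Offset);
  return 0;
}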
@@ -587,7 +737,7 @@ bool SafeStack::runOnFunction(Function &F) {    // FIXME: a better alternative might be to store the unsafe stack pointer    // before setjmp / invoke instructions.    AllocaInst *DynamicTop = createStackRestorePoints( -      F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); +      IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty());    // Handle dynamic allocas.    moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop, @@ -597,13 +747,14 @@ bool SafeStack::runOnFunction(Function &F) {    return true;  } -} // end anonymous namespace +} // anonymous namespace  char SafeStack::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStack, "safe-stack", -                      "Safe Stack instrumentation pass", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(SafeStack, "safe-stack", "Safe Stack instrumentation pass", -                    false, false) +INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack", +                         "Safe Stack instrumentation pass", false, false) +INITIALIZE_TM_PASS_END(SafeStack, "safe-stack", +                       "Safe Stack instrumentation pass", false, false) -FunctionPass *llvm::createSafeStackPass() { return new SafeStack(); } +FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) { +  return new SafeStack(TM); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 7a5b4cb0178b..09de7a2cda2b 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,6 +31,7 @@  #include "llvm/Transforms/Instrumentation.h"  #include "llvm/ADT/ArrayRef.h"  #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DebugInfo.h" @@ -59,6 +60,7 @@ static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";  static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";  static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";  static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp"; +static const char *const kSanCovTraceSwitch = "__sanitizer_cov_trace_switch";  static const char *const kSanCovModuleCtorName = "sancov.module_ctor";  static const uint64_t    kSanCtorAndDtorPriority = 2; @@ -148,19 +150,25 @@ class SanitizerCoverageModule : public ModulePass {    void InjectCoverageForIndirectCalls(Function &F,                                        ArrayRef<Instruction *> IndirCalls);    void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); +  void InjectTraceForSwitch(Function &F, +                            ArrayRef<Instruction *> SwitchTraceTargets);    bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);    void SetNoSanitizeMetadata(Instruction *I);    void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls);    unsigned NumberOfInstrumentedBlocks() { -    return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); +    return SanCovFunction->getNumUses() + +           SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + +           SanCovTraceEnter->getNumUses();    }    Function *SanCovFunction;    Function *SanCovWithCheckFunction;    Function *SanCovIndirCallFunction;    Function *SanCovTraceEnter, *SanCovTraceBB;    
Function *SanCovTraceCmpFunction; +  Function *SanCovTraceSwitchFunction;    InlineAsm *EmptyAsm; -  Type *IntptrTy, *Int64Ty; +  Type *IntptrTy, *Int64Ty, *Int64PtrTy; +  Module *CurModule;    LLVMContext *C;    const DataLayout *DL; @@ -177,11 +185,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {      return false;    C = &(M.getContext());    DL = &M.getDataLayout(); +  CurModule = &M;    IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());    Type *VoidTy = Type::getVoidTy(*C);    IRBuilder<> IRB(*C);    Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());    Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); +  Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());    Int64Ty = IRB.getInt64Ty();    SanCovFunction = checkSanitizerInterfaceFunction( @@ -194,18 +204,19 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {    SanCovTraceCmpFunction =        checkSanitizerInterfaceFunction(M.getOrInsertFunction(            kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr)); +  SanCovTraceSwitchFunction = +      checkSanitizerInterfaceFunction(M.getOrInsertFunction( +          kSanCovTraceSwitch, VoidTy, Int64Ty, Int64PtrTy, nullptr));    // We insert an empty inline asm after cov callbacks to avoid callback merge.    EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),                              StringRef(""), StringRef(""),                              /*hasSideEffects=*/true); -  if (Options.TraceBB) { -    SanCovTraceEnter = checkSanitizerInterfaceFunction( -        M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); -    SanCovTraceBB = checkSanitizerInterfaceFunction( -        M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); -  } +  SanCovTraceEnter = checkSanitizerInterfaceFunction( +      M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); +  SanCovTraceBB = checkSanitizerInterfaceFunction( +      M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr));    // At this point we create a dummy array of guards because we don't    // know how many elements we will need. @@ -280,11 +291,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {    if (F.empty()) return false;    if (F.getName().find(".module_ctor") != std::string::npos)      return false;  // Should not instrument sanitizer init functions. +  // Don't instrument functions using SEH for now. Splitting basic blocks like +  // we do for coverage breaks WinEHPrepare. +  // FIXME: Remove this when SEH no longer uses landingpad pattern matching. 
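The SanCovTraceSwitchFunction declared above is called with the switch condition and a pointer to a constant i64 array; InjectTraceForSwitch further down in this diff packs {NumCases, ValueSizeInBits, Case0, Case1, ...} into that array. A sketch of what a consumer of that layout could look like; the body is illustrative only and is not the actual sanitizer runtime:

#include <cstdint>
#include <cstdio>

// Matches the LLVM-side declaration in this diff:
//   void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases);
// Cases[0] = number of case values, Cases[1] = width of the condition in bits,
// Cases[2..] = the case values, zero-extended to 64 bits.
extern "C" void sketch_trace_switch(uint64_t Val, const uint64_t *Cases) {
  uint64_t NumCases = Cases[0];
  uint64_t BitWidth = Cases[1];
  std::printf("switch on %llu (%llu-bit):",
              (unsigned long long)Val, (unsigned long long)BitWidth);
  for (uint64_t I = 0; I < NumCases; ++I)
    std::printf(" %llu%s", (unsigned long long)Cases[2 + I],
                Cases[2 + I] == Val ? "*" : "");
  std::printf("\n");
}

int main() {
  const uint64_t Table[] = {3, 32, 1, 5, 42}; // 3 cases of a 32-bit switch
  sketch_trace_switch(5, Table);              // prints: switch on 5 (32-bit): 1 5* 42
  return 0;
}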
+  if (F.hasPersonalityFn() && +      isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) +    return false;    if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)      SplitAllCriticalEdges(F);    SmallVector<Instruction*, 8> IndirCalls;    SmallVector<BasicBlock*, 16> AllBlocks;    SmallVector<Instruction*, 8> CmpTraceTargets; +  SmallVector<Instruction*, 8> SwitchTraceTargets;    for (auto &BB : F) {      AllBlocks.push_back(&BB);      for (auto &Inst : BB) { @@ -293,13 +311,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {          if (CS && !CS.getCalledFunction())            IndirCalls.push_back(&Inst);        } -      if (Options.TraceCmp && isa<ICmpInst>(&Inst)) -        CmpTraceTargets.push_back(&Inst); +      if (Options.TraceCmp) { +        if (isa<ICmpInst>(&Inst)) +          CmpTraceTargets.push_back(&Inst); +        if (isa<SwitchInst>(&Inst)) +          SwitchTraceTargets.push_back(&Inst); +      }      }    }    InjectCoverage(F, AllBlocks);    InjectCoverageForIndirectCalls(F, IndirCalls);    InjectTraceForCmp(F, CmpTraceTargets); +  InjectTraceForSwitch(F, SwitchTraceTargets);    return true;  } @@ -348,6 +371,45 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(    }  } +// For every switch statement we insert a call: +// __sanitizer_cov_trace_switch(CondValue, +//      {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) + +void SanitizerCoverageModule::InjectTraceForSwitch( +    Function &F, ArrayRef<Instruction *> SwitchTraceTargets) { +  for (auto I : SwitchTraceTargets) { +    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { +      IRBuilder<> IRB(I); +      SmallVector<Constant *, 16> Initializers; +      Value *Cond = SI->getCondition(); +      if (Cond->getType()->getScalarSizeInBits() > +          Int64Ty->getScalarSizeInBits()) +        continue; +      Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases())); +      Initializers.push_back( +          ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits())); +      if (Cond->getType()->getScalarSizeInBits() < +          Int64Ty->getScalarSizeInBits()) +        Cond = IRB.CreateIntCast(Cond, Int64Ty, false); +      for (auto It: SI->cases()) { +        Constant *C = It.getCaseValue(); +        if (C->getType()->getScalarSizeInBits() < +            Int64Ty->getScalarSizeInBits()) +          C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty); +        Initializers.push_back(C); +      } +      ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size()); +      GlobalVariable *GV = new GlobalVariable( +          *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage, +          ConstantArray::get(ArrayOfInt64Ty, Initializers), +          "__sancov_gen_cov_switch_values"); +      IRB.CreateCall(SanCovTraceSwitchFunction, +                     {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)}); +    } +  } +} + +  void SanitizerCoverageModule::InjectTraceForCmp(      Function &F, ArrayRef<Instruction *> CmpTraceTargets) {    for (auto I : CmpTraceTargets) { @@ -369,8 +431,7 @@ void SanitizerCoverageModule::InjectTraceForCmp(  void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) {    I->setMetadata( -      I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), -      MDNode::get(*C, None)); +      I->getModule()->getMDKindID("nosanitize"), MDNode::get(*C, None));  }  void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, @@ 
-382,34 +443,31 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,    // locations.    if (isa<UnreachableInst>(BB.getTerminator()))      return; -  BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); -  // Skip static allocas at the top of the entry block so they don't become -  // dynamic when we split the block.  If we used our optimized stack layout, -  // then there will only be one alloca and it will come first. -  for (; IP != BE; ++IP) { -    AllocaInst *AI = dyn_cast<AllocaInst>(IP); -    if (!AI || !AI->isStaticAlloca()) -      break; -  } +  BasicBlock::iterator IP = BB.getFirstInsertionPt();    bool IsEntryBB = &BB == &F.getEntryBlock();    DebugLoc EntryLoc;    if (IsEntryBB) {      if (auto SP = getDISubprogram(&F))        EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); +    // Keep static allocas and llvm.localescape calls in the entry block.  Even +    // if we aren't splitting the block, it's nice for allocas to be before +    // calls. +    IP = PrepareToSplitEntryBlock(BB, IP);    } else {      EntryLoc = IP->getDebugLoc();    } -  IRBuilder<> IRB(IP); +  IRBuilder<> IRB(&*IP);    IRB.SetCurrentDebugLocation(EntryLoc); -  SmallVector<Value *, 1> Indices;    Value *GuardP = IRB.CreateAdd(        IRB.CreatePointerCast(GuardArray, IntptrTy),        ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));    Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());    GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); -  if (UseCalls) { +  if (Options.TraceBB) { +    IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); +  } else if (UseCalls) {      IRB.CreateCall(SanCovWithCheckFunction, GuardP);    } else {      LoadInst *Load = IRB.CreateLoad(GuardP); @@ -418,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,      SetNoSanitizeMetadata(Load);      Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);      Instruction *Ins = SplitBlockAndInsertIfThen( -        Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); +        Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));      IRB.SetInsertPoint(Ins);      IRB.SetCurrentDebugLocation(EntryLoc);      // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. @@ -427,7 +485,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,    }    if (Options.Use8bitCounters) { -    IRB.SetInsertPoint(IP); +    IRB.SetInsertPoint(&*IP);      Value *P = IRB.CreateAdd(          IRB.CreatePointerCast(EightBitCounterArray, IntptrTy),          ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); @@ -438,13 +496,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,      SetNoSanitizeMetadata(LI);      SetNoSanitizeMetadata(SI);    } - -  if (Options.TraceBB) { -    // Experimental support for tracing. -    // Insert a callback with the same guard variable as used for coverage. -    IRB.SetInsertPoint(IP); -    IRB.CreateCall(IsEntryBB ? 
SanCovTraceEnter : SanCovTraceBB, GuardP); -  }  }  char SanitizerCoverageModule::ID = 0; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 1a46bbb86122..9331e1d2b3fd 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -142,37 +142,35 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {        M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr));    OrdTy = IRB.getInt32Ty();    for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { -    const size_t ByteSize = 1 << i; -    const size_t BitSize = ByteSize * 8; -    SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); +    const unsigned ByteSize = 1U << i; +    const unsigned BitSize = ByteSize * 8; +    std::string ByteSizeStr = utostr(ByteSize); +    std::string BitSizeStr = utostr(BitSize); +    SmallString<32> ReadName("__tsan_read" + ByteSizeStr);      TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(          ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); -    SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); +    SmallString<32> WriteName("__tsan_write" + ByteSizeStr);      TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(          WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); -    SmallString<64> UnalignedReadName("__tsan_unaligned_read" + -        itostr(ByteSize)); +    SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);      TsanUnalignedRead[i] =          checkSanitizerInterfaceFunction(M.getOrInsertFunction(              UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); -    SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + -        itostr(ByteSize)); +    SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);      TsanUnalignedWrite[i] =          checkSanitizerInterfaceFunction(M.getOrInsertFunction(              UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));      Type *Ty = Type::getIntNTy(M.getContext(), BitSize);      Type *PtrTy = Ty->getPointerTo(); -    SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + -                                   "_load"); +    SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");      TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(          M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); -    SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + -                                    "_store"); +    SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");      TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(          AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); @@ -201,7 +199,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {            M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr));      } -    SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + +    SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +                                    "_compare_exchange_val");      TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(          AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); @@ -513,8 +511,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {      int Idx = 
getMemoryAccessFuncIndex(Addr, DL);      if (Idx < 0)        return false; -    const size_t ByteSize = 1 << Idx; -    const size_t BitSize = ByteSize * 8; +    const unsigned ByteSize = 1U << Idx; +    const unsigned BitSize = ByteSize * 8;      Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);      Type *PtrTy = Ty->getPointerTo();      Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -527,8 +525,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {      int Idx = getMemoryAccessFuncIndex(Addr, DL);      if (Idx < 0)        return false; -    const size_t ByteSize = 1 << Idx; -    const size_t BitSize = ByteSize * 8; +    const unsigned ByteSize = 1U << Idx; +    const unsigned BitSize = ByteSize * 8;      Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);      Type *PtrTy = Ty->getPointerTo();      Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -544,8 +542,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {      Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];      if (!F)        return false; -    const size_t ByteSize = 1 << Idx; -    const size_t BitSize = ByteSize * 8; +    const unsigned ByteSize = 1U << Idx; +    const unsigned BitSize = ByteSize * 8;      Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);      Type *PtrTy = Ty->getPointerTo();      Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -558,8 +556,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {      int Idx = getMemoryAccessFuncIndex(Addr, DL);      if (Idx < 0)        return false; -    const size_t ByteSize = 1 << Idx; -    const size_t BitSize = ByteSize * 8; +    const unsigned ByteSize = 1U << Idx; +    const unsigned BitSize = ByteSize * 8;      Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);      Type *PtrTy = Ty->getPointerTo();      Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp deleted file mode 100644 index afb873a355a7..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp +++ /dev/null @@ -1,673 +0,0 @@ -//===- ARCInstKind.cpp - ObjC ARC Optimization ----------------------------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines several utility functions used by various ARC -/// optimizations which are IMHO too big to be in a header file. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. 
-/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace llvm::objcarc; - -raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, -                                       const ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::Retain: -    return OS << "ARCInstKind::Retain"; -  case ARCInstKind::RetainRV: -    return OS << "ARCInstKind::RetainRV"; -  case ARCInstKind::RetainBlock: -    return OS << "ARCInstKind::RetainBlock"; -  case ARCInstKind::Release: -    return OS << "ARCInstKind::Release"; -  case ARCInstKind::Autorelease: -    return OS << "ARCInstKind::Autorelease"; -  case ARCInstKind::AutoreleaseRV: -    return OS << "ARCInstKind::AutoreleaseRV"; -  case ARCInstKind::AutoreleasepoolPush: -    return OS << "ARCInstKind::AutoreleasepoolPush"; -  case ARCInstKind::AutoreleasepoolPop: -    return OS << "ARCInstKind::AutoreleasepoolPop"; -  case ARCInstKind::NoopCast: -    return OS << "ARCInstKind::NoopCast"; -  case ARCInstKind::FusedRetainAutorelease: -    return OS << "ARCInstKind::FusedRetainAutorelease"; -  case ARCInstKind::FusedRetainAutoreleaseRV: -    return OS << "ARCInstKind::FusedRetainAutoreleaseRV"; -  case ARCInstKind::LoadWeakRetained: -    return OS << "ARCInstKind::LoadWeakRetained"; -  case ARCInstKind::StoreWeak: -    return OS << "ARCInstKind::StoreWeak"; -  case ARCInstKind::InitWeak: -    return OS << "ARCInstKind::InitWeak"; -  case ARCInstKind::LoadWeak: -    return OS << "ARCInstKind::LoadWeak"; -  case ARCInstKind::MoveWeak: -    return OS << "ARCInstKind::MoveWeak"; -  case ARCInstKind::CopyWeak: -    return OS << "ARCInstKind::CopyWeak"; -  case ARCInstKind::DestroyWeak: -    return OS << "ARCInstKind::DestroyWeak"; -  case ARCInstKind::StoreStrong: -    return OS << "ARCInstKind::StoreStrong"; -  case ARCInstKind::CallOrUser: -    return OS << "ARCInstKind::CallOrUser"; -  case ARCInstKind::Call: -    return OS << "ARCInstKind::Call"; -  case ARCInstKind::User: -    return OS << "ARCInstKind::User"; -  case ARCInstKind::IntrinsicUser: -    return OS << "ARCInstKind::IntrinsicUser"; -  case ARCInstKind::None: -    return OS << "ARCInstKind::None"; -  } -  llvm_unreachable("Unknown instruction class!"); -} - -ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { -  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - -  // No (mandatory) arguments. -  if (AI == AE) -    return StringSwitch<ARCInstKind>(F->getName()) -        .Case("objc_autoreleasePoolPush", ARCInstKind::AutoreleasepoolPush) -        .Case("clang.arc.use", ARCInstKind::IntrinsicUser) -        .Default(ARCInstKind::CallOrUser); - -  // One argument. -  const Argument *A0 = AI++; -  if (AI == AE) -    // Argument is a pointer. -    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { -      Type *ETy = PTy->getElementType(); -      // Argument is i8*. 
-      if (ETy->isIntegerTy(8)) -        return StringSwitch<ARCInstKind>(F->getName()) -            .Case("objc_retain", ARCInstKind::Retain) -            .Case("objc_retainAutoreleasedReturnValue", ARCInstKind::RetainRV) -            .Case("objc_retainBlock", ARCInstKind::RetainBlock) -            .Case("objc_release", ARCInstKind::Release) -            .Case("objc_autorelease", ARCInstKind::Autorelease) -            .Case("objc_autoreleaseReturnValue", ARCInstKind::AutoreleaseRV) -            .Case("objc_autoreleasePoolPop", ARCInstKind::AutoreleasepoolPop) -            .Case("objc_retainedObject", ARCInstKind::NoopCast) -            .Case("objc_unretainedObject", ARCInstKind::NoopCast) -            .Case("objc_unretainedPointer", ARCInstKind::NoopCast) -            .Case("objc_retain_autorelease", -                  ARCInstKind::FusedRetainAutorelease) -            .Case("objc_retainAutorelease", ARCInstKind::FusedRetainAutorelease) -            .Case("objc_retainAutoreleaseReturnValue", -                  ARCInstKind::FusedRetainAutoreleaseRV) -            .Case("objc_sync_enter", ARCInstKind::User) -            .Case("objc_sync_exit", ARCInstKind::User) -            .Default(ARCInstKind::CallOrUser); - -      // Argument is i8** -      if (PointerType *Pte = dyn_cast<PointerType>(ETy)) -        if (Pte->getElementType()->isIntegerTy(8)) -          return StringSwitch<ARCInstKind>(F->getName()) -              .Case("objc_loadWeakRetained", ARCInstKind::LoadWeakRetained) -              .Case("objc_loadWeak", ARCInstKind::LoadWeak) -              .Case("objc_destroyWeak", ARCInstKind::DestroyWeak) -              .Default(ARCInstKind::CallOrUser); -    } - -  // Two arguments, first is i8**. -  const Argument *A1 = AI++; -  if (AI == AE) -    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) -      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) -        if (Pte->getElementType()->isIntegerTy(8)) -          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { -            Type *ETy1 = PTy1->getElementType(); -            // Second argument is i8* -            if (ETy1->isIntegerTy(8)) -              return StringSwitch<ARCInstKind>(F->getName()) -                  .Case("objc_storeWeak", ARCInstKind::StoreWeak) -                  .Case("objc_initWeak", ARCInstKind::InitWeak) -                  .Case("objc_storeStrong", ARCInstKind::StoreStrong) -                  .Default(ARCInstKind::CallOrUser); -            // Second argument is i8**. -            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) -              if (Pte1->getElementType()->isIntegerTy(8)) -                return StringSwitch<ARCInstKind>(F->getName()) -                    .Case("objc_moveWeak", ARCInstKind::MoveWeak) -                    .Case("objc_copyWeak", ARCInstKind::CopyWeak) -                    // Ignore annotation calls. This is important to stop the -                    // optimizer from treating annotations as uses which would -                    // make the state of the pointers they are attempting to -                    // elucidate to be incorrect. 
-                    .Case("llvm.arc.annotation.topdown.bbstart", -                          ARCInstKind::None) -                    .Case("llvm.arc.annotation.topdown.bbend", -                          ARCInstKind::None) -                    .Case("llvm.arc.annotation.bottomup.bbstart", -                          ARCInstKind::None) -                    .Case("llvm.arc.annotation.bottomup.bbend", -                          ARCInstKind::None) -                    .Default(ARCInstKind::CallOrUser); -          } - -  // Anything else. -  return ARCInstKind::CallOrUser; -} - -// A whitelist of intrinsics that we know do not use objc pointers or decrement -// ref counts. -static bool isInertIntrinsic(unsigned ID) { -  // TODO: Make this into a covered switch. -  switch (ID) { -  case Intrinsic::returnaddress: -  case Intrinsic::frameaddress: -  case Intrinsic::stacksave: -  case Intrinsic::stackrestore: -  case Intrinsic::vastart: -  case Intrinsic::vacopy: -  case Intrinsic::vaend: -  case Intrinsic::objectsize: -  case Intrinsic::prefetch: -  case Intrinsic::stackprotector: -  case Intrinsic::eh_return_i32: -  case Intrinsic::eh_return_i64: -  case Intrinsic::eh_typeid_for: -  case Intrinsic::eh_dwarf_cfa: -  case Intrinsic::eh_sjlj_lsda: -  case Intrinsic::eh_sjlj_functioncontext: -  case Intrinsic::init_trampoline: -  case Intrinsic::adjust_trampoline: -  case Intrinsic::lifetime_start: -  case Intrinsic::lifetime_end: -  case Intrinsic::invariant_start: -  case Intrinsic::invariant_end: -  // Don't let dbg info affect our results. -  case Intrinsic::dbg_declare: -  case Intrinsic::dbg_value: -    // Short cut: Some intrinsics obviously don't use ObjC pointers. -    return true; -  default: -    return false; -  } -} - -// A whitelist of intrinsics that we know do not use objc pointers or decrement -// ref counts. -static bool isUseOnlyIntrinsic(unsigned ID) { -  // We are conservative and even though intrinsics are unlikely to touch -  // reference counts, we white list them for safety. -  // -  // TODO: Expand this into a covered switch. There is a lot more here. -  switch (ID) { -  case Intrinsic::memcpy: -  case Intrinsic::memmove: -  case Intrinsic::memset: -    return true; -  default: -    return false; -  } -} - -/// \brief Determine what kind of construct V is. -ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) { -  if (const Instruction *I = dyn_cast<Instruction>(V)) { -    // Any instruction other than bitcast and gep with a pointer operand have a -    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer -    // to a subsequent use, rather than using it themselves, in this sense. -    // As a short cut, several other opcodes are known to have no pointer -    // operands of interest. And ret is never followed by a release, so it's -    // not interesting to examine. -    switch (I->getOpcode()) { -    case Instruction::Call: { -      const CallInst *CI = cast<CallInst>(I); -      // See if we have a function that we know something about. -      if (const Function *F = CI->getCalledFunction()) { -        ARCInstKind Class = GetFunctionClass(F); -        if (Class != ARCInstKind::CallOrUser) -          return Class; -        Intrinsic::ID ID = F->getIntrinsicID(); -        if (isInertIntrinsic(ID)) -          return ARCInstKind::None; -        if (isUseOnlyIntrinsic(ID)) -          return ARCInstKind::User; -      } - -      // Otherwise, be conservative. 
-      return GetCallSiteClass(CI); -    } -    case Instruction::Invoke: -      // Otherwise, be conservative. -      return GetCallSiteClass(cast<InvokeInst>(I)); -    case Instruction::BitCast: -    case Instruction::GetElementPtr: -    case Instruction::Select: -    case Instruction::PHI: -    case Instruction::Ret: -    case Instruction::Br: -    case Instruction::Switch: -    case Instruction::IndirectBr: -    case Instruction::Alloca: -    case Instruction::VAArg: -    case Instruction::Add: -    case Instruction::FAdd: -    case Instruction::Sub: -    case Instruction::FSub: -    case Instruction::Mul: -    case Instruction::FMul: -    case Instruction::SDiv: -    case Instruction::UDiv: -    case Instruction::FDiv: -    case Instruction::SRem: -    case Instruction::URem: -    case Instruction::FRem: -    case Instruction::Shl: -    case Instruction::LShr: -    case Instruction::AShr: -    case Instruction::And: -    case Instruction::Or: -    case Instruction::Xor: -    case Instruction::SExt: -    case Instruction::ZExt: -    case Instruction::Trunc: -    case Instruction::IntToPtr: -    case Instruction::FCmp: -    case Instruction::FPTrunc: -    case Instruction::FPExt: -    case Instruction::FPToUI: -    case Instruction::FPToSI: -    case Instruction::UIToFP: -    case Instruction::SIToFP: -    case Instruction::InsertElement: -    case Instruction::ExtractElement: -    case Instruction::ShuffleVector: -    case Instruction::ExtractValue: -      break; -    case Instruction::ICmp: -      // Comparing a pointer with null, or any other constant, isn't an -      // interesting use, because we don't care what the pointer points to, or -      // about the values of any other dynamic reference-counted pointers. -      if (IsPotentialRetainableObjPtr(I->getOperand(1))) -        return ARCInstKind::User; -      break; -    default: -      // For anything else, check all the operands. -      // Note that this includes both operands of a Store: while the first -      // operand isn't actually being dereferenced, it is being stored to -      // memory where we can no longer track who might read it and dereference -      // it, so we have to consider it potentially used. -      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); -           OI != OE; ++OI) -        if (IsPotentialRetainableObjPtr(*OI)) -          return ARCInstKind::User; -    } -  } - -  // Otherwise, it's totally inert for ARC purposes. -  return ARCInstKind::None; -} - -/// \brief Test if the given class is a kind of user. 
-bool llvm::objcarc::IsUser(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::User: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::IntrinsicUser: -    return true; -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::RetainBlock: -  case ARCInstKind::Release: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::NoopCast: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::Call: -  case ARCInstKind::None: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class is objc_retain or equivalent. -bool llvm::objcarc::IsRetain(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -    return true; -  // I believe we treat retain block as not a retain since it can copy its -  // block. -  case ARCInstKind::RetainBlock: -  case ARCInstKind::Release: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::NoopCast: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool llvm::objcarc::IsAutorelease(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -    return true; -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::RetainBlock: -  case ARCInstKind::Release: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::NoopCast: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. 
-bool llvm::objcarc::IsForwarding(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::NoopCast: -    return true; -  case ARCInstKind::RetainBlock: -  case ARCInstKind::Release: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Release: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::RetainBlock: -    return true; -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -  case ARCInstKind::NoopCast: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) { -  // ARCInstKind::RetainBlock may be given a stack argument. -  switch (Class) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::AutoreleaseRV: -    return true; -  case ARCInstKind::Release: -  case ARCInstKind::Autorelease: -  case ARCInstKind::RetainBlock: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -  case ARCInstKind::NoopCast: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. 
-bool llvm::objcarc::IsNeverTail(ARCInstKind Class) { -  /// It is never safe to tail call objc_autorelease since by tail calling -  /// objc_autorelease: fast autoreleasing causing our object to be potentially -  /// reclaimed from the autorelease pool which violates the semantics of -  /// __autoreleasing types in ARC. -  switch (Class) { -  case ARCInstKind::Autorelease: -    return true; -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::Release: -  case ARCInstKind::RetainBlock: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -  case ARCInstKind::NoopCast: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool llvm::objcarc::IsNoThrow(ARCInstKind Class) { -  // objc_retainBlock is not nounwind because it calls user copy constructors -  // which could theoretically throw. -  switch (Class) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Release: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -    return true; -  case ARCInstKind::RetainBlock: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::User: -  case ARCInstKind::None: -  case ARCInstKind::NoopCast: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -/// -/// This means that it *could* interrupt the RV optimization. 
-bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) { -  switch (Class) { -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -    return true; -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Release: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::RetainBlock: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::User: -  case ARCInstKind::None: -  case ARCInstKind::NoopCast: -    return false; -  } -  llvm_unreachable("covered switch isn't covered?"); -} - -bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) { -  switch (Kind) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::NoopCast: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -  case ARCInstKind::IntrinsicUser: -  case ARCInstKind::User: -  case ARCInstKind::None: -    return false; - -  // The cases below are conservative. - -  // RetainBlock can result in user defined copy constructors being called -  // implying releases may occur. -  case ARCInstKind::RetainBlock: -  case ARCInstKind::Release: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::AutoreleasepoolPop: -  case ARCInstKind::LoadWeakRetained: -  case ARCInstKind::StoreWeak: -  case ARCInstKind::InitWeak: -  case ARCInstKind::LoadWeak: -  case ARCInstKind::MoveWeak: -  case ARCInstKind::CopyWeak: -  case ARCInstKind::DestroyWeak: -  case ARCInstKind::StoreStrong: -  case ARCInstKind::CallOrUser: -  case ARCInstKind::Call: -    return true; -  } - -  llvm_unreachable("covered switch isn't covered?"); -} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h deleted file mode 100644 index 636c65c9b627..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h +++ /dev/null @@ -1,123 +0,0 @@ -//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H - -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Function.h" - -namespace llvm { -namespace objcarc { - -/// \enum ARCInstKind -/// -/// \brief Equivalence classes of instructions in the ARC Model. -/// -/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, -/// we instead operate on equivalence classes of instructions. -/// -/// TODO: This should be split into two enums: a runtime entry point enum -/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals -/// with effects of instructions in the ARC model (which would handle the notion -/// of a User or CallOrUser). 
-enum class ARCInstKind { -  Retain,                   ///< objc_retain -  RetainRV,                 ///< objc_retainAutoreleasedReturnValue -  RetainBlock,              ///< objc_retainBlock -  Release,                  ///< objc_release -  Autorelease,              ///< objc_autorelease -  AutoreleaseRV,            ///< objc_autoreleaseReturnValue -  AutoreleasepoolPush,      ///< objc_autoreleasePoolPush -  AutoreleasepoolPop,       ///< objc_autoreleasePoolPop -  NoopCast,                 ///< objc_retainedObject, etc. -  FusedRetainAutorelease,   ///< objc_retainAutorelease -  FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue -  LoadWeakRetained,         ///< objc_loadWeakRetained (primitive) -  StoreWeak,                ///< objc_storeWeak (primitive) -  InitWeak,                 ///< objc_initWeak (derived) -  LoadWeak,                 ///< objc_loadWeak (derived) -  MoveWeak,                 ///< objc_moveWeak (derived) -  CopyWeak,                 ///< objc_copyWeak (derived) -  DestroyWeak,              ///< objc_destroyWeak (derived) -  StoreStrong,              ///< objc_storeStrong (derived) -  IntrinsicUser,            ///< clang.arc.use -  CallOrUser,               ///< could call objc_release and/or "use" pointers -  Call,                     ///< could call objc_release -  User,                     ///< could "use" a pointer -  None                      ///< anything that is inert from an ARC perspective. -}; - -raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); - -/// \brief Test if the given class is a kind of user. -bool IsUser(ARCInstKind Class); - -/// \brief Test if the given class is objc_retain or equivalent. -bool IsRetain(ARCInstKind Class); - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool IsAutorelease(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -bool IsForwarding(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -bool IsNoopOnNull(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool IsAlwaysTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -bool IsNeverTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool IsNoThrow(ARCInstKind Class); - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -bool CanInterruptRV(ARCInstKind Class); - -/// \brief Determine if F is one of the special known Functions.  If it isn't, -/// return ARCInstKind::CallOrUser. -ARCInstKind GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetARCInstKind except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline ARCInstKind GetBasicARCInstKind(const Value *V) { -  if (const CallInst *CI = dyn_cast<CallInst>(V)) { -    if (const Function *F = CI->getCalledFunction()) -      return GetFunctionClass(F); -    // Otherwise, be conservative. -    return ARCInstKind::CallOrUser; -  } - -  // Otherwise, be conservative. -  return isa<InvokeInst>(V) ? 
ARCInstKind::CallOrUser : ARCInstKind::User; -} - -/// Map V to its ARCInstKind equivalence class. -ARCInstKind GetARCInstKind(const Value *V); - -/// Returns false if conservatively we can prove that any instruction mapped to -/// this kind can not decrement ref counts. Returns true otherwise. -bool CanDecrementRefCount(ARCInstKind Kind); - -} // end namespace objcarc -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4edd02904b22..9d78e5ae3b9b 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -49,7 +49,7 @@ bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,    assert(CS && "Only calls can alter reference counts!");    // See if AliasAnalysis can help us with the call. -  AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); +  FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);    if (AliasAnalysis::onlyReadsMemory(MRB))      return false;    if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { @@ -226,7 +226,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor,                                  SmallPtrSetImpl<Instruction *> &DependingInsts,                                  SmallPtrSetImpl<const BasicBlock *> &Visited,                                  ProvenanceAnalysis &PA) { -  BasicBlock::iterator StartPos = StartInst; +  BasicBlock::iterator StartPos = StartInst->getIterator();    SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;    Worklist.push_back(std::make_pair(StartBB, StartPos)); @@ -252,7 +252,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor,          break;        } -      Instruction *Inst = --LocalStartPos; +      Instruction *Inst = &*--LocalStartPos;        if (Depends(Flavor, Inst, Arg, PA)) {          DependingInsts.insert(Inst);          break; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 6ea038b8ba8c..d860723bb460 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -26,18 +26,10 @@ namespace llvm {  using namespace llvm;  using namespace llvm::objcarc; -/// \brief A handy option to enable/disable all ARC Optimizations. -bool llvm::objcarc::EnableARCOpts; -static cl::opt<bool, true> -EnableARCOptimizations("enable-objc-arc-opts", -                       cl::desc("enable/disable all ARC Optimizations"), -                       cl::location(EnableARCOpts), -                       cl::init(true)); -  /// initializeObjCARCOptsPasses - Initialize all passes linked into the  /// ObjCARCOpts library.  
void llvm::initializeObjCARCOpts(PassRegistry &Registry) { -  initializeObjCARCAliasAnalysisPass(Registry); +  initializeObjCARCAAWrapperPassPass(Registry);    initializeObjCARCAPElimPass(Registry);    initializeObjCARCExpandPass(Registry);    initializeObjCARCContractPass(Registry); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 7595e2db1a7a..5fd45b00af17 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -26,6 +26,8 @@  #include "llvm/ADT/StringSwitch.h"  #include "llvm/ADT/Optional.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Analysis/ObjCARCInstKind.h"  #include "llvm/Analysis/Passes.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CallSite.h" @@ -34,7 +36,6 @@  #include "llvm/Pass.h"  #include "llvm/Transforms/ObjCARC.h"  #include "llvm/Transforms/Utils/Local.h" -#include "ARCInstKind.h"  namespace llvm {  class raw_ostream; @@ -43,99 +44,6 @@ class raw_ostream;  namespace llvm {  namespace objcarc { -/// \brief A handy option to enable/disable all ARC Optimizations. -extern bool EnableARCOpts; - -/// \brief Test if the given module looks interesting to run ARC optimization -/// on. -static inline bool ModuleHasARC(const Module &M) { -  return -    M.getNamedValue("objc_retain") || -    M.getNamedValue("objc_release") || -    M.getNamedValue("objc_autorelease") || -    M.getNamedValue("objc_retainAutoreleasedReturnValue") || -    M.getNamedValue("objc_retainBlock") || -    M.getNamedValue("objc_autoreleaseReturnValue") || -    M.getNamedValue("objc_autoreleasePoolPush") || -    M.getNamedValue("objc_loadWeakRetained") || -    M.getNamedValue("objc_loadWeak") || -    M.getNamedValue("objc_destroyWeak") || -    M.getNamedValue("objc_storeWeak") || -    M.getNamedValue("objc_initWeak") || -    M.getNamedValue("objc_moveWeak") || -    M.getNamedValue("objc_copyWeak") || -    M.getNamedValue("objc_retainedObject") || -    M.getNamedValue("objc_unretainedObject") || -    M.getNamedValue("objc_unretainedPointer") || -    M.getNamedValue("clang.arc.use"); -} - -/// \brief This is a wrapper around getUnderlyingObject which also knows how to -/// look through objc_retain and objc_autorelease calls, which we know to return -/// their argument verbatim. -static inline const Value *GetUnderlyingObjCPtr(const Value *V, -                                                const DataLayout &DL) { -  for (;;) { -    V = GetUnderlyingObject(V, DL); -    if (!IsForwarding(GetBasicARCInstKind(V))) -      break; -    V = cast<CallInst>(V)->getArgOperand(0); -  } - -  return V; -} - -/// The RCIdentity root of a value \p V is a dominating value U for which -/// retaining or releasing U is equivalent to retaining or releasing V. In other -/// words, ARC operations on \p V are equivalent to ARC operations on \p U. -/// -/// We use this in the ARC optimizer to make it easier to match up ARC -/// operations by always mapping ARC operations to RCIdentityRoots instead of -/// pointers themselves. -/// -/// The two ways that we see RCIdentical values in ObjC are via: -/// -///   1. PointerCasts -///   2. Forwarding Calls that return their argument verbatim. -/// -/// Thus this function strips off pointer casts and forwarding calls. *NOTE* -/// This implies that two RCIdentical values must alias. 
-static inline const Value *GetRCIdentityRoot(const Value *V) { -  for (;;) { -    V = V->stripPointerCasts(); -    if (!IsForwarding(GetBasicARCInstKind(V))) -      break; -    V = cast<CallInst>(V)->getArgOperand(0); -  } -  return V; -} - -/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just -/// casts away the const of the result. For documentation about what an -/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that -/// function. -static inline Value *GetRCIdentityRoot(Value *V) { -  return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); -} - -/// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the RCIdentity root of the argument of -/// the call. -static inline Value *GetArgRCIdentityRoot(Value *Inst) { -  return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); -} - -static inline bool IsNullOrUndef(const Value *V) { -  return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); -} - -static inline bool IsNoopInstruction(const Instruction *I) { -  return isa<BitCastInst>(I) || -    (isa<GetElementPtrInst>(I) && -     cast<GetElementPtrInst>(I)->hasAllZeroIndices()); -} - -  /// \brief Erase the given instruction.  ///  /// Many ObjC calls return their argument verbatim, @@ -162,152 +70,6 @@ static inline void EraseInstruction(Instruction *CI) {      RecursivelyDeleteTriviallyDeadInstructions(OldArg);  } -/// \brief Test whether the given value is possible a retainable object pointer. -static inline bool IsPotentialRetainableObjPtr(const Value *Op) { -  // Pointers to static or stack storage are not valid retainable object -  // pointers. -  if (isa<Constant>(Op) || isa<AllocaInst>(Op)) -    return false; -  // Special arguments can not be a valid retainable object pointer. -  if (const Argument *Arg = dyn_cast<Argument>(Op)) -    if (Arg->hasByValAttr() || -        Arg->hasInAllocaAttr() || -        Arg->hasNestAttr() || -        Arg->hasStructRetAttr()) -      return false; -  // Only consider values with pointer types. -  // -  // It seemes intuitive to exclude function pointer types as well, since -  // functions are never retainable object pointers, however clang occasionally -  // bitcasts retainable object pointers to function-pointer type temporarily. -  PointerType *Ty = dyn_cast<PointerType>(Op->getType()); -  if (!Ty) -    return false; -  // Conservatively assume anything else is a potential retainable object -  // pointer. -  return true; -} - -static inline bool IsPotentialRetainableObjPtr(const Value *Op, -                                               AliasAnalysis &AA) { -  // First make the rudimentary check. -  if (!IsPotentialRetainableObjPtr(Op)) -    return false; - -  // Objects in constant memory are not reference-counted. -  if (AA.pointsToConstantMemory(Op)) -    return false; - -  // Pointers in constant memory are not pointing to reference-counted objects. -  if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) -    if (AA.pointsToConstantMemory(LI->getPointerOperand())) -      return false; - -  // Otherwise assume the worst. -  return true; -} - -/// \brief Helper for GetARCInstKind. Determines what kind of construct CS -/// is. -static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { -  for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); -       I != E; ++I) -    if (IsPotentialRetainableObjPtr(*I)) -      return CS.onlyReadsMemory() ? 
ARCInstKind::User : ARCInstKind::CallOrUser; - -  return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; -} - -/// \brief Return true if this value refers to a distinct and identifiable -/// object. -/// -/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses -/// special knowledge of ObjC conventions. -static inline bool IsObjCIdentifiedObject(const Value *V) { -  // Assume that call results and arguments have their own "provenance". -  // Constants (including GlobalVariables) and Allocas are never -  // reference-counted. -  if (isa<CallInst>(V) || isa<InvokeInst>(V) || -      isa<Argument>(V) || isa<Constant>(V) || -      isa<AllocaInst>(V)) -    return true; - -  if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { -    const Value *Pointer = -      GetRCIdentityRoot(LI->getPointerOperand()); -    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { -      // A constant pointer can't be pointing to an object on the heap. It may -      // be reference-counted, but it won't be deleted. -      if (GV->isConstant()) -        return true; -      StringRef Name = GV->getName(); -      // These special variables are known to hold values which are not -      // reference-counted pointers. -      if (Name.startswith("\01l_objc_msgSend_fixup_")) -        return true; - -      StringRef Section = GV->getSection(); -      if (Section.find("__message_refs") != StringRef::npos || -          Section.find("__objc_classrefs") != StringRef::npos || -          Section.find("__objc_superrefs") != StringRef::npos || -          Section.find("__objc_methname") != StringRef::npos || -          Section.find("__cstring") != StringRef::npos) -        return true; -    } -  } - -  return false; -} - -enum class ARCMDKindID { -  ImpreciseRelease, -  CopyOnEscape, -  NoObjCARCExceptions, -}; - -/// A cache of MDKinds used by various ARC optimizations. -class ARCMDKindCache { -  Module *M; - -  /// The Metadata Kind for clang.imprecise_release metadata. -  llvm::Optional<unsigned> ImpreciseReleaseMDKind; - -  /// The Metadata Kind for clang.arc.copy_on_escape metadata. -  llvm::Optional<unsigned> CopyOnEscapeMDKind; - -  /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. 
-  llvm::Optional<unsigned> NoObjCARCExceptionsMDKind; - -public: -  void init(Module *Mod) { -    M = Mod; -    ImpreciseReleaseMDKind = NoneType::None; -    CopyOnEscapeMDKind = NoneType::None; -    NoObjCARCExceptionsMDKind = NoneType::None; -  } - -  unsigned get(ARCMDKindID ID) { -    switch (ID) { -    case ARCMDKindID::ImpreciseRelease: -      if (!ImpreciseReleaseMDKind) -        ImpreciseReleaseMDKind = -            M->getContext().getMDKindID("clang.imprecise_release"); -      return *ImpreciseReleaseMDKind; -    case ARCMDKindID::CopyOnEscape: -      if (!CopyOnEscapeMDKind) -        CopyOnEscapeMDKind = -            M->getContext().getMDKindID("clang.arc.copy_on_escape"); -      return *CopyOnEscapeMDKind; -    case ARCMDKindID::NoObjCARCExceptions: -      if (!NoObjCARCExceptionsMDKind) -        NoObjCARCExceptionsMDKind = -            M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); -      return *NoObjCARCExceptionsMDKind; -    } -    llvm_unreachable("Covered switch isn't covered?!"); -  } -}; -  } // end namespace objcarc  } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index d318643a359a..969e77c1f888 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -72,12 +72,9 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {    if (const Function *Callee = CS.getCalledFunction()) {      if (Callee->isDeclaration() || Callee->mayBeOverridden())        return true; -    for (Function::const_iterator I = Callee->begin(), E = Callee->end(); -         I != E; ++I) { -      const BasicBlock *BB = I; -      for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); -           J != F; ++J) -        if (ImmutableCallSite JCS = ImmutableCallSite(J)) +    for (const BasicBlock &BB : *Callee) { +      for (const Instruction &I : BB) +        if (ImmutableCallSite JCS = ImmutableCallSite(&I))            // This recursion depth limit is arbitrary. It's just great            // enough to cover known interesting testcases.            if (Depth < 3 && @@ -96,7 +93,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {    Instruction *Push = nullptr;    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { -    Instruction *Inst = I++; +    Instruction *Inst = &*I++;      switch (GetBasicARCInstKind(Inst)) {      case ARCInstKind::AutoreleasepoolPush:        Push = Inst; @@ -169,7 +166,7 @@ bool ObjCARCAPElim::runOnModule(Module &M) {      if (std::next(F->begin()) != F->end())        continue;      // Ok, a single-block constructor function definition. Try to optimize it. -    Changed |= OptimizeBB(F->begin()); +    Changed |= OptimizeBB(&F->front());    }    return Changed; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp deleted file mode 100644 index 3893aab76b2a..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -------------------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -/// \file -/// This file defines a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#include "ObjCARC.h" -#include "ObjCARCAliasAnalysis.h" -#include "llvm/IR/Instruction.h" -#include "llvm/InitializePasses.h" -#include "llvm/PassAnalysisSupport.h" -#include "llvm/PassSupport.h" - -#define DEBUG_TYPE "objc-arc-aa" - -namespace llvm { -  class Function; -  class Value; -} - -using namespace llvm; -using namespace llvm::objcarc; - -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", -                   "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { -  return new ObjCARCAliasAnalysis(); -} - -bool ObjCARCAliasAnalysis::doInitialization(Module &M) { -  InitializeAliasAnalysis(this, &M.getDataLayout()); -  return true; -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.setPreservesAll(); -  AliasAnalysis::getAnalysisUsage(AU); -} - -AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, -                                        const MemoryLocation &LocB) { -  if (!EnableARCOpts) -    return AliasAnalysis::alias(LocA, LocB); - -  // First, strip off no-ops, including ObjC-specific no-ops, and try making a -  // precise alias query. -  const Value *SA = GetRCIdentityRoot(LocA.Ptr); -  const Value *SB = GetRCIdentityRoot(LocB.Ptr); -  AliasResult Result = -      AliasAnalysis::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), -                           MemoryLocation(SB, LocB.Size, LocB.AATags)); -  if (Result != MayAlias) -    return Result; - -  // If that failed, climb to the underlying object, including climbing through -  // ObjC-specific no-ops, and try making an imprecise alias query. -  const Value *UA = GetUnderlyingObjCPtr(SA, *DL); -  const Value *UB = GetUnderlyingObjCPtr(SB, *DL); -  if (UA != SA || UB != SB) { -    Result = AliasAnalysis::alias(MemoryLocation(UA), MemoryLocation(UB)); -    // We can't use MustAlias or PartialAlias results here because -    // GetUnderlyingObjCPtr may return an offsetted pointer value. -    if (Result == NoAlias) -      return NoAlias; -  } - -  // If that failed, fail. We don't need to chain here, since that's covered -  // by the earlier precise query. -  return MayAlias; -} - -bool ObjCARCAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, -                                                  bool OrLocal) { -  if (!EnableARCOpts) -    return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - -  // First, strip off no-ops, including ObjC-specific no-ops, and try making -  // a precise alias query. 
-  const Value *S = GetRCIdentityRoot(Loc.Ptr); -  if (AliasAnalysis::pointsToConstantMemory( -          MemoryLocation(S, Loc.Size, Loc.AATags), OrLocal)) -    return true; - -  // If that failed, climb to the underlying object, including climbing through -  // ObjC-specific no-ops, and try making an imprecise alias query. -  const Value *U = GetUnderlyingObjCPtr(S, *DL); -  if (U != S) -    return AliasAnalysis::pointsToConstantMemory(MemoryLocation(U), OrLocal); - -  // If that failed, fail. We don't need to chain here, since that's covered -  // by the earlier precise query. -  return false; -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { -  // We have nothing to do. Just chain to the next AliasAnalysis. -  return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { -  if (!EnableARCOpts) -    return AliasAnalysis::getModRefBehavior(F); - -  switch (GetFunctionClass(F)) { -  case ARCInstKind::NoopCast: -    return DoesNotAccessMemory; -  default: -    break; -  } - -  return AliasAnalysis::getModRefBehavior(F); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, -                                    const MemoryLocation &Loc) { -  if (!EnableARCOpts) -    return AliasAnalysis::getModRefInfo(CS, Loc); - -  switch (GetBasicARCInstKind(CS.getInstruction())) { -  case ARCInstKind::Retain: -  case ARCInstKind::RetainRV: -  case ARCInstKind::Autorelease: -  case ARCInstKind::AutoreleaseRV: -  case ARCInstKind::NoopCast: -  case ARCInstKind::AutoreleasepoolPush: -  case ARCInstKind::FusedRetainAutorelease: -  case ARCInstKind::FusedRetainAutoreleaseRV: -    // These functions don't access any memory visible to the compiler. -    // Note that this doesn't include objc_retainBlock, because it updates -    // pointers when it copies block data. -    return NoModRef; -  default: -    break; -  } - -  return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, -                                    ImmutableCallSite CS2) { -  // TODO: Theoretically we could check for dependencies between objc_* calls -  // and OnlyAccessesArgumentPointees calls or other well-behaved calls. -  return AliasAnalysis::getModRefInfo(CS1, CS2); -} diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h deleted file mode 100644 index eecc82fe572c..000000000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ /dev/null @@ -1,74 +0,0 @@ -//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. 
Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Pass.h" - -namespace llvm { -namespace objcarc { - -  /// \brief This is a simple alias analysis implementation that uses knowledge -  /// of ARC constructs to answer queries. -  /// -  /// TODO: This class could be generalized to know about other ObjC-specific -  /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing -  /// even though their offsets are dynamic. -  class ObjCARCAliasAnalysis : public ImmutablePass, -                               public AliasAnalysis { -  public: -    static char ID; // Class identification, replacement for typeinfo -    ObjCARCAliasAnalysis() : ImmutablePass(ID) { -      initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); -    } - -  private: -    bool doInitialization(Module &M) override; - -    /// This method is used when a pass implements an analysis interface through -    /// multiple inheritance.  If needed, it should override this to adjust the -    /// this pointer as needed for the specified pass info. -    void *getAdjustedAnalysisPointer(const void *PI) override { -      if (PI == &AliasAnalysis::ID) -        return static_cast<AliasAnalysis *>(this); -      return this; -    } - -    void getAnalysisUsage(AnalysisUsage &AU) const override; -    AliasResult alias(const MemoryLocation &LocA, -                      const MemoryLocation &LocB) override; -    bool pointsToConstantMemory(const MemoryLocation &Loc, -                                bool OrLocal) override; -    ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; -    ModRefBehavior getModRefBehavior(const Function *F) override; -    ModRefResult getModRefInfo(ImmutableCallSite CS, -                               const MemoryLocation &Loc) override; -    ModRefResult getModRefInfo(ImmutableCallSite CS1, -                               ImmutableCallSite CS2) override; -  }; - -} // namespace objcarc -} // namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index baca76ba3f2a..1cdf5689f42a 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -119,9 +119,9 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {      return false;    // Check that the call is next to the retain. -  BasicBlock::const_iterator I = Call; -  ++I; -  while (IsNoopInstruction(I)) ++I; +  BasicBlock::const_iterator I = ++Call->getIterator(); +  while (IsNoopInstruction(&*I)) +    ++I;    if (&*I != Retain)      return false; @@ -247,7 +247,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,      // Ok, now we know we have not seen a store yet. See if Inst can write to      // our load location, if it can not, just ignore the instruction. 
-    if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) +    if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod))        continue;      Store = dyn_cast<StoreInst>(Inst); @@ -282,9 +282,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,                                      Instruction *Release,                                      ProvenanceAnalysis &PA) {    // Walk up from the Store to find the retain. -  BasicBlock::iterator I = Store; +  BasicBlock::iterator I = Store->getIterator();    BasicBlock::iterator Begin = Store->getParent()->begin(); -  while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { +  while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {      Instruction *Inst = &*I;      // It is only safe to move the retain to the store if we can prove @@ -294,7 +294,7 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store,        return nullptr;      --I;    } -  Instruction *Retain = I; +  Instruction *Retain = &*I;    if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)      return nullptr;    if (GetArgRCIdentityRoot(Retain) != New) @@ -429,7 +429,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(        // insert it now.        if (!RetainRVMarker)          return false; -      BasicBlock::iterator BBI = Inst; +      BasicBlock::iterator BBI = Inst->getIterator();        BasicBlock *InstParent = Inst->getParent();        // Step up to see if the call immediately precedes the RetainRV call. @@ -440,11 +440,11 @@ bool ObjCARCContract::tryToPeepholeInstruction(            BasicBlock *Pred = InstParent->getSinglePredecessor();            if (!Pred)              goto decline_rv_optimization; -          BBI = Pred->getTerminator(); +          BBI = Pred->getTerminator()->getIterator();            break;          }          --BBI; -      } while (IsNoopInstruction(BBI)); +      } while (IsNoopInstruction(&*BBI));        if (&*BBI == GetArgRCIdentityRoot(Inst)) {          DEBUG(dbgs() << "Adding inline asm marker for " @@ -511,10 +511,10 @@ bool ObjCARCContract::runOnFunction(Function &F) {      return false;    Changed = false; -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  PA.setAA(&getAnalysis<AliasAnalysis>()); +  PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());    DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); @@ -629,13 +629,13 @@ bool ObjCARCContract::runOnFunction(Function &F) {  char ObjCARCContract::ID = 0;  INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract",                        "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract",                      "ObjC ARC contraction", false, false)  void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<AliasAnalysis>(); +  AU.addRequired<AAResultsWrapperPass>();    AU.addRequired<DominatorTreeWrapperPass>();    AU.setPreservesCFG();  } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 9edbb17e8d1b..f0ee6e2be487 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -28,7 +28,6 @@  #include "ARCRuntimeEntryPoints.h"  #include "BlotMapVector.h"  #include 
"DependencyAnalysis.h" -#include "ObjCARCAliasAnalysis.h"  #include "ProvenanceAnalysis.h"  #include "PtrState.h"  #include "llvm/ADT/DenseMap.h" @@ -36,6 +35,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/IRBuilder.h"  #include "llvm/IR/LLVMContext.h" @@ -482,7 +482,7 @@ namespace {      /// A flag indicating whether this optimization pass should run.      bool Run; -    /// Flags which determine whether each of the interesting runtine functions +    /// Flags which determine whether each of the interesting runtime functions      /// is in fact used in the current function.      unsigned UsedInThisFunction; @@ -556,7 +556,7 @@ namespace {  char ObjCARCOpt::ID = 0;  INITIALIZE_PASS_BEGIN(ObjCARCOpt,                        "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)  INITIALIZE_PASS_END(ObjCARCOpt,                      "objc-arc", "ObjC ARC optimization", false, false) @@ -565,8 +565,8 @@ Pass *llvm::createObjCARCOptPass() {  }  void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<ObjCARCAliasAnalysis>(); -  AU.addRequired<AliasAnalysis>(); +  AU.addRequired<ObjCARCAAWrapperPass>(); +  AU.addRequired<AAResultsWrapperPass>();    // ARC optimization doesn't currently split critical edges.    AU.setPreservesCFG();  } @@ -581,16 +581,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {    ImmutableCallSite CS(Arg);    if (const Instruction *Call = CS.getInstruction()) {      if (Call->getParent() == RetainRV->getParent()) { -      BasicBlock::const_iterator I = Call; +      BasicBlock::const_iterator I(Call);        ++I; -      while (IsNoopInstruction(I)) ++I; +      while (IsNoopInstruction(&*I)) +        ++I;        if (&*I == RetainRV)          return false;      } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {        BasicBlock *RetainRVParent = RetainRV->getParent();        if (II->getNormalDest() == RetainRVParent) {          BasicBlock::const_iterator I = RetainRVParent->begin(); -        while (IsNoopInstruction(I)) ++I; +        while (IsNoopInstruction(&*I)) +          ++I;          if (&*I == RetainRV)            return false;        } @@ -599,18 +601,21 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {    // Check for being preceded by an objc_autoreleaseReturnValue on the same    // pointer. In this case, we can delete the pair. 
-  BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); +  BasicBlock::iterator I = RetainRV->getIterator(), +                       Begin = RetainRV->getParent()->begin();    if (I != Begin) { -    do --I; while (I != Begin && IsNoopInstruction(I)); -    if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && -        GetArgRCIdentityRoot(I) == Arg) { +    do +      --I; +    while (I != Begin && IsNoopInstruction(&*I)); +    if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && +        GetArgRCIdentityRoot(&*I) == Arg) {        Changed = true;        ++NumPeeps;        DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"                     << "Erasing " << *RetainRV << "\n"); -      EraseInstruction(I); +      EraseInstruction(&*I);        EraseInstruction(RetainRV);        return true;      } @@ -1216,7 +1221,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,    // Visit all the instructions, bottom-up.    for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { -    Instruction *Inst = std::prev(I); +    Instruction *Inst = &*std::prev(I);      // Invoke instructions are visited as part of their successors (below).      if (isa<InvokeInst>(Inst)) @@ -1264,7 +1269,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,      Arg = GetArgRCIdentityRoot(Inst);      TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);      NestingDetected |= S.InitTopDown(Class, Inst); -    // A retain can be a potential use; procede to the generic checking +    // A retain can be a potential use; proceed to the generic checking      // code below.      break;    } @@ -1342,12 +1347,10 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,                       << "Performing Dataflow:\n");    // Visit all the instructions, top-down. -  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { -    Instruction *Inst = I; +  for (Instruction &Inst : *BB) { +    DEBUG(dbgs() << "    Visiting " << Inst << "\n"); -    DEBUG(dbgs() << "    Visiting " << *Inst << "\n"); - -    NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); +    NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);    }    DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n" @@ -1413,16 +1416,15 @@ ComputePostOrders(Function &F,    // Functions may have many exits, and there also blocks which we treat    // as exits due to ignored edges.    SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; -  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { -    BasicBlock *ExitBB = I; -    BBState &MyStates = BBStates[ExitBB]; +  for (BasicBlock &ExitBB : F) { +    BBState &MyStates = BBStates[&ExitBB];      if (!MyStates.isExit())        continue;      MyStates.SetAsExit(); -    PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); -    Visited.insert(ExitBB); +    PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin())); +    Visited.insert(&ExitBB);      while (!PredStack.empty()) {      reverse_dfs_next_succ:        BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); @@ -1830,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {      // analysis too, but that would want caching. A better approach would be to      // use the technique that EarlyCSE uses.      
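
A second recurring cleanup in these hunks replaces explicit begin()/end() iterator loops with range-based for over blocks and instructions, taking &BB or &I only where an existing interface still wants a pointer. A small sketch under the same LLVM-header assumption; countStores is a made-up example, not code from the patch:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Count store instructions in F using the range-based form these hunks adopt.
static unsigned countStores(Function &F) {
  unsigned N = 0;
  for (BasicBlock &BB : F)       // was: Function::iterator FI = F.begin() ...
    for (Instruction &I : BB)    // was: BasicBlock::iterator BI = BB.begin() ...
      if (isa<StoreInst>(&I))    // take &I where older code passed the iterator
        ++N;
  return N;
}
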
inst_iterator Current = std::prev(I); -    BasicBlock *CurrentBB = Current.getBasicBlockIterator(); +    BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();      for (BasicBlock::iterator B = CurrentBB->begin(),                                J = Current.getInstructionIterator();           J != B; --J) { @@ -2008,10 +2010,7 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,    // Check that the call is a regular call.    ARCInstKind Class = GetBasicARCInstKind(Call); -  if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) -    return false; - -  return true; +  return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call;  }  /// Find a dependent retain that precedes the given autorelease for which there @@ -2081,9 +2080,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {    SmallPtrSet<Instruction *, 4> DependingInstructions;    SmallPtrSet<const BasicBlock *, 4> Visited; -  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { -    BasicBlock *BB = FI; -    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); +  for (BasicBlock &BB: F) { +    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());      DEBUG(dbgs() << "Visiting: " << *Ret << "\n"); @@ -2095,19 +2093,16 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {      // Look for an ``autorelease'' instruction that is a predecessor of Ret and      // dependent on Arg such that there are no instructions dependent on Arg      // that need a positive ref count in between the autorelease and Ret. -    CallInst *Autorelease = -      FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret, -                                             DependingInstructions, Visited, -                                             PA); +    CallInst *Autorelease = FindPredecessorAutoreleaseWithSafePath( +        Arg, &BB, Ret, DependingInstructions, Visited, PA);      DependingInstructions.clear();      Visited.clear();      if (!Autorelease)        continue; -    CallInst *Retain = -      FindPredecessorRetainWithSafePath(Arg, BB, Autorelease, -                                        DependingInstructions, Visited, PA); +    CallInst *Retain = FindPredecessorRetainWithSafePath( +        Arg, &BB, Autorelease, DependingInstructions, Visited, PA);      DependingInstructions.clear();      Visited.clear(); @@ -2192,7 +2187,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) {    DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>"          "\n"); -  PA.setAA(&getAnalysis<AliasAnalysis>()); +  PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());  #ifndef NDEBUG    if (AreStatisticsEnabled()) { diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 0ac41d3ea326..1a12b659e5a3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -26,10 +26,10 @@  #define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H  #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h"  namespace llvm {    class Value; -  class AliasAnalysis;    class DataLayout;    class PHINode;    class SelectInst; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index 0be75af52014..c274e8182fb5 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ 
b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -35,7 +35,7 @@ char PAEval::ID = 0;  PAEval::PAEval() : FunctionPass(ID) {}  void PAEval::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<AliasAnalysis>(); +  AU.addRequired<AAResultsWrapperPass>();  }  static StringRef getName(Value *V) { @@ -65,7 +65,7 @@ bool PAEval::runOnFunction(Function &F) {    }    ProvenanceAnalysis PA; -  PA.setAA(&getAnalysis<AliasAnalysis>()); +  PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());    const DataLayout &DL = F.getParent()->getDataLayout();    for (Value *V1 : Values) { @@ -89,6 +89,6 @@ FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }  INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",                        "Evaluate ProvenanceAnalysis on all pairs", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_END(PAEval, "pa-eval",                      "Evaluate ProvenanceAnalysis on all pairs", false, true) diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp index ae20e7e6d347..df64fa32f3f8 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -256,9 +256,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,        // one of its successor blocks, since we can't insert code after it        // in its own block, and we don't want to split critical edges.        if (isa<InvokeInst>(Inst)) -        InsertReverseInsertPt(BB->getFirstInsertionPt()); +        InsertReverseInsertPt(&*BB->getFirstInsertionPt());        else -        InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); +        InsertReverseInsertPt(&*++Inst->getIterator());        SetSeq(S_Use);      } else if (Seq == S_Release && IsUser(Class)) {        DEBUG(dbgs() << "            PreciseReleaseUse: Seq: " << GetSeq() << "; " @@ -268,9 +268,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,        assert(!HasReverseInsertPts());        // As above; handle invoke specially.        if (isa<InvokeInst>(Inst)) -        InsertReverseInsertPt(BB->getFirstInsertionPt()); +        InsertReverseInsertPt(&*BB->getFirstInsertionPt());        else -        InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); +        InsertReverseInsertPt(&*++Inst->getIterator());      }      break;    case S_Stop: diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h index e45e1ea96c53..9749e44822b2 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -17,8 +17,8 @@  #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H  #define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H -#include "ARCInstKind.h"  #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/ObjCARCInstKind.h"  #include "llvm/IR/Instruction.h"  #include "llvm/IR/Value.h"  #include "llvm/Support/raw_ostream.h" @@ -96,7 +96,7 @@ struct RRInfo {  };  /// \brief This class summarizes several per-pointer runtime properties which -/// are propogated through the flow graph. +/// are propagated through the flow graph.  class PtrState {  protected:    /// True if the reference count is known to be incremented. 
@@ -172,7 +172,7 @@ struct BottomUpPtrState : PtrState {    bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);    /// Return true if this set of releases can be paired with a release. Modifies -  /// state appropriately to reflect that the matching occured if it is +  /// state appropriately to reflect that the matching occurred if it is    /// successful.    ///    /// It is assumed that one has already checked that the RCIdentity of the @@ -194,7 +194,7 @@ struct TopDownPtrState : PtrState {    /// Return true if this set of retains can be paired with the given    /// release. Modifies state appropriately to reflect that the matching -  /// occured. +  /// occurred.    bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);    void HandlePotentialUse(Instruction *Inst, const Value *Ptr, diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index d6fc91641588..590a52da6b19 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -1,4 +1,4 @@ -//===- DCE.cpp - Code to perform dead code elimination --------------------===// +//===- ADCE.cpp - Code to perform dead code elimination -------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,52 +14,33 @@  //  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h"  #include "llvm/ADT/DepthFirstIterator.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/InstIterator.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h"  using namespace llvm;  #define DEBUG_TYPE "adce"  STATISTIC(NumRemoved, "Number of instructions removed"); -namespace { -struct ADCE : public FunctionPass { -  static char ID; // Pass identification, replacement for typeid -  ADCE() : FunctionPass(ID) { -    initializeADCEPass(*PassRegistry::getPassRegistry()); -  } - -  bool runOnFunction(Function& F) override; - -  void getAnalysisUsage(AnalysisUsage& AU) const override { -    AU.setPreservesCFG(); -  } -}; -} - -char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) - -bool ADCE::runOnFunction(Function& F) { -  if (skipOptnoneFunction(F)) -    return false; - +static bool aggressiveDCE(Function& F) {    SmallPtrSet<Instruction*, 128> Alive;    SmallVector<Instruction*, 128> Worklist;    // Collect the set of "root" instructions that are known live. -  for (Instruction &I : inst_range(F)) { -    if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || -        isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { +  for (Instruction &I : instructions(F)) { +    if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || +        I.mayHaveSideEffects()) {        Alive.insert(&I);        Worklist.push_back(&I);      } @@ -79,7 +60,7 @@ bool ADCE::runOnFunction(Function& F) {    // which have no side effects and do not influence the control flow or return    // value of the function, and may therefore be deleted safely.    // NOTE: We reuse the Worklist vector here for memory efficiency. 
-  for (Instruction &I : inst_range(F)) { +  for (Instruction &I : instructions(F)) {      if (!Alive.count(&I)) {        Worklist.push_back(&I);        I.dropAllReferences(); @@ -94,6 +75,34 @@ bool ADCE::runOnFunction(Function& F) {    return !Worklist.empty();  } -FunctionPass *llvm::createAggressiveDCEPass() { -  return new ADCE(); +PreservedAnalyses ADCEPass::run(Function &F) { +  if (aggressiveDCE(F)) +    return PreservedAnalyses::none(); +  return PreservedAnalyses::all();  } + +namespace { +struct ADCELegacyPass : public FunctionPass { +  static char ID; // Pass identification, replacement for typeid +  ADCELegacyPass() : FunctionPass(ID) { +    initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnFunction(Function& F) override { +    if (skipOptnoneFunction(F)) +      return false; +    return aggressiveDCE(F); +  } + +  void getAnalysisUsage(AnalysisUsage& AU) const override { +    AU.setPreservesCFG(); +    AU.addPreserved<GlobalsAAWrapperPass>(); +  } +}; +} + +char ADCELegacyPass::ID = 0; +INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", +                false, false) + +FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8918909f484a..4b721d38adba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,6 +21,8 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/ScalarEvolution.h" @@ -54,13 +56,15 @@ struct AlignmentFromAssumptions : public FunctionPass {    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<AssumptionCacheTracker>(); -    AU.addRequired<ScalarEvolution>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<DominatorTreeWrapperPass>();      AU.setPreservesCFG(); +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();      AU.addPreserved<LoopInfoWrapperPass>();      AU.addPreserved<DominatorTreeWrapperPass>(); -    AU.addPreserved<ScalarEvolution>(); +    AU.addPreserved<ScalarEvolutionWrapperPass>();    }    // For memory transfers, we need a common alignment for both the source and @@ -84,7 +88,7 @@ INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,                        aip_name, false, false)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,                      aip_name, false, false) @@ -249,8 +253,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,    // The mask must have some trailing ones (otherwise the condition is    // trivial and tells us nothing about the alignment of the left operand). 
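
The trailing-ones check above encodes simple arithmetic: from a guard of the form assume((ptrtoint p & Mask) == 0), the low countTrailingOnes(Mask) bits of the address are known to be zero, so p is provably aligned to 1 << countTrailingOnes(Mask). A self-contained illustration of that arithmetic (plain integers here; the pass itself works on APInt and SCEV expressions):

#include <cassert>
#include <cstdint>

// Alignment provable from a mask used in assume((addr & Mask) == 0).
static uint64_t alignmentFromMask(uint64_t Mask) {
  unsigned TrailingOnes = 0;
  while (Mask & 1) {             // count the mask's trailing one bits
    ++TrailingOnes;
    Mask >>= 1;
  }
  return TrailingOnes ? (uint64_t)1 << TrailingOnes : 1;
}

int main() {
  assert(alignmentFromMask(0xF) == 16); // (addr & 15) == 0  =>  16-byte aligned
  assert(alignmentFromMask(0x7) == 8);  // (addr & 7) == 0   =>  8-byte aligned
  assert(alignmentFromMask(0x0) == 1);  // trivial mask proves nothing
  return 0;
}
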
-  unsigned TrailingOnes = -    MaskSCEV->getValue()->getValue().countTrailingOnes(); +  unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();    if (!TrailingOnes)      return false; @@ -270,7 +273,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,    OffSCEV = nullptr;    if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {      AAPtr = PToI->getPointerOperand(); -    OffSCEV = SE->getConstant(Int64Ty, 0); +    OffSCEV = SE->getZero(Int64Ty);    } else if (const SCEVAddExpr* AndLHSAddSCEV =               dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {      // Try to find the ptrtoint; subtract it and the rest is the offset. @@ -410,7 +413,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {  bool AlignmentFromAssumptions::runOnFunction(Function &F) {    bool Changed = false;    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); -  SE = &getAnalysis<ScalarEvolution>(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    NewDestAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 09c605e76737..cb9b8b6fffc8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,26 +15,18 @@  //===----------------------------------------------------------------------===//  #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/DemandedBits.h"  #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h"  #include "llvm/IR/InstIterator.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h"  #include "llvm/IR/Operator.h"  #include "llvm/Pass.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" -  using namespace llvm;  #define DEBUG_TYPE "bdce" @@ -53,342 +45,42 @@ struct BDCE : public FunctionPass {    void getAnalysisUsage(AnalysisUsage& AU) const override {      AU.setPreservesCFG(); -    AU.addRequired<AssumptionCacheTracker>(); -    AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addRequired<DemandedBits>(); +    AU.addPreserved<GlobalsAAWrapperPass>();    } - -  void determineLiveOperandBits(const Instruction *UserI, -                                const Instruction *I, unsigned OperandNo, -                                const APInt &AOut, APInt &AB, -                                APInt &KnownZero, APInt &KnownOne, -                                APInt &KnownZero2, APInt &KnownOne2); - -  AssumptionCache *AC; -  DominatorTree *DT;  };  }  char BDCE::ID = 0;  INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",                        false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DemandedBits)  INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",                      false, false) -static bool isAlwaysLive(Instruction *I) { -  return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || -         isa<LandingPadInst>(I) || I->mayHaveSideEffects(); -} - 
-void BDCE::determineLiveOperandBits(const Instruction *UserI, -                                    const Instruction *I, unsigned OperandNo, -                                    const APInt &AOut, APInt &AB, -                                    APInt &KnownZero, APInt &KnownOne, -                                    APInt &KnownZero2, APInt &KnownOne2) { -  unsigned BitWidth = AB.getBitWidth(); - -  // We're called once per operand, but for some instructions, we need to -  // compute known bits of both operands in order to determine the live bits of -  // either (when both operands are instructions themselves). We don't, -  // however, want to do this twice, so we cache the result in APInts that live -  // in the caller. For the two-relevant-operands case, both operand values are -  // provided here. -  auto ComputeKnownBits = -      [&](unsigned BitWidth, const Value *V1, const Value *V2) { -        const DataLayout &DL = I->getModule()->getDataLayout(); -        KnownZero = APInt(BitWidth, 0); -        KnownOne = APInt(BitWidth, 0); -        computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, -                         AC, UserI, DT); - -        if (V2) { -          KnownZero2 = APInt(BitWidth, 0); -          KnownOne2 = APInt(BitWidth, 0); -          computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, -                           0, AC, UserI, DT); -        } -      }; - -  switch (UserI->getOpcode()) { -  default: break; -  case Instruction::Call: -  case Instruction::Invoke: -    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) -      switch (II->getIntrinsicID()) { -      default: break; -      case Intrinsic::bswap: -        // The alive bits of the input are the swapped alive bits of -        // the output. -        AB = AOut.byteSwap(); -        break; -      case Intrinsic::ctlz: -        if (OperandNo == 0) { -          // We need some output bits, so we need all bits of the -          // input to the left of, and including, the leftmost bit -          // known to be one. -          ComputeKnownBits(BitWidth, I, nullptr); -          AB = APInt::getHighBitsSet(BitWidth, -                 std::min(BitWidth, KnownOne.countLeadingZeros()+1)); -        } -        break; -      case Intrinsic::cttz: -        if (OperandNo == 0) { -          // We need some output bits, so we need all bits of the -          // input to the right of, and including, the rightmost bit -          // known to be one. -          ComputeKnownBits(BitWidth, I, nullptr); -          AB = APInt::getLowBitsSet(BitWidth, -                 std::min(BitWidth, KnownOne.countTrailingZeros()+1)); -        } -        break; -      } -    break; -  case Instruction::Add: -  case Instruction::Sub: -    // Find the highest live output bit. We don't need any more input -    // bits than that (adds, and thus subtracts, ripple only to the -    // left). -    AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); -    break; -  case Instruction::Shl: -    if (OperandNo == 0) -      if (ConstantInt *CI = -            dyn_cast<ConstantInt>(UserI->getOperand(1))) { -        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); -        AB = AOut.lshr(ShiftAmt); - -        // If the shift is nuw/nsw, then the high bits are not dead -        // (because we've promised that they *must* be zero). 
-        const ShlOperator *S = cast<ShlOperator>(UserI); -        if (S->hasNoSignedWrap()) -          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); -        else if (S->hasNoUnsignedWrap()) -          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); -      } -    break; -  case Instruction::LShr: -    if (OperandNo == 0) -      if (ConstantInt *CI = -            dyn_cast<ConstantInt>(UserI->getOperand(1))) { -        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); -        AB = AOut.shl(ShiftAmt); - -        // If the shift is exact, then the low bits are not dead -        // (they must be zero). -        if (cast<LShrOperator>(UserI)->isExact()) -          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); -      } -    break; -  case Instruction::AShr: -    if (OperandNo == 0) -      if (ConstantInt *CI = -            dyn_cast<ConstantInt>(UserI->getOperand(1))) { -        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); -        AB = AOut.shl(ShiftAmt); -        // Because the high input bit is replicated into the -        // high-order bits of the result, if we need any of those -        // bits, then we must keep the highest input bit. -        if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) -            .getBoolValue()) -          AB.setBit(BitWidth-1); - -        // If the shift is exact, then the low bits are not dead -        // (they must be zero). -        if (cast<AShrOperator>(UserI)->isExact()) -          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); -      } -    break; -  case Instruction::And: -    AB = AOut; - -    // For bits that are known zero, the corresponding bits in the -    // other operand are dead (unless they're both zero, in which -    // case they can't both be dead, so just mark the LHS bits as -    // dead). -    if (OperandNo == 0) { -      ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); -      AB &= ~KnownZero2; -    } else { -      if (!isa<Instruction>(UserI->getOperand(0))) -        ComputeKnownBits(BitWidth, UserI->getOperand(0), I); -      AB &= ~(KnownZero & ~KnownZero2); -    } -    break; -  case Instruction::Or: -    AB = AOut; - -    // For bits that are known one, the corresponding bits in the -    // other operand are dead (unless they're both one, in which -    // case they can't both be dead, so just mark the LHS bits as -    // dead). -    if (OperandNo == 0) { -      ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); -      AB &= ~KnownOne2; -    } else { -      if (!isa<Instruction>(UserI->getOperand(0))) -        ComputeKnownBits(BitWidth, UserI->getOperand(0), I); -      AB &= ~(KnownOne & ~KnownOne2); -    } -    break; -  case Instruction::Xor: -  case Instruction::PHI: -    AB = AOut; -    break; -  case Instruction::Trunc: -    AB = AOut.zext(BitWidth); -    break; -  case Instruction::ZExt: -    AB = AOut.trunc(BitWidth); -    break; -  case Instruction::SExt: -    AB = AOut.trunc(BitWidth); -    // Because the high input bit is replicated into the -    // high-order bits of the result, if we need any of those -    // bits, then we must keep the highest input bit. 
-    if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), -                                      AOut.getBitWidth() - BitWidth)) -        .getBoolValue()) -      AB.setBit(BitWidth-1); -    break; -  case Instruction::Select: -    if (OperandNo != 0) -      AB = AOut; -    break; -  } -} -  bool BDCE::runOnFunction(Function& F) {    if (skipOptnoneFunction(F))      return false; +  DemandedBits &DB = getAnalysis<DemandedBits>(); -  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); -  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - -  DenseMap<Instruction *, APInt> AliveBits;    SmallVector<Instruction*, 128> Worklist; - -  // The set of visited instructions (non-integer-typed only). -  SmallPtrSet<Instruction*, 128> Visited; - -  // Collect the set of "root" instructions that are known live. -  for (Instruction &I : inst_range(F)) { -    if (!isAlwaysLive(&I)) -      continue; - -    DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); -    // For integer-valued instructions, set up an initial empty set of alive -    // bits and add the instruction to the work list. For other instructions -    // add their operands to the work list (for integer values operands, mark -    // all bits as live). -    if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { -      if (!AliveBits.count(&I)) { -        AliveBits[&I] = APInt(IT->getBitWidth(), 0); -        Worklist.push_back(&I); -      } - -      continue; -    } - -    // Non-integer-typed instructions... -    for (Use &OI : I.operands()) { -      if (Instruction *J = dyn_cast<Instruction>(OI)) { -        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) -          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); -        Worklist.push_back(J); -      } -    } -    // To save memory, we don't add I to the Visited set here. Instead, we -    // check isAlwaysLive on every instruction when searching for dead -    // instructions later (we need to check isAlwaysLive for the -    // integer-typed instructions anyway). -  } - -  // Propagate liveness backwards to operands. -  while (!Worklist.empty()) { -    Instruction *UserI = Worklist.pop_back_val(); - -    DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); -    APInt AOut; -    if (UserI->getType()->isIntegerTy()) { -      AOut = AliveBits[UserI]; -      DEBUG(dbgs() << " Alive Out: " << AOut); -    } -    DEBUG(dbgs() << "\n"); - -    if (!UserI->getType()->isIntegerTy()) -      Visited.insert(UserI); - -    APInt KnownZero, KnownOne, KnownZero2, KnownOne2; -    // Compute the set of alive bits for each operand. These are anded into the -    // existing set, if any, and if that changes the set of alive bits, the -    // operand is added to the work-list. -    for (Use &OI : UserI->operands()) { -      if (Instruction *I = dyn_cast<Instruction>(OI)) { -        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { -          unsigned BitWidth = IT->getBitWidth(); -          APInt AB = APInt::getAllOnesValue(BitWidth); -          if (UserI->getType()->isIntegerTy() && !AOut && -              !isAlwaysLive(UserI)) { -            AB = APInt(BitWidth, 0); -          } else { -            // If all bits of the output are dead, then all bits of the input  -            // Bits of each operand that are used to compute alive bits of the -            // output are alive, all others are dead. 
-            determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, -                                     KnownZero, KnownOne, -                                     KnownZero2, KnownOne2); -          } - -          // If we've added to the set of alive bits (or the operand has not -          // been previously visited), then re-queue the operand to be visited -          // again. -          APInt ABPrev(BitWidth, 0); -          auto ABI = AliveBits.find(I); -          if (ABI != AliveBits.end()) -            ABPrev = ABI->second; - -          APInt ABNew = AB | ABPrev; -          if (ABNew != ABPrev || ABI == AliveBits.end()) { -            AliveBits[I] = std::move(ABNew); -            Worklist.push_back(I); -          } -        } else if (!Visited.count(I)) { -          Worklist.push_back(I); -        } -      } -    } -  } -    bool Changed = false; -  // The inverse of the live set is the dead set.  These are those instructions -  // which have no side effects and do not influence the control flow or return -  // value of the function, and may therefore be deleted safely. -  // NOTE: We reuse the Worklist vector here for memory efficiency. -  for (Instruction &I : inst_range(F)) { -    // For live instructions that have all dead bits, first make them dead by -    // replacing all uses with something else. Then, if they don't need to -    // remain live (because they have side effects, etc.) we can remove them. -    if (I.getType()->isIntegerTy()) { -      auto ABI = AliveBits.find(&I); -      if (ABI != AliveBits.end()) { -        if (ABI->second.getBoolValue()) -          continue; - -        DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); -        // FIXME: In theory we could substitute undef here instead of zero. -        // This should be reconsidered once we settle on the semantics of -        // undef, poison, etc. -        Value *Zero = ConstantInt::get(I.getType(), 0); -        ++NumSimplified; -        I.replaceAllUsesWith(Zero); -        Changed = true; -      } -    } else if (Visited.count(&I)) { -      continue; +  for (Instruction &I : instructions(F)) { +    if (I.getType()->isIntegerTy() && +        !DB.getDemandedBits(&I).getBoolValue()) { +      // For live instructions that have all dead bits, first make them dead by +      // replacing all uses with something else. Then, if they don't need to +      // remain live (because they have side effects, etc.) we can remove them. +      DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); +      // FIXME: In theory we could substitute undef here instead of zero. +      // This should be reconsidered once we settle on the semantics of +      // undef, poison, etc. +      Value *Zero = ConstantInt::get(I.getType(), 0); +      ++NumSimplified; +      I.replaceAllUsesWith(Zero); +      Changed = true;      } - -    if (isAlwaysLive(&I)) +    if (!DB.isInstructionDead(&I))        continue;      Worklist.push_back(&I); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 4288742dd3eb..84f7f5fff5b5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -223,10 +223,10 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,    }    // The simple and common case. This also includes constant expressions. 
-  if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) +  if (!isa<PHINode>(Inst) && !Inst->isEHPad())      return Inst; -  // We can't insert directly before a phi node or landing pad. Insert before +  // We can't insert directly before a phi node or an eh pad. Insert before    // the terminator of the incoming or dominating block.    assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");    if (Idx != ~0U && isa<PHINode>(Inst)) @@ -365,9 +365,9 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,  /// into an instruction itself.  void ConstantHoisting::collectConstantCandidates(Function &Fn) {    ConstCandMapType ConstCandMap; -  for (Function::iterator BB : Fn) -    for (BasicBlock::iterator Inst : *BB) -      collectConstantCandidates(ConstCandMap, Inst); +  for (BasicBlock &BB : Fn) +    for (Instruction &Inst : BB) +      collectConstantCandidates(ConstCandMap, &Inst);  }  /// \brief Find the base constant within the given range and rebase all other diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 79624b2e4c47..686bd4071104 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LazyValueInfo.h"  #include "llvm/IR/CFG.h" @@ -32,6 +33,7 @@ STATISTIC(NumPhis,      "Number of phis propagated");  STATISTIC(NumSelects,   "Number of selects propagated");  STATISTIC(NumMemAccess, "Number of memory access targets propagated");  STATISTIC(NumCmps,      "Number of comparisons propagated"); +STATISTIC(NumReturns,   "Number of return values propagated");  STATISTIC(NumDeadCases, "Number of switch cases removed");  namespace { @@ -43,6 +45,11 @@ namespace {      bool processMemAccess(Instruction *I);      bool processCmp(CmpInst *C);      bool processSwitch(SwitchInst *SI); +    bool processCallSite(CallSite CS); + +    /// Return a constant value for V usable at At and everything it +    /// dominates.  If no such Constant can be found, return nullptr. +    Constant *getConstantAt(Value *V, Instruction *At);    public:      static char ID; @@ -54,6 +61,7 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<LazyValueInfo>(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }    };  } @@ -178,44 +186,33 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {    return true;  } -/// processCmp - If the value of this comparison could be determined locally, -/// constant propagation would already have figured it out.  Instead, walk -/// the predecessors and statically evaluate the comparison based on information -/// available on that edge.  If a given static evaluation is true on ALL -/// incoming edges, then it's true universally and we can simplify the compare. +/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, +/// or range information is sufficient to prove this comparison.  Even for +/// local conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information.  
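
The rewritten processCmp below leans on LazyValueInfo::getPredicateAt instead of walking predecessor edges by hand, so the facts it proves are range-based rather than purely syntactic. A source-level illustration of the kind of comparison this can settle (not from the patch, and whether a given pipeline folds this exact case is not guaranteed):

// Inside the guarded block, LVI knows x is in [11, INT_MAX] from the edge
// condition, so the inner comparison is provably true even though x itself
// is not a constant.
int clamped(int x) {
  if (x > 10) {
    if (x > 5)      // deducible from the dominating condition x > 10
      return x;
  }
  return 0;
}
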
bool CorrelatedValuePropagation::processCmp(CmpInst *C) {    Value *Op0 = C->getOperand(0); -  if (isa<Instruction>(Op0) && -      cast<Instruction>(Op0)->getParent() == C->getParent()) -    return false; -    Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));    if (!Op1) return false; -  pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); -  if (PI == PE) return false; +  // As a policy choice, we choose not to waste compile time on anything where +  // the comparison is testing local values.  While LVI can sometimes reason +  // about such cases, it's not its primary purpose.  We do make sure to do +  // the block local query for uses from terminator instructions, but that's +  // handled in the code for each terminator. +  auto *I = dyn_cast<Instruction>(Op0); +  if (I && I->getParent() == C->getParent()) +    return false; -  LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), -                                    C->getOperand(0), Op1, *PI, -                                    C->getParent(), C); +  LazyValueInfo::Tristate Result = +    LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);    if (Result == LazyValueInfo::Unknown) return false; -  ++PI; -  while (PI != PE) { -    LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), -                                    C->getOperand(0), Op1, *PI, -                                    C->getParent(), C); -    if (Res != Result) return false; -    ++PI; -  } -    ++NumCmps; -    if (Result == LazyValueInfo::True)      C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));    else      C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); -    C->eraseFromParent();    return true; @@ -307,6 +304,59 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {    return Changed;  } +/// processCallSite - Infer nonnull attributes for the arguments at the +/// specified callsite. +bool CorrelatedValuePropagation::processCallSite(CallSite CS) { +  SmallVector<unsigned, 4> Indices; +  unsigned ArgNo = 0; + +  for (Value *V : CS.args()) { +    PointerType *Type = dyn_cast<PointerType>(V->getType()); + +    if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && +        LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, +                            ConstantPointerNull::get(Type), +                            CS.getInstruction()) == LazyValueInfo::False) +      Indices.push_back(ArgNo + 1); +    ArgNo++; +  } + +  assert(ArgNo == CS.arg_size() && "sanity check"); + +  if (Indices.empty()) +    return false; + +  AttributeSet AS = CS.getAttributes(); +  LLVMContext &Ctx = CS.getInstruction()->getContext(); +  AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); +  CS.setAttributes(AS); + +  return true; +} + +Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { +  if (Constant *C = LVI->getConstant(V, At->getParent(), At)) +    return C; + +  // TODO: The following really should be sunk inside LVI's core algorithm, or +  // at least the outer shims around such. +  auto *C = dyn_cast<CmpInst>(V); +  if (!C) return nullptr; + +  Value *Op0 = C->getOperand(0); +  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); +  if (!Op1) return nullptr; +   +  LazyValueInfo::Tristate Result = +    LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At); +  if (Result == LazyValueInfo::Unknown) +    return nullptr; +   +  return (Result == LazyValueInfo::True) ? 
+    ConstantInt::getTrue(C->getContext()) : +    ConstantInt::getFalse(C->getContext()); +} +  bool CorrelatedValuePropagation::runOnFunction(Function &F) {    if (skipOptnoneFunction(F))      return false; @@ -318,7 +368,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {    for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {      bool BBChanged = false;      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { -      Instruction *II = BI++; +      Instruction *II = &*BI++;        switch (II->getOpcode()) {        case Instruction::Select:          BBChanged |= processSelect(cast<SelectInst>(II)); @@ -334,6 +384,10 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {        case Instruction::Store:          BBChanged |= processMemAccess(II);          break; +      case Instruction::Call: +      case Instruction::Invoke: +        BBChanged |= processCallSite(CallSite(II)); +        break;        }      } @@ -342,7 +396,21 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {      case Instruction::Switch:        BBChanged |= processSwitch(cast<SwitchInst>(Term));        break; +    case Instruction::Ret: { +      auto *RI = cast<ReturnInst>(Term); +      // Try to determine the return value if we can.  This is mainly here to +      // simplify the writing of unit tests, but also helps to enable IPO by +      // constant folding the return values of callees. +      auto *RetVal = RI->getReturnValue(); +      if (!RetVal) break; // handle "ret void" +      if (isa<Constant>(RetVal)) break; // nothing to do +      if (auto *C = getConstantAt(RetVal, RI)) { +        ++NumReturns; +        RI->replaceUsesOfWith(RetVal, C); +        BBChanged = true;         +      }      } +    };      FnChanged |= BBChanged;    } diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 3b262a23091f..b67c3c7742fd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -17,6 +17,7 @@  //===----------------------------------------------------------------------===//  #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/IR/InstIterator.h"  #include "llvm/IR/Instruction.h" @@ -46,7 +47,7 @@ namespace {        TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;        bool Changed = false;        for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { -        Instruction *Inst = DI++; +        Instruction *Inst = &*DI++;          if (isInstructionTriviallyDead(Inst, TLI)) {            Inst->eraseFromParent();            Changed = true; @@ -92,6 +93,34 @@ namespace {  char DCE::ID = 0;  INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) +static bool DCEInstruction(Instruction *I, +                           SmallSetVector<Instruction *, 16> &WorkList, +                           const TargetLibraryInfo *TLI) { +  if (isInstructionTriviallyDead(I, TLI)) { +    // Null out all of the instruction's operands to see if any operand becomes +    // dead as we go. +    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { +      Value *OpV = I->getOperand(i); +      I->setOperand(i, nullptr); + +      if (!OpV->use_empty() || I == OpV) +        continue; + +      // If the operand is an instruction that became dead as we nulled out the +      // operand, and if it is 'trivially' dead, delete it in a future loop +      // iteration. 
+      if (Instruction *OpI = dyn_cast<Instruction>(OpV)) +        if (isInstructionTriviallyDead(OpI, TLI)) +          WorkList.insert(OpI); +    } + +    I->eraseFromParent(); +    ++DCEEliminated; +    return true; +  } +  return false; +} +  bool DCE::runOnFunction(Function &F) {    if (skipOptnoneFunction(F))      return false; @@ -99,39 +128,24 @@ bool DCE::runOnFunction(Function &F) {    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();    TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; -  // Start out with all of the instructions in the worklist... -  std::vector<Instruction*> WorkList; -  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) -    WorkList.push_back(&*i); - -  // Loop over the worklist finding instructions that are dead.  If they are -  // dead make them drop all of their uses, making other instructions -  // potentially dead, and work until the worklist is empty. -  //    bool MadeChange = false; +  SmallSetVector<Instruction *, 16> WorkList; +  // Iterate over the original function, only adding insts to the worklist +  // if they actually need to be revisited. This avoids having to pre-init +  // the worklist with the entire function's worth of instructions. +  for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) { +    Instruction *I = &*FI; +    ++FI; + +    // We're visiting this instruction now, so make sure it's not in the +    // worklist from an earlier visit. +    if (!WorkList.count(I)) +      MadeChange |= DCEInstruction(I, WorkList, TLI); +  } +    while (!WorkList.empty()) { -    Instruction *I = WorkList.back(); -    WorkList.pop_back(); - -    if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. -      // Loop over all of the values that the instruction uses, if there are -      // instructions being used, add them to the worklist, because they might -      // go dead after this one is removed. -      // -      for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) -        if (Instruction *Used = dyn_cast<Instruction>(*OI)) -          WorkList.push_back(Used); - -      // Remove the instruction. -      I->eraseFromParent(); - -      // Remove the instruction from the worklist if it still exists in it. 
-      WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), -                     WorkList.end()); - -      MadeChange = true; -      ++DCEEliminated; -    } +    Instruction *I = WorkList.pop_back_val(); +    MadeChange |= DCEInstruction(I, WorkList, TLI);    }    return MadeChange;  } diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index c50558434da2..36ad0a5f7b91 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -21,6 +21,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/MemoryDependenceAnalysis.h"  #include "llvm/Analysis/TargetLibraryInfo.h" @@ -40,6 +41,7 @@ using namespace llvm;  #define DEBUG_TYPE "dse" +STATISTIC(NumRedundantStores, "Number of redundant stores deleted");  STATISTIC(NumFastStores, "Number of stores deleted");  STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -59,23 +61,24 @@ namespace {        if (skipOptnoneFunction(F))          return false; -      AA = &getAnalysis<AliasAnalysis>(); +      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();        MD = &getAnalysis<MemoryDependenceAnalysis>();        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -      TLI = AA->getTargetLibraryInfo(); +      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();        bool Changed = false; -      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) +      for (BasicBlock &I : F)          // Only check non-dead blocks.  Dead blocks may have strange pointer          // cycles that will confuse alias analysis. 
-        if (DT->isReachableFromEntry(I)) -          Changed |= runOnBasicBlock(*I); +        if (DT->isReachableFromEntry(&I)) +          Changed |= runOnBasicBlock(I);        AA = nullptr; MD = nullptr; DT = nullptr;        return Changed;      }      bool runOnBasicBlock(BasicBlock &BB); +    bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI);      bool HandleFree(CallInst *F);      bool handleEndBlock(BasicBlock &BB);      void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, @@ -85,10 +88,11 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.setPreservesCFG();        AU.addRequired<DominatorTreeWrapperPass>(); -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addRequired<MemoryDependenceAnalysis>(); -      AU.addPreserved<AliasAnalysis>(); +      AU.addRequired<TargetLibraryInfoWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>();        AU.addPreserved<MemoryDependenceAnalysis>();      }    }; @@ -97,8 +101,10 @@ namespace {  char DSE::ID = 0;  INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)  FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -115,7 +121,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }  ///  static void DeleteDeadInstruction(Instruction *I,                                 MemoryDependenceAnalysis &MD, -                               const TargetLibraryInfo *TLI, +                               const TargetLibraryInfo &TLI,                                 SmallSetVector<Value*, 16> *ValueSet = nullptr) {    SmallVector<Instruction*, 32> NowDeadInsts; @@ -140,7 +146,7 @@ static void DeleteDeadInstruction(Instruction *I,        if (!Op->use_empty()) continue;        if (Instruction *OpI = dyn_cast<Instruction>(Op)) -        if (isInstructionTriviallyDead(OpI, TLI)) +        if (isInstructionTriviallyDead(OpI, &TLI))            NowDeadInsts.push_back(OpI);      } @@ -153,7 +159,7 @@ static void DeleteDeadInstruction(Instruction *I,  /// hasMemoryWrite - Does this instruction write some memory?  This only returns  /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {    if (isa<StoreInst>(I))      return true;    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -170,20 +176,20 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {    }    if (auto CS = CallSite(I)) {      if (Function *F = CS.getCalledFunction()) { -      if (TLI && TLI->has(LibFunc::strcpy) && -          F->getName() == TLI->getName(LibFunc::strcpy)) { +      if (TLI.has(LibFunc::strcpy) && +          F->getName() == TLI.getName(LibFunc::strcpy)) {          return true;        } -      if (TLI && TLI->has(LibFunc::strncpy) && -          F->getName() == TLI->getName(LibFunc::strncpy)) { +      if (TLI.has(LibFunc::strncpy) && +          F->getName() == TLI.getName(LibFunc::strncpy)) {          return true;        } -      if (TLI && TLI->has(LibFunc::strcat) && -          F->getName() == TLI->getName(LibFunc::strcat)) { +      if (TLI.has(LibFunc::strcat) && +          F->getName() == TLI.getName(LibFunc::strcat)) {          return true;        } -      if (TLI && TLI->has(LibFunc::strncat) && -          F->getName() == TLI->getName(LibFunc::strncat)) { +      if (TLI.has(LibFunc::strncat) && +          F->getName() == TLI.getName(LibFunc::strncat)) {          return true;        }      } @@ -224,9 +230,9 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {  /// getLocForRead - Return the location read by the specified "hasMemoryWrite"  /// instruction if any. -static MemoryLocation getLocForRead(Instruction *Inst, AliasAnalysis &AA) { -  assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && -         "Unknown instruction case"); +static MemoryLocation getLocForRead(Instruction *Inst, +                                    const TargetLibraryInfo &TLI) { +  assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");    // The only instructions that both read and write are the mem transfer    // instructions (memcpy/memmove). @@ -313,9 +319,9 @@ static Value *getStoredPointerOperand(Instruction *I) {  }  static uint64_t getPointerSize(const Value *V, const DataLayout &DL, -                               const TargetLibraryInfo *TLI) { +                               const TargetLibraryInfo &TLI) {    uint64_t Size; -  if (getObjectSize(V, Size, DL, TLI)) +  if (getObjectSize(V, Size, DL, &TLI))      return Size;    return MemoryLocation::UnknownSize;  } @@ -336,7 +342,7 @@ namespace {  static OverwriteResult isOverwrite(const MemoryLocation &Later,                                     const MemoryLocation &Earlier,                                     const DataLayout &DL, -                                   const TargetLibraryInfo *TLI, +                                   const TargetLibraryInfo &TLI,                                     int64_t &EarlierOff, int64_t &LaterOff) {    const Value *P1 = Earlier.Ptr->stripPointerCasts();    const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -442,10 +448,12 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,  /// because the DSE inducing instruction may be a self-read.  
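
The self-read situation guarded against here is easiest to see at the source level. In the sketch below (illustrative C, not code from the patch), the second memcpy completely overwrites what the first one stored, but it also reads B; the earlier copy can only be dropped because both copies read from the very same B, the must-alias case the following function checks for:

#include <string.h>

void copy_twice(char *A, const char *B, size_t n) {
  memcpy(A, B, n); // candidate dead store: fully overwritten below
  memcpy(A, B, n); // also *reads* B, so DSE must prove that read is unaffected
}

The hazard arises when the later instruction's read might observe memory the earlier store wrote; the must-alias check on the two read locations is what lets DSE conclude the read sees the same bytes whether or not the earlier copy is removed.
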
static bool isPossibleSelfRead(Instruction *Inst,                                 const MemoryLocation &InstStoreLoc, -                               Instruction *DepWrite, AliasAnalysis &AA) { +                               Instruction *DepWrite, +                               const TargetLibraryInfo &TLI, +                               AliasAnalysis &AA) {    // Self reads can only happen for instructions that read memory.  Get the    // location read. -  MemoryLocation InstReadLoc = getLocForRead(Inst, AA); +  MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);    if (!InstReadLoc.Ptr) return false;  // Not a reading instruction.    // If the read and written loc obviously don't alias, it isn't a read. @@ -459,7 +467,7 @@ static bool isPossibleSelfRead(Instruction *Inst,    // Here we don't know if A/B may alias, but we do know that B/B are must    // aliases, so removing the first memcpy is safe (assuming it writes <= #    // bytes as the second one. -  MemoryLocation DepReadLoc = getLocForRead(DepWrite, AA); +  MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);    if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))      return false; @@ -475,11 +483,12 @@ static bool isPossibleSelfRead(Instruction *Inst,  //===----------------------------------------------------------------------===//  bool DSE::runOnBasicBlock(BasicBlock &BB) { +  const DataLayout &DL = BB.getModule()->getDataLayout();    bool MadeChange = false;    // Do a top-down walk on the BB.    for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { -    Instruction *Inst = BBI++; +    Instruction *Inst = &*BBI++;      // Handle 'free' calls specially.      if (CallInst *F = isFreeCall(Inst, TLI)) { @@ -488,42 +497,68 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {      }      // If we find something that writes memory, get its memory dependence. -    if (!hasMemoryWrite(Inst, TLI)) -      continue; - -    MemDepResult InstDep = MD->getDependency(Inst); - -    // Ignore any store where we can't find a local dependence. -    // FIXME: cross-block DSE would be fun. :) -    if (!InstDep.isDef() && !InstDep.isClobber()) +    if (!hasMemoryWrite(Inst, *TLI))        continue;      // If we're storing the same value back to a pointer that we just      // loaded from, then the store can be removed.      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { -      if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + +      auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { +        // DeleteDeadInstruction can delete the current instruction.  Save BBI +        // in case we need it. +        WeakVH NextInst(&*BBI); + +        DeleteDeadInstruction(DeadInst, *MD, *TLI); + +        if (!NextInst) // Next instruction deleted. +          BBI = BB.begin(); +        else if (BBI != BB.begin()) // Revisit this instruction if possible. +          --BBI; +        ++NumRedundantStores; +        MadeChange = true; +      }; + +      if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {          if (SI->getPointerOperand() == DepLoad->getPointerOperand() && -            SI->getOperand(0) == DepLoad && isRemovable(SI)) { +            isRemovable(SI) && +            MemoryIsNotModifiedBetween(DepLoad, SI)) { +            DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n  "                         << "LOAD: " << *DepLoad << "\n  STORE: " << *SI << '\n'); -          // DeleteDeadInstruction can delete the current instruction.  
Save BBI -          // in case we need it. -          WeakVH NextInst(BBI); +          RemoveDeadInstAndUpdateBBI(SI); +          continue; +        } +      } -          DeleteDeadInstruction(SI, *MD, TLI); +      // Remove null stores into the calloc'ed objects +      Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); -          if (!NextInst)  // Next instruction deleted. -            BBI = BB.begin(); -          else if (BBI != BB.begin())  // Revisit this instruction if possible. -            --BBI; -          ++NumFastStores; -          MadeChange = true; +      if (StoredConstant && StoredConstant->isNullValue() && +          isRemovable(SI)) { +        Instruction *UnderlyingPointer = dyn_cast<Instruction>( +            GetUnderlyingObject(SI->getPointerOperand(), DL)); + +        if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && +            MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { +          DEBUG(dbgs() +                << "DSE: Remove null store to the calloc'ed object:\n  DEAD: " +                << *Inst << "\n  OBJECT: " << *UnderlyingPointer << '\n'); + +          RemoveDeadInstAndUpdateBBI(SI);            continue;          }        }      } +    MemDepResult InstDep = MD->getDependency(Inst); + +    // Ignore any store where we can't find a local dependence. +    // FIXME: cross-block DSE would be fun. :) +    if (!InstDep.isDef() && !InstDep.isClobber()) +      continue; +      // Figure out what location is being stored to.      MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -549,24 +584,22 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {        // completely obliterated by the store to 'Loc', and c) which we know that        // 'Inst' doesn't load from, then we can remove it.        if (isRemovable(DepWrite) && -          !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { +          !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {          int64_t InstWriteOffset, DepWriteOffset; -        const DataLayout &DL = BB.getModule()->getDataLayout();          OverwriteResult OR = -            isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), -                        DepWriteOffset, InstWriteOffset); +            isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);          if (OR == OverwriteComplete) {            DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "                  << *DepWrite << "\n  KILLER: " << *Inst << '\n');            // Delete the store and now-dead instructions that feed it. -          DeleteDeadInstruction(DepWrite, *MD, TLI); +          DeleteDeadInstruction(DepWrite, *MD, *TLI);            ++NumFastStores;            MadeChange = true;            // DeleteDeadInstruction can delete the current instruction in loop            // cases, reset BBI. -          BBI = Inst; +          BBI = Inst->getIterator();            if (BBI != BB.begin())              --BBI;            break; @@ -609,10 +642,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {        if (DepWrite == &BB.front()) break;        // Can't look past this instruction if it might read 'Loc'. 
-      if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) +      if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)          break; -      InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); +      InstDep = MD->getPointerDependencyFrom(Loc, false, +                                             DepWrite->getIterator(), &BB);      }    } @@ -624,6 +658,64 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {    return MadeChange;  } +/// Returns true if the memory which is accessed by the second instruction is not +/// modified between the first and the second instruction. +/// Precondition: Second instruction must be dominated by the first +/// instruction. +bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, +                                     Instruction *SecondI) { +  SmallVector<BasicBlock *, 16> WorkList; +  SmallPtrSet<BasicBlock *, 8> Visited; +  BasicBlock::iterator FirstBBI(FirstI); +  ++FirstBBI; +  BasicBlock::iterator SecondBBI(SecondI); +  BasicBlock *FirstBB = FirstI->getParent(); +  BasicBlock *SecondBB = SecondI->getParent(); +  MemoryLocation MemLoc = MemoryLocation::get(SecondI); + +  // Start checking the store-block. +  WorkList.push_back(SecondBB); +  bool isFirstBlock = true; + +  // Check all blocks going backward until we reach the load-block. +  while (!WorkList.empty()) { +    BasicBlock *B = WorkList.pop_back_val(); + +    // Ignore instructions before LI if this is the FirstBB. +    BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin()); + +    BasicBlock::iterator EI; +    if (isFirstBlock) { +      // Ignore instructions after SI if this is the first visit of SecondBB. +      assert(B == SecondBB && "first block is not the store block"); +      EI = SecondBBI; +      isFirstBlock = false; +    } else { +      // It's not SecondBB or (in case of a loop) the second visit of SecondBB. +      // In this case we also have to look at instructions after SI. +      EI = B->end(); +    } +    for (; BI != EI; ++BI) { +      Instruction *I = &*BI; +      if (I->mayWriteToMemory() && I != SecondI) { +        auto Res = AA->getModRefInfo(I, MemLoc); +        if (Res != MRI_NoModRef) +          return false; +      } +    } +    if (B != FirstBB) { +      assert(B != &FirstBB->getParent()->getEntryBlock() && +          "Should not hit the entry block because SI must be dominated by LI"); +      for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { +        if (!Visited.insert(*PredI).second) +          continue; +        WorkList.push_back(*PredI); +      } +    } +  } +  return true; +} +  /// Find all blocks that will unconditionally lead to the block BB and append  /// them to F.  
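For orientation, the new MemoryIsNotModifiedBetween helper above walks the CFG backward from the second instruction toward the first, scanning every backward-reachable instruction for a possible clobber of the queried location. The following is a compact standalone sketch of that walk; the Block struct, the index-based instruction representation, and the MayModify callback are illustrative stand-ins for this note only (the real helper queries AA->getModRefInfo against a MemoryLocation) and are not part of this change.

#include <unordered_set>
#include <vector>

// Toy CFG used only for illustration: a block holds instruction ids in
// program order plus its predecessor blocks.
struct Block {
  std::vector<int> Insts;
  std::vector<Block *> Preds;
};

// Sketch of the helper: returns true if no instruction on any path between
// First (index FirstIdx in FirstBB) and Second (index SecondIdx in SecondBB)
// may modify the memory that Second accesses.  Assumes FirstBB dominates
// SecondBB, mirroring the precondition of the real helper.
bool memoryNotModifiedBetween(Block *FirstBB, unsigned FirstIdx,
                              Block *SecondBB, unsigned SecondIdx,
                              bool (*MayModify)(int InstId)) {
  std::vector<Block *> Worklist{SecondBB};
  std::unordered_set<Block *> Visited;
  bool FirstVisitOfSecondBB = true;

  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();

    // Skip everything up to and including First when scanning its block.
    unsigned Begin = (B == FirstBB) ? FirstIdx + 1 : 0;
    // Stop before Second only on the first visit of its block; if a loop
    // re-reaches SecondBB, the whole block has to be scanned.
    unsigned End = FirstVisitOfSecondBB ? SecondIdx
                                        : (unsigned)B->Insts.size();
    FirstVisitOfSecondBB = false;

    for (unsigned I = Begin; I < End; ++I)
      if (MayModify(B->Insts[I]))
        return false;              // a potential clobber sits between the two

    // Keep walking backward through predecessors until FirstBB is reached.
    if (B != FirstBB)
      for (Block *P : B->Preds)
        if (Visited.insert(P).second)
          Worklist.push_back(P);
  }
  return true;
}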
static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, @@ -655,10 +747,11 @@ bool DSE::HandleFree(CallInst *F) {      Instruction *InstPt = BB->getTerminator();      if (BB == F->getParent()) InstPt = F; -    MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); +    MemDepResult Dep = +        MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);      while (Dep.isDef() || Dep.isClobber()) {        Instruction *Dependency = Dep.getInst(); -      if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) +      if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))          break;        Value *DepPointer = @@ -668,10 +761,10 @@ bool DSE::HandleFree(CallInst *F) {        if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))          break; -      Instruction *Next = std::next(BasicBlock::iterator(Dependency)); +      auto Next = ++Dependency->getIterator();        // DCE instructions only used to calculate that store -      DeleteDeadInstruction(Dependency, *MD, TLI); +      DeleteDeadInstruction(Dependency, *MD, *TLI);        ++NumFastStores;        MadeChange = true; @@ -704,23 +797,22 @@ bool DSE::handleEndBlock(BasicBlock &BB) {    SmallSetVector<Value*, 16> DeadStackObjects;    // Find all of the alloca'd pointers in the entry block. -  BasicBlock *Entry = BB.getParent()->begin(); -  for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { -    if (isa<AllocaInst>(I)) -      DeadStackObjects.insert(I); +  BasicBlock &Entry = BB.getParent()->front(); +  for (Instruction &I : Entry) { +    if (isa<AllocaInst>(&I)) +      DeadStackObjects.insert(&I);      // Okay, so these are dead heap objects, but if the pointer never escapes      // then it's leaked by this function anyways. -    else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) -      DeadStackObjects.insert(I); +    else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true)) +      DeadStackObjects.insert(&I);    }    // Treat byval or inalloca arguments the same, stores to them are dead at the    // end of the function. -  for (Function::arg_iterator AI = BB.getParent()->arg_begin(), -       AE = BB.getParent()->arg_end(); AI != AE; ++AI) -    if (AI->hasByValOrInAllocaAttr()) -      DeadStackObjects.insert(AI); +  for (Argument &AI : BB.getParent()->args()) +    if (AI.hasByValOrInAllocaAttr()) +      DeadStackObjects.insert(&AI);    const DataLayout &DL = BB.getModule()->getDataLayout(); @@ -729,10 +821,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) {      --BBI;      // If we find a store, check to see if it points into a dead stack value. -    if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { +    if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {        // See through pointer-to-pointer bitcasts        SmallVector<Value *, 4> Pointers; -      GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); +      GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);        // Stores to stack values are valid candidates for removal.        
bool AllDead = true; @@ -744,7 +836,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {          }        if (AllDead) { -        Instruction *Dead = BBI++; +        Instruction *Dead = &*BBI++;          DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n  DEAD: "                       << *Dead << "\n  Objects: "; @@ -757,7 +849,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {                dbgs() << '\n');          // DCE instructions only used to calculate that store. -        DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); +        DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects);          ++NumFastStores;          MadeChange = true;          continue; @@ -765,9 +857,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {      }      // Remove any dead non-memory-mutating instructions. -    if (isInstructionTriviallyDead(BBI, TLI)) { -      Instruction *Inst = BBI++; -      DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); +    if (isInstructionTriviallyDead(&*BBI, TLI)) { +      Instruction *Inst = &*BBI++; +      DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects);        ++NumFastOther;        MadeChange = true;        continue; @@ -776,15 +868,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) {      if (isa<AllocaInst>(BBI)) {        // Remove allocas from the list of dead stack objects; there can't be        // any references before the definition. -      DeadStackObjects.remove(BBI); +      DeadStackObjects.remove(&*BBI);        continue;      } -    if (auto CS = CallSite(BBI)) { +    if (auto CS = CallSite(&*BBI)) {        // Remove allocation function calls from the list of dead stack objects;         // there can't be any references before the definition. -      if (isAllocLikeFn(BBI, TLI)) -        DeadStackObjects.remove(BBI); +      if (isAllocLikeFn(&*BBI, TLI)) +        DeadStackObjects.remove(&*BBI);        // If this call does not access memory, it can't be loading any of our        // pointers. @@ -795,10 +887,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {        // the call is live.        DeadStackObjects.remove_if([&](Value *I) {          // See if the call site touches the value. -        AliasAnalysis::ModRefResult A = AA->getModRefInfo( -            CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); +        ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); -        return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; +        return A == MRI_ModRef || A == MRI_Ref;        });        // If all of the allocas were clobbered by the call then we're not going @@ -864,8 +955,7 @@ void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc,    // Remove objects that could alias LoadedLoc.    DeadStackObjects.remove_if([&](Value *I) {      // See if the loaded location could alias the stack location. 
-    MemoryLocation StackLoc(I, -                            getPointerSize(I, DL, AA->getTargetLibraryInfo())); +    MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));      return !AA->isNoAlias(StackLoc, LoadedLoc);    });  } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 029b44c2ea80..7ef062e71ff3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@  #include "llvm/ADT/Hashing.h"  #include "llvm/ADT/ScopedHashTable.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,7 +264,6 @@ namespace {  /// expected that a later pass of GVN will catch the interesting/hard cases.  class EarlyCSE {  public: -  Function &F;    const TargetLibraryInfo &TLI;    const TargetTransformInfo &TTI;    DominatorTree &DT; @@ -281,20 +281,37 @@ public:    /// that dominated values can succeed in their lookup.    ScopedHTType AvailableValues; -  /// \brief A scoped hash table of the current values of loads. +  /// A scoped hash table of the current values of previously encountered memory +  /// locations.    /// -  /// This allows us to get efficient access to dominating loads when we have -  /// a fully redundant load.  In addition to the most recent load, we keep -  /// track of a generation count of the read, which is compared against the -  /// current generation count.  The current generation count is incremented +  /// This allows us to get efficient access to dominating loads or stores when +  /// we have a fully redundant load.  In addition to the most recent load, we +  /// keep track of a generation count of the read, which is compared against +  /// the current generation count.  The current generation count is incremented    /// after every possibly writing memory operation, which ensures that we only -  /// CSE loads with other loads that have no intervening store. +  /// CSE loads with other loads that have no intervening store.  Ordering +  /// events (such as fences or atomic instructions) increment the generation +  /// count as well; essentially, we model these as writes to all possible +  /// locations.  Note that atomic and/or volatile loads and stores can be +  /// present in the table; it is the responsibility of the consumer to inspect +  /// the atomicity/volatility if needed. 
+  struct LoadValue { +    Value *Data; +    unsigned Generation; +    int MatchingId; +    bool IsAtomic; +    LoadValue() +      : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} +    LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, +              bool IsAtomic) +      : Data(Data), Generation(Generation), MatchingId(MatchingId), +        IsAtomic(IsAtomic) {} +  }; +  typedef RecyclingAllocator<BumpPtrAllocator, +                             ScopedHashTableVal<Value *, LoadValue>>        LoadMapAllocator; -  typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, -                          DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; +  typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, +                          LoadMapAllocator> LoadHTType;    LoadHTType AvailableLoads;    /// \brief A scoped hash table of the current values of read-only call @@ -308,10 +325,9 @@ public:    unsigned CurrentGeneration;    /// \brief Set up the EarlyCSE runner for a particular function. -  EarlyCSE(Function &F, const TargetLibraryInfo &TLI, -           const TargetTransformInfo &TTI, DominatorTree &DT, -           AssumptionCache &AC) -      : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} +  EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, +           DominatorTree &DT, AssumptionCache &AC) +      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}    bool run(); @@ -382,57 +398,91 @@ private:    class ParseMemoryInst {    public:      ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) -        : Load(false), Store(false), Vol(false), MayReadFromMemory(false), -          MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { -      MayReadFromMemory = Inst->mayReadFromMemory(); -      MayWriteToMemory = Inst->mayWriteToMemory(); -      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { -        MemIntrinsicInfo Info; -        if (!TTI.getTgtMemIntrinsic(II, Info)) -          return; -        if (Info.NumMemRefs == 1) { -          Store = Info.WriteMem; -          Load = Info.ReadMem; -          MatchingId = Info.MatchingId; -          MayReadFromMemory = Info.ReadMem; -          MayWriteToMemory = Info.WriteMem; -          Vol = Info.Vol; -          Ptr = Info.PtrVal; -        } -      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { -        Load = true; -        Vol = !LI->isSimple(); -        Ptr = LI->getPointerOperand(); +      : IsTargetMemInst(false), Inst(Inst) { +      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) +        if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) +          IsTargetMemInst = true; +    } +    bool isLoad() const { +      if (IsTargetMemInst) return Info.ReadMem; +      return isa<LoadInst>(Inst); +    } +    bool isStore() const { +      if (IsTargetMemInst) return Info.WriteMem; +      return isa<StoreInst>(Inst); +    } +    bool isAtomic() const { +      if (IsTargetMemInst) { +        assert(Info.IsSimple && "need to refine IsSimple in TTI"); +        return false; +      } +      return Inst->isAtomic(); +    } +    bool isUnordered() const { +      if (IsTargetMemInst) { +        assert(Info.IsSimple && "need to refine IsSimple in TTI"); +        return true; +      } +      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { +        return LI->isUnordered(); +      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { +        return SI->isUnordered(); +      } +      // Conservative answer +      
return !Inst->isAtomic(); +    } + +    bool isVolatile() const { +      if (IsTargetMemInst) { +        assert(Info.IsSimple && "need to refine IsSimple in TTI"); +        return false; +      } +      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { +        return LI->isVolatile();        } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { -        Store = true; -        Vol = !SI->isSimple(); -        Ptr = SI->getPointerOperand(); +        return SI->isVolatile();        } +      // Conservative answer +      return true;      } -    bool isLoad() { return Load; } -    bool isStore() { return Store; } -    bool isVolatile() { return Vol; } -    bool isMatchingMemLoc(const ParseMemoryInst &Inst) { -      return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + +     +    bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { +      return (getPointerOperand() == Inst.getPointerOperand() && +              getMatchingId() == Inst.getMatchingId());      } -    bool isValid() { return Ptr != nullptr; } -    int getMatchingId() { return MatchingId; } -    Value *getPtr() { return Ptr; } -    bool mayReadFromMemory() { return MayReadFromMemory; } -    bool mayWriteToMemory() { return MayWriteToMemory; } +    bool isValid() const { return getPointerOperand() != nullptr; } -  private: -    bool Load; -    bool Store; -    bool Vol; -    bool MayReadFromMemory; -    bool MayWriteToMemory;      // For regular (non-intrinsic) loads/stores, this is set to -1. For      // intrinsic loads/stores, the id is retrieved from the corresponding      // field in the MemIntrinsicInfo structure.  That field contains      // non-negative values only. -    int MatchingId; -    Value *Ptr; +    int getMatchingId() const { +      if (IsTargetMemInst) return Info.MatchingId; +      return -1; +    } +    Value *getPointerOperand() const { +      if (IsTargetMemInst) return Info.PtrVal; +      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { +        return LI->getPointerOperand(); +      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { +        return SI->getPointerOperand(); +      } +      return nullptr; +    } +    bool mayReadFromMemory() const { +      if (IsTargetMemInst) return Info.ReadMem; +      return Inst->mayReadFromMemory(); +    } +    bool mayWriteToMemory() const { +      if (IsTargetMemInst) return Info.WriteMem; +      return Inst->mayWriteToMemory(); +    } + +  private: +    bool IsTargetMemInst; +    MemIntrinsicInfo Info; +    Instruction *Inst;    };    bool processNode(DomTreeNode *Node); @@ -497,7 +547,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {    // See if any instructions in the block can be eliminated.  If so, do it.  If    // not, add them to AvailableValues.    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { -    Instruction *Inst = I++; +    Instruction *Inst = &*I++;      // Dead instructions should just be removed.      if (isInstructionTriviallyDead(Inst, &TLI)) { @@ -548,24 +598,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {      ParseMemoryInst MemInst(Inst, TTI);      // If this is a non-volatile load, process it.      if (MemInst.isValid() && MemInst.isLoad()) { -      // Ignore volatile loads. -      if (MemInst.isVolatile()) { +      // (conservatively) we can't peek past the ordering implied by this +      // operation, but we can add this load to our set of available values +      if (MemInst.isVolatile() || !MemInst.isUnordered()) {          LastStore = nullptr; -        // Don't CSE across synchronization boundaries. 
-        if (Inst->mayWriteToMemory()) -          ++CurrentGeneration; -        continue; +        ++CurrentGeneration;        }        // If we have an available version of this load, and if it is the right        // generation, replace this instruction. -      std::pair<Value *, unsigned> InVal = -          AvailableLoads.lookup(MemInst.getPtr()); -      if (InVal.first != nullptr && InVal.second == CurrentGeneration) { -        Value *Op = getOrCreateResult(InVal.first, Inst->getType()); +      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); +      if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && +          InVal.MatchingId == MemInst.getMatchingId() && +          // We don't yet handle removing loads with ordering of any kind. +          !MemInst.isVolatile() && MemInst.isUnordered() && +          // We can't replace an atomic load with one which isn't also atomic. +          InVal.IsAtomic >= MemInst.isAtomic()) { +        Value *Op = getOrCreateResult(InVal.Data, Inst->getType());          if (Op != nullptr) {            DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst -                       << "  to: " << *InVal.first << '\n'); +                       << "  to: " << *InVal.Data << '\n');            if (!Inst->use_empty())              Inst->replaceAllUsesWith(Op);            Inst->eraseFromParent(); @@ -576,8 +628,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {        }        // Otherwise, remember that we have this instruction. -      AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( -                                                  Inst, CurrentGeneration)); +      AvailableLoads.insert( +          MemInst.getPointerOperand(), +          LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), +                    MemInst.isAtomic()));        LastStore = nullptr;        continue;      } @@ -613,6 +667,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {        continue;      } +    // A release fence requires that all stores complete before it, but does +    // not prevent the reordering of following loads 'before' the fence.  As a +    // result, we don't need to consider it as writing to memory and don't need +    // to advance the generation.  We do need to prevent DSE across the fence, +    // but that's handled above. +    if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) +      if (FI->getOrdering() == Release) { +        assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above"); +        continue; +      } + +    // write back DSE - If we write back the same value we just loaded from +    // the same location and haven't passed any intervening writes or ordering +    // operations, we can remove the write.  The primary benefit is in allowing +    // the available load table to remain valid and value forward past where +    // the store originally was. +    if (MemInst.isValid() && MemInst.isStore()) { +      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); +      if (InVal.Data && +          InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) && +          InVal.Generation == CurrentGeneration && +          InVal.MatchingId == MemInst.getMatchingId() && +          // We don't yet handle removing stores with ordering of any kind. 
+          !MemInst.isVolatile() && MemInst.isUnordered()) { +        assert((!LastStore || +                ParseMemoryInst(LastStore, TTI).getPointerOperand() == +                MemInst.getPointerOperand()) && +               "can't have an intervening store!"); +        DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); +        Inst->eraseFromParent(); +        Changed = true; +        ++NumDSE; +        // We can avoid incrementing the generation count since we were able +        // to eliminate this store. +        continue; +      } +    } +      // Okay, this isn't something we can CSE at all.  Check to see if it is      // something that could modify memory.  If so, our available memory values      // cannot be used so bump the generation count. @@ -622,8 +714,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {        if (MemInst.isValid() && MemInst.isStore()) {          // We do a trivial form of DSE if there are two stores to the same          // location with no intervening loads.  Delete the earlier store. +        // At the moment, we don't remove ordered stores, but do remove +        // unordered atomic stores.  There's no special requirement (for +        // unordered atomics) about removing atomic stores only in favor of +        // other atomic stores since we're going to execute the non-atomic +        // one anyway and the atomic one might never have become visible.          if (LastStore) {            ParseMemoryInst LastStoreMemInst(LastStore, TTI); +          assert(LastStoreMemInst.isUnordered() && +                 !LastStoreMemInst.isVolatile() && +                 "Violated invariant");            if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {              DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore                           << "  due to: " << *Inst << '\n'); @@ -640,12 +740,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {          // version of the pointer.  It is safe to forward from volatile stores          // to non-volatile loads, so we don't have to check for volatility of          // the store. -        AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( -                                                    Inst, CurrentGeneration)); - -        // Remember that this was the last store we saw for DSE. -        if (!MemInst.isVolatile()) +        AvailableLoads.insert( +            MemInst.getPointerOperand(), +            LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), +                      MemInst.isAtomic())); + +        // Remember that this was the last unordered store we saw for DSE. We +        // don't yet handle DSE on ordered or volatile stores since we don't +        // have a good way to model the ordering requirement for following +        // passes  once the store is removed.  We could insert a fence, but +        // since fences are slightly stronger than stores in their ordering, +        // it's not clear this is a profitable transform. Another option would +        // be to merge the ordering with that of the post dominating store. 
+        if (MemInst.isUnordered() && !MemInst.isVolatile())            LastStore = Inst; +        else +          LastStore = nullptr;        }      }    } @@ -714,7 +824,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,    auto &DT = AM->getResult<DominatorTreeAnalysis>(F);    auto &AC = AM->getResult<AssumptionAnalysis>(F); -  EarlyCSE CSE(F, TLI, TTI, DT, AC); +  EarlyCSE CSE(TLI, TTI, DT, AC);    if (!CSE.run())      return PreservedAnalyses::all(); @@ -751,7 +861,7 @@ public:      auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();      auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); -    EarlyCSE CSE(F, TLI, TTI, DT, AC); +    EarlyCSE CSE(TLI, TTI, DT, AC);      return CSE.run();    } @@ -761,6 +871,7 @@ public:      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<TargetLibraryInfoWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();      AU.setPreservesCFG();    }  }; diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 0430c1898c8d..185cdbdda378 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -30,7 +30,7 @@ public:    bool runOnFunction(Function &F) override;    void getAnalysisUsage(AnalysisUsage &AU) const override { -    AU.addRequired<AliasAnalysis>(); +    AU.addRequired<AAResultsWrapperPass>();    }  private: @@ -41,7 +41,7 @@ private:  char FlattenCFGPass::ID = 0;  INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,                        false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,                      false) @@ -59,7 +59,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {      // Loop over all of the basic blocks and remove them if they are unneeded...      //      for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { -      if (FlattenCFG(BBIt++, AA)) { +      if (FlattenCFG(&*BBIt++, AA)) {          LocalChange = true;        }      } @@ -69,7 +69,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {  }  bool FlattenCFGPass::runOnFunction(Function &F) { -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    bool EverChanged = false;    // iterativelyFlattenCFG can make some blocks dead.    while (iterativelyFlattenCFG(F, AA)) { diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index c9314229c38b..7f5d78656b50 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -19,6 +19,8 @@  #include "llvm/ADT/EquivalenceClasses.h"  #include "llvm/ADT/MapVector.h"  #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/IR/ConstantRange.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/IRBuilder.h" @@ -41,7 +43,7 @@ using namespace llvm;  // integer domain inputs, produce an integer output; fadd, for example.  //  // If a non-mappable instruction is seen, this entire def-use graph is marked -// as non-transformable. If we see an instruction that converts from the  +// as non-transformable. 
If we see an instruction that converts from the  // integer domain to FP domain (uitofp,sitofp), we terminate our walk.  /// The largest integer type worth dealing with. @@ -60,6 +62,7 @@ namespace {      bool runOnFunction(Function &F) override;      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.setPreservesCFG(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }      void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); @@ -82,7 +85,9 @@ namespace {  }  char Float2Int::ID = 0; -INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false)  // Given a FCmp predicate, return a matching ICmp predicate if one  // exists, otherwise return BAD_ICMP_PREDICATE. @@ -125,7 +130,9 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {  // Find the roots - instructions that convert from the FP domain to  // integer domain.  void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { -  for (auto &I : inst_range(F)) { +  for (auto &I : instructions(F)) { +    if (isa<VectorType>(I.getType())) +      continue;      switch (I.getOpcode()) {      default: break;      case Instruction::FPToUI: @@ -133,7 +140,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {        Roots.insert(&I);        break;      case Instruction::FCmp: -      if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=  +      if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=            CmpInst::BAD_ICMP_PREDICATE)          Roots.insert(&I);        break; @@ -176,7 +183,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) {  //   - walkForwards:  Iterate over SeenInsts in reverse order, so we visit  //                     defs before their uses. Calculate the real range info. -// Breadth-first walk of the use-def graph; determine the set of nodes  +// Breadth-first walk of the use-def graph; determine the set of nodes  // we care about and eagerly determine if some of them are poisonous.  void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {    std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); @@ -222,14 +229,14 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {        seen(I, unknownRange());        break;      } -   +      for (Value *O : I->operands()) {        if (Instruction *OI = dyn_cast<Instruction>(O)) {          // Unify def-use chains if they interfere.          ECs.unionSets(I, OI); -	if (SeenInsts.find(I)->second != badRange()) +        if (SeenInsts.find(I)->second != badRange())            Worklist.push_back(OI); -      } else if (!isa<ConstantFP>(O)) {       +      } else if (!isa<ConstantFP>(O)) {          // Not an instruction or ConstantFP? we can't do anything.          seen(I, badRange());        } @@ -240,11 +247,11 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {  // Walk forwards down the list of seen instructions, so we visit defs before  // uses.  
void Float2Int::walkForwards() { -  for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { -    if (It->second != unknownRange()) +  for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { +    if (It.second != unknownRange())        continue; -    Instruction *I = It->first; +    Instruction *I = It.first;      std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;      switch (I->getOpcode()) {        // FIXME: Handle select and phi nodes. @@ -299,7 +306,7 @@ void Float2Int::walkForwards() {      for (Value *O : I->operands()) {        if (Instruction *OI = dyn_cast<Instruction>(O)) {          assert(SeenInsts.find(OI) != SeenInsts.end() && -	       "def not seen before use!"); +               "def not seen before use!");          OpRanges.push_back(SeenInsts.find(OI)->second);        } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {          // Work out if the floating point number can be losslessly represented @@ -314,11 +321,11 @@ void Float2Int::walkForwards() {          APFloat F = CF->getValueAPF();          // First, weed out obviously incorrect values. Non-finite numbers -        // can't be represented and neither can negative zero, unless  +        // can't be represented and neither can negative zero, unless          // we're in fast math mode.          if (!F.isFinite() ||              (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && -	     !I->hasNoSignedZeros())) { +             !I->hasNoSignedZeros())) {            seen(I, badRange());            Abort = true;            break; @@ -345,7 +352,7 @@ void Float2Int::walkForwards() {      // Reduce the operands' ranges to a single range and return.      if (!Abort) -      seen(I, Op(OpRanges));     +      seen(I, Op(OpRanges));    }  } @@ -395,7 +402,7 @@ bool Float2Int::validateAndTransform() {          R.isFullSet() || R.isSignWrappedSet())        continue;      assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); -     +      // The number of bits required is the maximum of the upper and      // lower limits, plus one so it can be signed.      unsigned MinBW = std::max(R.getLower().getMinSignedBits(), @@ -505,9 +512,8 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) {  // Perform dead code elimination on the instructions we just modified.  
void Float2Int::cleanup() { -  for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); -       I != E; ++I) -    I->first->eraseFromParent(); +  for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) +    I.first->eraseFromParent();  }  bool Float2Int::runOnFunction(Function &F) { @@ -534,7 +540,4 @@ bool Float2Int::runOnFunction(Function &F) {    return Modified;  } -FunctionPass *llvm::createFloat2IntPass() { -  return new Float2Int(); -} - +FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 89a0d0af93be..a028b8c444ba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -28,6 +28,7 @@  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/Loads.h"  #include "llvm/Analysis/MemoryBuiltins.h" @@ -128,6 +129,7 @@ namespace {      uint32_t lookup(Value *V) const;      uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred,                                 Value *LHS, Value *RHS); +    bool exists(Value *V) const;      void add(Value *V, uint32_t num);      void clear();      void erase(Value *v); @@ -388,6 +390,9 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) {    }  } +/// Returns true if a value number exists for the specified value. +bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } +  /// lookup_or_add - Returns the value number for the specified value, assigning  /// it a new number if it did not have one before.  uint32_t ValueTable::lookup_or_add(Value *V) { @@ -608,6 +613,10 @@ namespace {      DenseMap<uint32_t, LeaderTableEntry> LeaderTable;      BumpPtrAllocator TableAllocator; +    // Block-local map of equivalent values to their leader, does not +    // propagate to any successors. Entries added mid-block are applied +    // to the remaining instructions in the block. 
+    SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap;      SmallVector<Instruction*, 8> InstrsToErase;      typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; @@ -689,16 +698,17 @@ namespace {        AU.addRequired<TargetLibraryInfoWrapperPass>();        if (!NoLoads)          AU.addRequired<MemoryDependenceAnalysis>(); -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addPreserved<AliasAnalysis>(); +      AU.addPreserved<GlobalsAAWrapperPass>();      } -    // Helper fuctions of redundant load elimination  +    // Helper functions of redundant load elimination       bool processLoad(LoadInst *L);      bool processNonLocalLoad(LoadInst *L); +    bool processAssumeIntrinsic(IntrinsicInst *II);      void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,                                    AvailValInBlkVect &ValuesPerBlock,                                   UnavailBlkVect &UnavailableBlocks); @@ -719,7 +729,9 @@ namespace {      void verifyRemoved(const Instruction *I) const;      bool splitCriticalEdges();      BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); -    bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); +    bool replaceOperandsWithConsts(Instruction *I) const; +    bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, +                           bool DominatesByEdge);      bool processFoldableCondBr(BranchInst *BI);      void addDeadBlock(BasicBlock *BB);      void assignValNumForDeadCode(); @@ -738,7 +750,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1290,8 +1303,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,    SSAUpdater SSAUpdate(&NewPHIs);    SSAUpdate.Initialize(LI->getType(), LI->getName()); -  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { -    const AvailableValueInBlock &AV = ValuesPerBlock[i]; +  for (const AvailableValueInBlock &AV : ValuesPerBlock) {      BasicBlock *BB = AV.BB;      if (SSAUpdate.HasValueForBlock(BB)) @@ -1301,24 +1313,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,    }    // Perform PHI construction. -  Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - -  // If new PHI nodes were created, notify alias analysis. -  if (V->getType()->getScalarType()->isPointerTy()) { -    AliasAnalysis *AA = gvn.getAliasAnalysis(); - -    // Scan the new PHIs and inform alias analysis that we've added potentially -    // escaping uses to any values that are operands to these PHIs. 
-    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { -      PHINode *P = NewPHIs[i]; -      for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { -        unsigned jj = PHINode::getOperandNumForIncomingValue(ii); -        AA->addEscapingUse(P->getOperandUse(jj)); -      } -    } -  } - -  return V; +  return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());  }  Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, @@ -1518,9 +1513,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,    // that we only have to insert *one* load (which means we're basically moving    // the load, not inserting a new one). -  SmallPtrSet<BasicBlock *, 4> Blockers; -  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) -    Blockers.insert(UnavailableBlocks[i]); +  SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(), +                                        UnavailableBlocks.end());    // Let's find the first basic block with more than one predecessor.  Walk    // backwards through predecessors if needed. @@ -1550,15 +1544,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,    // available.    MapVector<BasicBlock *, Value *> PredLoads;    DenseMap<BasicBlock*, char> FullyAvailableBlocks; -  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) -    FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; -  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) -    FullyAvailableBlocks[UnavailableBlocks[i]] = false; +  for (const AvailableValueInBlock &AV : ValuesPerBlock) +    FullyAvailableBlocks[AV.BB] = true; +  for (BasicBlock *UnavailableBB : UnavailableBlocks) +    FullyAvailableBlocks[UnavailableBB] = false;    SmallVector<BasicBlock *, 4> CriticalEdgePred; -  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); -       PI != E; ++PI) { -    BasicBlock *Pred = *PI; +  for (BasicBlock *Pred : predecessors(LoadBB)) { +    // If any predecessor block is an EH pad that does not allow non-PHI +    // instructions before the terminator, we can't PRE the load. +    if (Pred->getTerminator()->isEHPad()) { +      DEBUG(dbgs() +            << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" +            << Pred->getName() << "': " << *LI << '\n'); +      return false; +    } +      if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {        continue;      } @@ -1570,9 +1571,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,          return false;        } -      if (LoadBB->isLandingPad()) { +      if (LoadBB->isEHPad()) {          DEBUG(dbgs() -              << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" +              << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"                << Pred->getName() << "': " << *LI << '\n');          return false;        } @@ -1655,12 +1656,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,                   << *NewInsts.back() << '\n');    // Assign value numbers to the new instructions. -  for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { +  for (Instruction *I : NewInsts) {      // FIXME: We really _ought_ to insert these value numbers into their      // parent's availability map.  However, in doing so, we risk getting into      // ordering issues.  If a block hasn't been processed yet, we would be      // marking a value as AVAIL-IN, which isn't what we intend. 
-    VN.lookup_or_add(NewInsts[i]); +    VN.lookup_or_add(I);    }    for (const auto &PredLoad : PredLoads) { @@ -1677,6 +1678,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,      if (Tags)        NewLoad->setAAMetadata(Tags); +    if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load)) +      NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); +    if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) +      NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); +      // Transfer DebugLoc.      NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1704,6 +1710,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,  /// Attempt to eliminate a load whose dependencies are  /// non-local by performing PHI construction.  bool GVN::processNonLocalLoad(LoadInst *LI) { +  // non-local speculations are not allowed under asan. +  if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) +    return false; +    // Step 1: Find the non-local dependencies of the load.    LoadDepVect Deps;    MD->getNonLocalPointerDependency(LI, Deps); @@ -1777,6 +1787,63 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {    return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);  } +bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { +  assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && +         "This function can only be called with llvm.assume intrinsic"); +  Value *V = IntrinsicI->getArgOperand(0); + +  if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { +    if (Cond->isZero()) { +      Type *Int8Ty = Type::getInt8Ty(V->getContext()); +      // Insert a new store to null instruction before the load to indicate that +      // this code is not reachable.  FIXME: We could insert unreachable +      // instruction directly because we can modify the CFG. +      new StoreInst(UndefValue::get(Int8Ty), +                    Constant::getNullValue(Int8Ty->getPointerTo()), +                    IntrinsicI); +    } +    markInstructionForDeletion(IntrinsicI); +    return false; +  } + +  Constant *True = ConstantInt::getTrue(V->getContext()); +  bool Changed = false; + +  for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { +    BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); + +    // This property is only true in dominated successors, propagateEquality +    // will check dominance for us. +    Changed |= propagateEquality(V, True, Edge, false); +  } + +  // We can replace assume value with true, which covers cases like this: +  // call void @llvm.assume(i1 %cmp) +  // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true +  ReplaceWithConstMap[V] = True; + +  // If one of *cmp *eq operand is const, adding it to map will cover this: +  // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen +  // call void @llvm.assume(i1 %cmp) +  // ret float %0 ; will change it to ret float 3.000000e+00 +  if (auto *CmpI = dyn_cast<CmpInst>(V)) { +    if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || +        CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || +        (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && +         CmpI->getFastMathFlags().noNaNs())) { +      Value *CmpLHS = CmpI->getOperand(0); +      Value *CmpRHS = CmpI->getOperand(1); +      if (isa<Constant>(CmpLHS)) +        std::swap(CmpLHS, CmpRHS); +      auto *RHSConst = dyn_cast<Constant>(CmpRHS); + +      // If only one operand is constant. 
+      if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) +        ReplaceWithConstMap[CmpLHS] = RHSConst; +    } +  } +  return Changed; +}  static void patchReplacementInstruction(Instruction *I, Value *Repl) {    // Patch the replacement so that it is not more restrictive than the value @@ -1789,7 +1856,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {    if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {      // FIXME: If both the original and replacement value are part of the      // same control-flow region (meaning that the execution of one -    // guarentees the executation of the other), then we can combine the +    // guarantees the execution of the other), then we can combine the      // noalias scopes here and do better than the general conservative      // answer used in combineMetadata(). @@ -1797,13 +1864,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {      // regions, and so we need a conservative combination of the noalias      // scopes.      static const unsigned KnownIDs[] = { -      LLVMContext::MD_tbaa, -      LLVMContext::MD_alias_scope, -      LLVMContext::MD_noalias, -      LLVMContext::MD_range, -      LLVMContext::MD_fpmath, -      LLVMContext::MD_invariant_load, -    }; +        LLVMContext::MD_tbaa,           LLVMContext::MD_alias_scope, +        LLVMContext::MD_noalias,        LLVMContext::MD_range, +        LLVMContext::MD_fpmath,         LLVMContext::MD_invariant_load, +        LLVMContext::MD_invariant_group};      combineMetadata(ReplInst, I, KnownIDs);    }  } @@ -1890,10 +1954,8 @@ bool GVN::processLoad(LoadInst *L) {        ++NumGVNLoad;        return true;      } -  } -  // If the value isn't available, don't do anything! -  if (Dep.isClobber()) { +    // If the value isn't available, don't do anything!      DEBUG(        // fast print dep, using operator<< on instruction is too slow.        dbgs() << "GVN: load "; @@ -2049,11 +2111,31 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,    return Pred != nullptr;  } +// Tries to replace instruction with const, using information from +// ReplaceWithConstMap. +bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { +  bool Changed = false; +  for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { +    Value *Operand = Instr->getOperand(OpNum); +    auto it = ReplaceWithConstMap.find(Operand); +    if (it != ReplaceWithConstMap.end()) { +      assert(!isa<Constant>(Operand) && +             "Replacing constants with constants is invalid"); +      DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second +                   << " in instruction " << *Instr << '\n'); +      Instr->setOperand(OpNum, it->second); +      Changed = true; +    } +  } +  return Changed; +} +  /// The given values are known to be equal in every block  /// dominated by 'Root'.  Exploit this, for example by replacing 'LHS' with  /// 'RHS' everywhere in the scope.  Returns whether a change was made. -bool GVN::propagateEquality(Value *LHS, Value *RHS, -                            const BasicBlockEdge &Root) { +/// If DominatesByEdge is false, then it means that it is dominated by Root.End. 
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, +                            bool DominatesByEdge) {    SmallVector<std::pair<Value*, Value*>, 4> Worklist;    Worklist.push_back(std::make_pair(LHS, RHS));    bool Changed = false; @@ -2065,11 +2147,13 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,      std::pair<Value*, Value*> Item = Worklist.pop_back_val();      LHS = Item.first; RHS = Item.second; -    if (LHS == RHS) continue; +    if (LHS == RHS) +      continue;      assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");      // Don't try to propagate equalities between constants. -    if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; +    if (isa<Constant>(LHS) && isa<Constant>(RHS)) +      continue;      // Prefer a constant on the right-hand side, or an Argument if no constants.      if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) @@ -2108,7 +2192,11 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,      // LHS always has at least one use that is not dominated by Root, this will      // never do anything if LHS has only one use.      if (!LHS->hasOneUse()) { -      unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); +      unsigned NumReplacements = +          DominatesByEdge +              ? replaceDominatedUsesWith(LHS, RHS, *DT, Root) +              : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); +        Changed |= NumReplacements > 0;        NumGVNEqProp += NumReplacements;      } @@ -2180,7 +2268,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,          Value *NotCmp = findLeader(Root.getEnd(), Num);          if (NotCmp && isa<Instruction>(NotCmp)) {            unsigned NumReplacements = -            replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); +              DominatesByEdge +                  ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) +                  : replaceDominatedUsesWith(NotCmp, NotVal, *DT, +                                             Root.getEnd());            Changed |= NumReplacements > 0;            NumGVNEqProp += NumReplacements;          } @@ -2220,6 +2311,10 @@ bool GVN::processInstruction(Instruction *I) {      return true;    } +  if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) +    if (IntrinsicI->getIntrinsicID() == Intrinsic::assume) +      return processAssumeIntrinsic(IntrinsicI); +    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {      if (processLoad(LI))        return true; @@ -2250,11 +2345,11 @@ bool GVN::processInstruction(Instruction *I) {      Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());      BasicBlockEdge TrueE(Parent, TrueSucc); -    Changed |= propagateEquality(BranchCond, TrueVal, TrueE); +    Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);      Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());      BasicBlockEdge FalseE(Parent, FalseSucc); -    Changed |= propagateEquality(BranchCond, FalseVal, FalseE); +    Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);      return Changed;    } @@ -2276,7 +2371,7 @@ bool GVN::processInstruction(Instruction *I) {        // If there is only a single edge, propagate the case value into it.        
if (SwitchEdges.lookup(Dst) == 1) {          BasicBlockEdge E(Parent, Dst); -        Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); +        Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);        }      }      return Changed; @@ -2284,7 +2379,8 @@ bool GVN::processInstruction(Instruction *I) {    // Instructions with void type don't return a value, so there's    // no point in trying to find redundancies in them. -  if (I->getType()->isVoidTy()) return false; +  if (I->getType()->isVoidTy()) +    return false;    uint32_t NextNum = VN.getNextUnusedValueNumber();    unsigned Num = VN.lookup_or_add(I); @@ -2306,17 +2402,21 @@ bool GVN::processInstruction(Instruction *I) {    // Perform fast-path value-number based elimination of values inherited from    // dominators. -  Value *repl = findLeader(I->getParent(), Num); -  if (!repl) { +  Value *Repl = findLeader(I->getParent(), Num); +  if (!Repl) {      // Failure, just remember this instance for future use.      addToLeaderTable(Num, I, I->getParent());      return false; +  } else if (Repl == I) { +    // If I was the result of a shortcut PRE, it might already be in the table +    // and the best replacement for itself. Nothing to do. +    return false;    }    // Remove it! -  patchAndReplaceAllUsesWith(I, repl); -  if (MD && repl->getType()->getScalarType()->isPointerTy()) -    MD->invalidateCachedPointerInfo(repl); +  patchAndReplaceAllUsesWith(I, Repl); +  if (MD && Repl->getType()->getScalarType()->isPointerTy()) +    MD->invalidateCachedPointerInfo(Repl);    markInstructionForDeletion(I);    return true;  } @@ -2331,7 +2431,7 @@ bool GVN::runOnFunction(Function& F) {    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); -  VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); +  VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults());    VN.setMemDep(MD);    VN.setDomTree(DT); @@ -2341,10 +2441,10 @@ bool GVN::runOnFunction(Function& F) {    // Merge unconditional branches, allowing PRE to catch more    // optimization opportunities.    for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { -    BasicBlock *BB = FI++; +    BasicBlock *BB = &*FI++; -    bool removedBlock = MergeBlockIntoPredecessor( -        BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); +    bool removedBlock = +        MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD);      if (removedBlock) ++NumGVNBlocks;      Changed |= removedBlock; @@ -2382,7 +2482,6 @@ bool GVN::runOnFunction(Function& F) {    return Changed;  } -  bool GVN::processBlock(BasicBlock *BB) {    // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function    // (and incrementing BI before processing an instruction). @@ -2391,11 +2490,16 @@ bool GVN::processBlock(BasicBlock *BB) {    if (DeadBlocks.count(BB))      return false; +  // Clearing map before every BB because it can be used only for single BB. 
+  ReplaceWithConstMap.clear();    bool ChangedFunction = false;    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();         BI != BE;) { -    ChangedFunction |= processInstruction(BI); +    if (!ReplaceWithConstMap.empty()) +      ChangedFunction |= replaceOperandsWithConsts(&*BI); +    ChangedFunction |= processInstruction(&*BI); +      if (InstrsToErase.empty()) {        ++BI;        continue; @@ -2439,7 +2543,14 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,      Value *Op = Instr->getOperand(i);      if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))        continue; - +    // This could be a newly inserted instruction, in which case, we won't +    // find a value number, and should give up before we hurt ourselves. +    // FIXME: Rewrite the infrastructure to let it easier to value number +    // and process newly inserted instructions. +    if (!VN.exists(Op)) { +      success = false; +      break; +    }      if (Value *V = findLeader(Pred, VN.lookup(Op))) {        Instr->setOperand(i, V);      } else { @@ -2499,9 +2610,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {    BasicBlock *CurrentBlock = CurInst->getParent();    predMap.clear(); -  for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); -       PI != PE; ++PI) { -    BasicBlock *P = *PI; +  for (BasicBlock *P : predecessors(CurrentBlock)) {      // We're not interested in PRE where the block is its      // own predecessor, or in blocks with predecessors      // that are not reachable. @@ -2570,7 +2679,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {    // Create a PHI to make the value available in this block.    PHINode *Phi =        PHINode::Create(CurInst->getType(), predMap.size(), -                      CurInst->getName() + ".pre-phi", CurrentBlock->begin()); +                      CurInst->getName() + ".pre-phi", &CurrentBlock->front());    for (unsigned i = 0, e = predMap.size(); i != e; ++i) {      if (Value *V = predMap[i].first)        Phi->addIncoming(V, predMap[i].second); @@ -2582,18 +2691,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) {    addToLeaderTable(ValNo, Phi, CurrentBlock);    Phi->setDebugLoc(CurInst->getDebugLoc());    CurInst->replaceAllUsesWith(Phi); -  if (Phi->getType()->getScalarType()->isPointerTy()) { -    // Because we have added a PHI-use of the pointer value, it has now -    // "escaped" from alias analysis' perspective.  We need to inform -    // AA of this. -    for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { -      unsigned jj = PHINode::getOperandNumForIncomingValue(ii); -      VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); -    } - -    if (MD) -      MD->invalidateCachedPointerInfo(Phi); -  } +  if (MD && Phi->getType()->getScalarType()->isPointerTy()) +    MD->invalidateCachedPointerInfo(Phi);    VN.erase(CurInst);    removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2616,15 +2715,15 @@ bool GVN::performPRE(Function &F) {      if (CurrentBlock == &F.getEntryBlock())        continue; -    // Don't perform PRE on a landing pad. -    if (CurrentBlock->isLandingPad()) +    // Don't perform PRE on an EH pad. 
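The performScalarPRE / performScalarPREInsertion changes above (the VN.exists guard plus the range-based predecessor walk) implement the usual partial-redundancy-elimination move: when an expression is available in all but one predecessor, materialize it in the missing predecessor and merge the copies through a PHI. A source-level before/after sketch with made-up operands:

```cpp
#include <cassert>

// a + b is partially redundant: computed on the taken path and again after the join.
int beforePRE(int a, int b, bool p) {
  int t = 0;
  if (p)
    t = a + b;
  return t + (a + b);
}

// After PRE the expression is inserted in the predecessor that lacked it and the
// redundant computation reuses the merged ("phi") value instead.
int afterPRE(int a, int b, bool p) {
  int ab, t;
  if (p) {
    ab = a + b;
    t  = ab;
  } else {
    ab = a + b;   // inserted by PRE in the previously non-computing predecessor
    t  = 0;
  }
  return t + ab;  // reuse of the phi instead of re-evaluating a + b
}

int main() {
  for (int p = 0; p < 2; ++p)
    assert(beforePRE(3, 4, p != 0) == afterPRE(3, 4, p != 0));
  return 0;
}
```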
+    if (CurrentBlock->isEHPad())        continue;      for (BasicBlock::iterator BI = CurrentBlock->begin(),                                BE = CurrentBlock->end();           BI != BE;) { -      Instruction *CurInst = BI++; -      Changed = performScalarPRE(CurInst); +      Instruction *CurInst = &*BI++; +      Changed |= performScalarPRE(CurInst);      }    } @@ -2637,8 +2736,8 @@ bool GVN::performPRE(Function &F) {  /// Split the critical edge connecting the given two blocks, and return  /// the block inserted to the critical edge.  BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { -  BasicBlock *BB = SplitCriticalEdge( -      Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); +  BasicBlock *BB = +      SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));    if (MD)      MD->invalidateCachedPredecessors();    return BB; @@ -2652,7 +2751,7 @@ bool GVN::splitCriticalEdges() {    do {      std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();      SplitCriticalEdge(Edge.first, Edge.second, -                      CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); +                      CriticalEdgeSplittingOptions(DT));    } while (!toSplit.empty());    if (MD) MD->invalidateCachedPredecessors();    return true; @@ -2728,17 +2827,14 @@ void GVN::addDeadBlock(BasicBlock *BB) {      DeadBlocks.insert(Dom.begin(), Dom.end());      // Figure out the dominance-frontier(D). -    for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), -           E = Dom.end(); I != E; I++) { -      BasicBlock *B = *I; -      for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { -        BasicBlock *S = *SI; +    for (BasicBlock *B : Dom) { +      for (BasicBlock *S : successors(B)) {          if (DeadBlocks.count(S))            continue;          bool AllPredDead = true; -        for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) -          if (!DeadBlocks.count(*PI)) { +        for (BasicBlock *P : predecessors(S)) +          if (!DeadBlocks.count(P)) {              AllPredDead = false;              break;            } @@ -2766,10 +2862,7 @@ void GVN::addDeadBlock(BasicBlock *BB) {        continue;      SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); -    for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), -           PE = Preds.end(); PI != PE; PI++) { -      BasicBlock *P = *PI; - +    for (BasicBlock *P : Preds) {        if (!DeadBlocks.count(P))          continue; @@ -2794,7 +2887,7 @@ void GVN::addDeadBlock(BasicBlock *BB) {  //     R be the target of the dead out-coming edge.  //  1) Identify the set of dead blocks implied by the branch's dead outcoming  //     edge. The result of this step will be {X| X is dominated by R} -//  2) Identify those blocks which haves at least one dead prodecessor. The +//  2) Identify those blocks which haves at least one dead predecessor. The  //     result of this step will be dominance-frontier(R).  //  3) Update the PHIs in DF(R) by replacing the operands corresponding to   //     dead blocks with "UndefVal" in an hope these PHIs will optimized away. @@ -2829,14 +2922,10 @@ bool GVN::processFoldableCondBr(BranchInst *BI) {  // instructions, it makes more sense just to "fabricate" a val-number for the  // dead code than checking if instruction involved is dead or not.  
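The addDeadBlock hunks above walk the region dominated by the dead edge's target and then look at its "dominance frontier": successors that keep at least one live predecessor get their PHI operands patched, while successors whose predecessors are all dead join the dead set themselves. A toy, self-contained version of that frontier walk over a hypothetical four-block CFG:

```cpp
#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Hypothetical CFG: 0 -> {1,2}, 1 -> {3}, 2 -> {3}; block 1 is the dead region.
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  std::vector<std::vector<int>> Preds(4);
  for (int B = 0; B < 4; ++B)
    for (int S : Succs[B])
      Preds[S].push_back(B);

  std::set<int> Dead = {1};
  for (int B : std::set<int>(Dead)) {      // iterate a snapshot of the dead set
    for (int S : Succs[B]) {
      if (Dead.count(S))
        continue;
      bool AllPredDead = true;
      for (int P : Preds[S])
        if (!Dead.count(P)) {
          AllPredDead = false;
          break;
        }
      if (AllPredDead)
        Dead.insert(S);                    // the successor itself is dead
      else
        std::printf("block %d is in DF(dead region); patch its PHI operands\n", S);
    }
  }
  return 0;
}
```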
void GVN::assignValNumForDeadCode() { -  for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), -        E = DeadBlocks.end(); I != E; I++) { -    BasicBlock *BB = *I; -    for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); -          II != EE; II++) { -      Instruction *Inst = &*II; -      unsigned ValNum = VN.lookup_or_add(Inst); -      addToLeaderTable(ValNum, Inst, BB); +  for (BasicBlock *BB : DeadBlocks) { +    for (Instruction &Inst : *BB) { +      unsigned ValNum = VN.lookup_or_add(&Inst); +      addToLeaderTable(ValNum, &Inst, BB);      }    }  } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2a954d9961f2..ec5e15f0b8f8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -28,9 +28,11 @@  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/IR/BasicBlock.h" @@ -48,6 +50,7 @@  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Utils/SimplifyIndVar.h"  using namespace llvm; @@ -83,64 +86,62 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue(  namespace {  struct RewritePhi; -} -namespace { -  class IndVarSimplify : public LoopPass { -    LoopInfo                  *LI; -    ScalarEvolution           *SE; -    DominatorTree             *DT; -    TargetLibraryInfo         *TLI; -    const TargetTransformInfo *TTI; - -    SmallVector<WeakVH, 16> DeadInsts; -    bool Changed; -  public: - -    static char ID; // Pass identification, replacement for typeid -    IndVarSimplify() -        : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { -      initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); -    } +class IndVarSimplify : public LoopPass { +  LoopInfo                  *LI; +  ScalarEvolution           *SE; +  DominatorTree             *DT; +  TargetLibraryInfo         *TLI; +  const TargetTransformInfo *TTI; -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; - -    void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.addRequired<DominatorTreeWrapperPass>(); -      AU.addRequired<LoopInfoWrapperPass>(); -      AU.addRequired<ScalarEvolution>(); -      AU.addRequiredID(LoopSimplifyID); -      AU.addRequiredID(LCSSAID); -      AU.addPreserved<ScalarEvolution>(); -      AU.addPreservedID(LoopSimplifyID); -      AU.addPreservedID(LCSSAID); -      AU.setPreservesCFG(); -    } +  SmallVector<WeakVH, 16> DeadInsts; +  bool Changed; +public: -  private: -    void releaseMemory() override { -      DeadInsts.clear(); -    } +  static char ID; // Pass identification, replacement for typeid +  IndVarSimplify() +    : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { +    initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); +  } -    bool isValidRewrite(Value *FromVal, Value *ToVal); +  bool runOnLoop(Loop *L, LPPassManager &LPM) override; + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    
AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addRequired<LoopInfoWrapperPass>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addRequiredID(LoopSimplifyID); +    AU.addRequiredID(LCSSAID); +    AU.addPreserved<GlobalsAAWrapperPass>(); +    AU.addPreserved<ScalarEvolutionWrapperPass>(); +    AU.addPreservedID(LoopSimplifyID); +    AU.addPreservedID(LCSSAID); +    AU.setPreservesCFG(); +  } -    void HandleFloatingPointIV(Loop *L, PHINode *PH); -    void RewriteNonIntegerIVs(Loop *L); +private: +  void releaseMemory() override { +    DeadInsts.clear(); +  } -    void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); +  bool isValidRewrite(Value *FromVal, Value *ToVal); -    bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); -    void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); +  void handleFloatingPointIV(Loop *L, PHINode *PH); +  void rewriteNonIntegerIVs(Loop *L); -    Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, -                                     PHINode *IndVar, SCEVExpander &Rewriter); +  void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); -    void SinkUnusedInvariants(Loop *L); +  bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); +  void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); -    Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, -                              Instruction *InsertPt, Type *Ty, -                              bool &IsHighCostExpansion); -  }; +  Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, +                                   PHINode *IndVar, SCEVExpander &Rewriter); + +  void sinkUnusedInvariants(Loop *L); + +  Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, +                            Instruction *InsertPt, Type *Ty); +};  }  char IndVarSimplify::ID = 0; @@ -148,7 +149,7 @@ INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",                  "Induction Variable Simplification", false, false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA)  INITIALIZE_PASS_END(IndVarSimplify, "indvars", @@ -158,10 +159,10 @@ Pass *llvm::createIndVarSimplifyPass() {    return new IndVarSimplify();  } -/// isValidRewrite - Return true if the SCEV expansion generated by the -/// rewriter can replace the original value. SCEV guarantees that it -/// produces the same value, but the way it is produced may be illegal IR. -/// Ideally, this function will only be called for verification. +/// Return true if the SCEV expansion generated by the rewriter can replace the +/// original value. SCEV guarantees that it produces the same value, but the way +/// it is produced may be illegal IR.  Ideally, this function will only be +/// called for verification.  bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {    // If an SCEV expression subsumed multiple pointers, its expansion could    // reassociate the GEP changing the base pointer. This is illegal because the @@ -175,10 +176,10 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {    // because it understands lcssa phis while SCEV does not.    
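The getAnalysisUsage and INITIALIZE_PASS_DEPENDENCY edits above follow the 3.8-era move of ScalarEvolution behind a wrapper pass. A skeleton legacy loop pass using the same pattern; the pass name is hypothetical and the snippet is a sketch meant to compile inside an LLVM tree of this vintage, not a standalone program:

```cpp
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

namespace {
struct ExampleLoopPass : public LoopPass {
  static char ID;
  ExampleLoopPass() : LoopPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();   // new wrapper-pass spelling
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.setPreservesCFG();
  }

  bool runOnLoop(Loop *L, LPPassManager &) override {
    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    (void)SE.getBackedgeTakenCount(L);              // analysis only, no IR change
    return false;
  }
};
}
char ExampleLoopPass::ID = 0;
```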
Value *FromPtr = FromVal;    Value *ToPtr = ToVal; -  if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { +  if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {      FromPtr = GEP->getPointerOperand();    } -  if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { +  if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {      ToPtr = GEP->getPointerOperand();    }    if (FromPtr != FromVal || ToPtr != ToVal) { @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {  /// loop. For PHI nodes, there may be multiple uses, so compute the nearest  /// common dominator for the incoming blocks.  static Instruction *getInsertPointForUses(Instruction *User, Value *Def, -                                          DominatorTree *DT) { +                                          DominatorTree *DT, LoopInfo *LI) {    PHINode *PHI = dyn_cast<PHINode>(User);    if (!PHI)      return User; @@ -234,17 +235,28 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def,      InsertPt = InsertBB->getTerminator();    }    assert(InsertPt && "Missing phi operand"); -  assert((!isa<Instruction>(Def) || -          DT->dominates(cast<Instruction>(Def), InsertPt)) && -         "def does not dominate all uses"); -  return InsertPt; + +  auto *DefI = dyn_cast<Instruction>(Def); +  if (!DefI) +    return InsertPt; + +  assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + +  auto *L = LI->getLoopFor(DefI->getParent()); +  assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + +  for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) +    if (LI->getLoopFor(DTN->getBlock()) == L) +      return DTN->getBlock()->getTerminator(); + +  llvm_unreachable("DefI dominates InsertPt!");  }  //===----------------------------------------------------------------------===// -// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +// rewriteNonIntegerIVs and helpers. Prefer integer IVs.  //===----------------------------------------------------------------------===// -/// ConvertToSInt - Convert APF to an integer, if possible. +/// Convert APF to an integer, if possible.  static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {    bool isExact = false;    // See if we can convert this to an int64_t @@ -256,8 +268,8 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {    return true;  } -/// HandleFloatingPointIV - If the loop has floating induction variable -/// then insert corresponding integer induction variable if possible. +/// If the loop has floating induction variable then insert corresponding +/// integer induction variable if possible.  /// For example,  /// for(double i = 0; i < 10000; ++i)  ///   bar(i) @@ -265,13 +277,12 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {  /// for(int i = 0; i < 10000; ++i)  ///   bar((double)i);  /// -void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { +void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {    unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));    unsigned BackEdge     = IncomingEdge^1;    // Check incoming value. -  ConstantFP *InitValueVal = -    dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); +  auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));    int64_t InitValue;    if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) @@ -279,8 +290,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {    // Check IV increment. 
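handleFloatingPointIV, whose entry checks begin above, rewrites a floating-point induction variable as an integer one when the start value, step and bound all convert to integers exactly. A self-contained sketch of the exactness test and of the before/after loops; the helper name mirrors ConvertToSInt but is only an illustration of the idea:

```cpp
#include <cassert>
#include <cstdint>

// "isExact" check in the spirit of ConvertToSInt: the double must round-trip.
static bool convertToSIntExact(double V, int64_t &IntVal) {
  IntVal = static_cast<int64_t>(V);
  return static_cast<double>(IntVal) == V;
}

// for (double i = 0; i < 100; ++i) bar(i)
static double sumWithFPIV() {
  double S = 0;
  for (double i = 0; i < 100; ++i)
    S += i;
  return S;
}

// for (int i = 0; i < 100; ++i) bar((double)i)  -- the shape the pass produces
static double sumWithIntIV() {
  double S = 0;
  for (int64_t i = 0; i < 100; ++i)
    S += static_cast<double>(i);
  return S;
}

int main() {
  int64_t IV;
  assert(convertToSIntExact(0.0, IV) && IV == 0);
  assert(!convertToSIntExact(0.5, IV));   // not exact, so such an IV is left alone
  assert(sumWithFPIV() == sumWithIntIV());
  return 0;
}
```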
Reject this PN if increment operation is not    // an add or increment value can not be represented by an integer. -  BinaryOperator *Incr = -    dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); +  auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));    if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return;    // If this is not an add of the PHI with a constantfp, or if the constant fp @@ -456,14 +466,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {    // platforms.    if (WeakPH) {      Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", -                                 PN->getParent()->getFirstInsertionPt()); +                                 &*PN->getParent()->getFirstInsertionPt());      PN->replaceAllUsesWith(Conv);      RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);    }    Changed = true;  } -void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { +void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {    // First step.  Check to see if there are any floating-point recurrences.    // If there are, change them into integer recurrences, permitting analysis by    // the SCEV routines. @@ -477,7 +487,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {    for (unsigned i = 0, e = PHIs.size(); i != e; ++i)      if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) -      HandleFloatingPointIV(L, PN); +      handleFloatingPointIV(L, PN);    // If the loop previously had floating-point IV, ScalarEvolution    // may not have been able to compute a trip count. Now that we've done some @@ -488,7 +498,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {  namespace {  // Collect information about PHI nodes which can be transformed in -// RewriteLoopExitValues. +// rewriteLoopExitValues.  struct RewritePhi {    PHINode *PN;    unsigned Ith;  // Ith incoming value. @@ -501,70 +511,37 @@ struct RewritePhi {  };  } -Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, +Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,                                            Loop *L, Instruction *InsertPt, -                                          Type *ResultTy, -                                          bool &IsHighCostExpansion) { -  using namespace llvm::PatternMatch; - -  if (!Rewriter.isHighCostExpansion(S, L)) { -    IsHighCostExpansion = false; -    return Rewriter.expandCodeFor(S, ResultTy, InsertPt); -  } - +                                          Type *ResultTy) {    // Before expanding S into an expensive LLVM expression, see if we can use an -  // already existing value as the expansion for S.  There is potential to make -  // this significantly smarter, but this simple heuristic already gets some -  // interesting cases. - -  SmallVector<BasicBlock *, 4> Latches; -  L->getLoopLatches(Latches); - -  for (BasicBlock *BB : Latches) { -    ICmpInst::Predicate Pred; -    Instruction *LHS, *RHS; -    BasicBlock *TrueBB, *FalseBB; - -    if (!match(BB->getTerminator(), -               m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), -                    TrueBB, FalseBB))) -      continue; - -    if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { -      IsHighCostExpansion = false; -      return LHS; -    } - -    if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { -      IsHighCostExpansion = false; -      return RHS; -    } -  } +  // already existing value as the expansion for S. 
+  if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L)) +    if (ExistingValue->getType() == ResultTy) +      return ExistingValue;    // We didn't find anything, fall back to using SCEVExpander. -  assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); -  IsHighCostExpansion = true;    return Rewriter.expandCodeFor(S, ResultTy, InsertPt);  }  //===----------------------------------------------------------------------===// -// RewriteLoopExitValues - Optimize IV users outside the loop. +// rewriteLoopExitValues - Optimize IV users outside the loop.  // As a side effect, reduces the amount of IV processing within the loop.  //===----------------------------------------------------------------------===// -/// RewriteLoopExitValues - Check to see if this loop has a computable -/// loop-invariant execution count.  If so, this means that we can compute the -/// final value of any expressions that are recurrent in the loop, and -/// substitute the exit values from the loop into any instructions outside of -/// the loop that use the final values of the current expressions. +/// Check to see if this loop has a computable loop-invariant execution count. +/// If so, this means that we can compute the final value of any expressions +/// that are recurrent in the loop, and substitute the exit values from the loop +/// into any instructions outside of the loop that use the final values of the +/// current expressions.  ///  /// This is mostly redundant with the regular IndVarSimplify activities that  /// happen later, except that it's more powerful in some cases, because it's  /// able to brute-force evaluate arbitrary instructions as long as they have  /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { -  // Verify the input to the pass in already in LCSSA form. -  assert(L->isLCSSAForm(*DT)); +void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { +  // Check a pre-condition. +  assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");    SmallVector<BasicBlock*, 8> ExitBlocks;    L->getUniqueExitBlocks(ExitBlocks); @@ -679,9 +656,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {              continue;          } -        bool HighCost = false; -        Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, -                                            PN->getType(), HighCost); +        bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); +        Value *ExitVal = +            expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());          DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'                       << "  LoopVal = " << *Inst << "\n"); @@ -698,7 +675,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {      }    } -  bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); +  bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);    // Transformation.    for (const RewritePhi &Phi : RewritePhiSet) { @@ -735,10 +712,10 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {    Rewriter.clearInsertPoint();  } -/// CanLoopBeDeleted - Check whether it is possible to delete the loop after -/// rewriting exit value. If it is possible, ignore ReplaceExitValue and -/// do rewriting aggressively. 
-bool IndVarSimplify::CanLoopBeDeleted( +/// Check whether it is possible to delete the loop after rewriting exit +/// value. If it is possible, ignore ReplaceExitValue and do rewriting +/// aggressively. +bool IndVarSimplify::canLoopBeDeleted(      Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {    BasicBlock *Preheader = L->getLoopPreheader(); @@ -782,14 +759,9 @@ bool IndVarSimplify::CanLoopBeDeleted(      ++BI;    } -  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); -       LI != LE; ++LI) { -    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; -         ++BI) { -      if (BI->mayHaveSideEffects()) -        return false; -    } -  } +  for (auto *BB : L->blocks()) +    if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) +      return false;    return true;  } @@ -799,22 +771,19 @@ bool IndVarSimplify::CanLoopBeDeleted(  //===----------------------------------------------------------------------===//  namespace { -  // Collect information about induction variables that are used by sign/zero -  // extend operations. This information is recorded by CollectExtend and -  // provides the input to WidenIV. -  struct WideIVInfo { -    PHINode *NarrowIV; -    Type *WidestNativeType; // Widest integer type created [sz]ext -    bool IsSigned;          // Was a sext user seen before a zext? - -    WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), -                   IsSigned(false) {} -  }; +// Collect information about induction variables that are used by sign/zero +// extend operations. This information is recorded by CollectExtend and provides +// the input to WidenIV. +struct WideIVInfo { +  PHINode *NarrowIV = nullptr; +  Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext +  bool IsSigned = false;            // Was a sext user seen before a zext? +};  } -/// visitCast - Update information about the induction variable that is -/// extended by this sign or zero extend operation. This is used to determine -/// the final width of the IV before actually widening it. +/// Update information about the induction variable that is extended by this +/// sign or zero extend operation. This is used to determine the final width of +/// the IV before actually widening it.  static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,                          const TargetTransformInfo *TTI) {    bool IsSigned = Cast->getOpcode() == Instruction::SExt; @@ -855,24 +824,29 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,  namespace { -/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the -/// WideIV that computes the same value as the Narrow IV def.  This avoids -/// caching Use* pointers. +/// Record a link in the Narrow IV def-use chain along with the WideIV that +/// computes the same value as the Narrow IV def.  This avoids caching Use* +/// pointers.  struct NarrowIVDefUse { -  Instruction *NarrowDef; -  Instruction *NarrowUse; -  Instruction *WideDef; - -  NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} - -  NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): -    NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} +  Instruction *NarrowDef = nullptr; +  Instruction *NarrowUse = nullptr; +  Instruction *WideDef = nullptr; + +  // True if the narrow def is never negative.  
Tracking this information lets +  // us use a sign extension instead of a zero extension or vice versa, when +  // profitable and legal. +  bool NeverNegative = false; + +  NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD, +                 bool NeverNegative) +      : NarrowDef(ND), NarrowUse(NU), WideDef(WD), +        NeverNegative(NeverNegative) {}  }; -/// WidenIV - The goal of this transform is to remove sign and zero extends -/// without creating any new induction variables. To do this, it creates a new -/// phi of the wider type and redirects all users, either removing extends or -/// inserting truncs whenever we stop propagating the type. +/// The goal of this transform is to remove sign and zero extends without +/// creating any new induction variables. To do this, it creates a new phi of +/// the wider type and redirects all users, either removing extends or inserting +/// truncs whenever we stop propagating the type.  ///  class WidenIV {    // Parameters @@ -913,32 +887,35 @@ public:      assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");    } -  PHINode *CreateWideIV(SCEVExpander &Rewriter); +  PHINode *createWideIV(SCEVExpander &Rewriter);  protected: -  Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, -                   Instruction *Use); +  Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned, +                          Instruction *Use); -  Instruction *CloneIVUser(NarrowIVDefUse DU); +  Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR); +  Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU, +                                     const SCEVAddRecExpr *WideAR); +  Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU); -  const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); +  const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse); -  const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); +  const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU); -  const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +  const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,                                unsigned OpCode) const; -  Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); +  Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); -  bool WidenLoopCompare(NarrowIVDefUse DU); +  bool widenLoopCompare(NarrowIVDefUse DU);    void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);  };  } // anonymous namespace -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. +/// Perform a quick domtree based check for loop invariance assuming that V is +/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this +/// purpose.  
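The new NeverNegative bit on NarrowIVDefUse records that the narrow definition is provably non-negative, in which case sign extension and zero extension of it produce the same wide value; that equivalence is what lets widenLoopCompare, further down, mix a zero-extended IV with a signed comparison. An exhaustive self-contained check of the fact for i8 widened to i32:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t v = 0; v <= INT8_MAX; ++v) {        // every non-negative i8 value
    int8_t  Narrow = static_cast<int8_t>(v);
    int32_t SExt   = static_cast<int32_t>(Narrow);                        // sext
    int32_t ZExt   = static_cast<int32_t>(static_cast<uint8_t>(Narrow));  // zext
    assert(SExt == ZExt);  // non-negative: both extensions agree, so either
                           // widened form may feed a signed compare
  }
  return 0;
}
```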
static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {    Instruction *Inst = dyn_cast<Instruction>(V);    if (!Inst) @@ -947,8 +924,8 @@ static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {    return DT->properlyDominates(Inst->getParent(), L->getHeader());  } -Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, -                          Instruction *Use) { +Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType, +                                 bool IsSigned, Instruction *Use) {    // Set the debug location and conservative insertion point.    IRBuilder<> Builder(Use);    // Hoist the insertion point into loop preheaders as far as possible. @@ -961,10 +938,11 @@ Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned,                      Builder.CreateZExt(NarrowOper, WideType);  } -/// CloneIVUser - Instantiate a wide operation to replace a narrow -/// operation. This only needs to handle operations that can evaluation to -/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. -Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { +/// Instantiate a wide operation to replace a narrow operation. This only needs +/// to handle operations that can evaluation to SCEVAddRec. It can safely return +/// 0 for any operation we decide not to clone. +Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU, +                                  const SCEVAddRecExpr *WideAR) {    unsigned Opcode = DU.NarrowUse->getOpcode();    switch (Opcode) {    default: @@ -973,40 +951,140 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) {    case Instruction::Mul:    case Instruction::UDiv:    case Instruction::Sub: +    return cloneArithmeticIVUser(DU, WideAR); +    case Instruction::And:    case Instruction::Or:    case Instruction::Xor:    case Instruction::Shl:    case Instruction::LShr:    case Instruction::AShr: -    DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - -    // Replace NarrowDef operands with WideDef. Otherwise, we don't know -    // anything about the narrow operand yet so must insert a [sz]ext. It is -    // probably loop invariant and will be folded or hoisted. If it actually -    // comes from a widened IV, it should be removed during a future call to -    // WidenIVUse. -    Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : -      getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); -    Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? 
DU.WideDef : -      getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - -    BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); -    BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), -                                                    LHS, RHS, -                                                    NarrowBO->getName()); -    IRBuilder<> Builder(DU.NarrowUse); -    Builder.Insert(WideBO); -    if (const OverflowingBinaryOperator *OBO = -        dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { -      if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); -      if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); +    return cloneBitwiseIVUser(DU); +  } +} + +Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { +  Instruction *NarrowUse = DU.NarrowUse; +  Instruction *NarrowDef = DU.NarrowDef; +  Instruction *WideDef = DU.WideDef; + +  DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + +  // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything +  // about the narrow operand yet so must insert a [sz]ext. It is probably loop +  // invariant and will be folded or hoisted. If it actually comes from a +  // widened IV, it should be removed during a future call to widenIVUse. +  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) +                   ? WideDef +                   : createExtendInst(NarrowUse->getOperand(0), WideType, +                                      IsSigned, NarrowUse); +  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) +                   ? WideDef +                   : createExtendInst(NarrowUse->getOperand(1), WideType, +                                      IsSigned, NarrowUse); + +  auto *NarrowBO = cast<BinaryOperator>(NarrowUse); +  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, +                                        NarrowBO->getName()); +  IRBuilder<> Builder(NarrowUse); +  Builder.Insert(WideBO); +  WideBO->copyIRFlags(NarrowBO); +  return WideBO; +} + +Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, +                                            const SCEVAddRecExpr *WideAR) { +  Instruction *NarrowUse = DU.NarrowUse; +  Instruction *NarrowDef = DU.NarrowDef; +  Instruction *WideDef = DU.WideDef; + +  DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + +  unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; + +  // We're trying to find X such that +  // +  //  Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X +  // +  // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef), +  // and check using SCEV if any of them are correct. + +  // Returns true if extending NonIVNarrowDef according to `SignExt` is a +  // correct solution to X. 
+  auto GuessNonIVOperand = [&](bool SignExt) { +    const SCEV *WideLHS; +    const SCEV *WideRHS; + +    auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { +      if (SignExt) +        return SE->getSignExtendExpr(S, Ty); +      return SE->getZeroExtendExpr(S, Ty); +    }; + +    if (IVOpIdx == 0) { +      WideLHS = SE->getSCEV(WideDef); +      const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1)); +      WideRHS = GetExtend(NarrowRHS, WideType); +    } else { +      const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0)); +      WideLHS = GetExtend(NarrowLHS, WideType); +      WideRHS = SE->getSCEV(WideDef); +    } + +    // WideUse is "WideDef `op.wide` X" as described in the comment. +    const SCEV *WideUse = nullptr; + +    switch (NarrowUse->getOpcode()) { +    default: +      llvm_unreachable("No other possibility!"); + +    case Instruction::Add: +      WideUse = SE->getAddExpr(WideLHS, WideRHS); +      break; + +    case Instruction::Mul: +      WideUse = SE->getMulExpr(WideLHS, WideRHS); +      break; + +    case Instruction::UDiv: +      WideUse = SE->getUDivExpr(WideLHS, WideRHS); +      break; + +    case Instruction::Sub: +      WideUse = SE->getMinusSCEV(WideLHS, WideRHS); +      break;      } -    return WideBO; + +    return WideUse == WideAR; +  }; + +  bool SignExtend = IsSigned; +  if (!GuessNonIVOperand(SignExtend)) { +    SignExtend = !SignExtend; +    if (!GuessNonIVOperand(SignExtend)) +      return nullptr;    } + +  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) +                   ? WideDef +                   : createExtendInst(NarrowUse->getOperand(0), WideType, +                                      SignExtend, NarrowUse); +  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) +                   ? WideDef +                   : createExtendInst(NarrowUse->getOperand(1), WideType, +                                      SignExtend, NarrowUse); + +  auto *NarrowBO = cast<BinaryOperator>(NarrowUse); +  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, +                                        NarrowBO->getName()); + +  IRBuilder<> Builder(NarrowUse); +  Builder.Insert(WideBO); +  WideBO->copyIRFlags(NarrowBO); +  return WideBO;  } -const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,                                       unsigned OpCode) const {    if (OpCode == Instruction::Add)      return SE->getAddExpr(LHS, RHS); @@ -1022,7 +1100,7 @@ const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,  /// operands. Generate the SCEV value for the widened operation without  /// actually modifying the IR yet. If the expression after extending the  /// operands is an AddRec for this loop, return it. 
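cloneArithmeticIVUser's GuessNonIVOperand, shown above, guesses whether the non-IV operand should be sign- or zero-extended and asks SCEV which guess reproduces the wide recurrence. A toy numeric analog of that guess-and-check, using concrete i8/i32 values chosen so the narrow operation does not wrap; the values are purely illustrative:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  int8_t  NarrowIV = 20;        // the (signed) narrow induction value
  int8_t  NonIV    = -3;        // the other, non-IV operand
  int32_t WideIV   = NarrowIV;  // sext, as the widened IV would be

  // Reference: sign-extend the narrow result (no wrap for these inputs).
  int32_t Reference = static_cast<int32_t>(static_cast<int8_t>(NarrowIV + NonIV));

  // The two guesses for X in  WideIV + X == Widen(NarrowIV + NonIV).
  int32_t GuessSExt = WideIV + static_cast<int32_t>(NonIV);                        // sext(NonIV)
  int32_t GuessZExt = WideIV + static_cast<int32_t>(static_cast<uint8_t>(NonIV));  // zext(NonIV)

  assert(GuessSExt == Reference);  // the sign-extension guess is the correct X
  assert(GuessZExt != Reference);  // zero-extending -3 yields a different value
  return 0;
}
```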
-const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { +const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {    // Handle the common case of add<nsw/nuw>    const unsigned OpCode = DU.NarrowUse->getOpcode(); @@ -1062,19 +1140,18 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {    if (ExtendOperIdx == 0)      std::swap(lhs, rhs);    const SCEVAddRecExpr *AddRec = -      dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); +      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));    if (!AddRec || AddRec->getLoop() != L)      return nullptr;    return AddRec;  } -/// GetWideRecurrence - Is this instruction potentially interesting for further -/// simplification after widening it's type? In other words, can the -/// extend be safely hoisted out of the loop with SCEV reducing the value to a -/// recurrence on the same loop. If so, return the sign or zero extended -/// recurrence. Otherwise return NULL. -const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { +/// Is this instruction potentially interesting for further simplification after +/// widening it's type? In other words, can the extend be safely hoisted out of +/// the loop with SCEV reducing the value to a recurrence on the same loop. If +/// so, return the sign or zero extended recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) {    if (!SE->isSCEVable(NarrowUse->getType()))      return nullptr; @@ -1097,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {  /// This IV user cannot be widen. Replace this use of the original narrow IV  /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {    DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef          << " for user " << *DU.NarrowUse << "\n"); -  IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); +  IRBuilder<> Builder( +      getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));    Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());    DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);  } @@ -1108,13 +1186,27 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) {  /// If the narrow use is a compare instruction, then widen the compare  //  (and possibly the other operand).  The extend operation is hoisted into the  // loop preheader as far as possible. -bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { +bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {    ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);    if (!Cmp)      return false; -  // Sign of IV user and compare must match. -  if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) +  // We can legally widen the comparison in the following two cases: +  // +  //  - The signedness of the IV extension and comparison match +  // +  //  - The narrow IV is always positive (and thus its sign extension is equal +  //    to its zero extension).  For instance, let's say we're zero extending +  //    %narrow for the following use +  // +  //      icmp slt i32 %narrow, %val   ... (A) +  // +  //    and %narrow is always positive.  
Then +  // +  //      (A) == icmp slt i32 sext(%narrow), sext(%val) +  //          == icmp slt i32 zext(%narrow), sext(%val) + +  if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))      return false;    Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); @@ -1123,20 +1215,21 @@ bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) {    assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");    // Widen the compare instruction. -  IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); +  IRBuilder<> Builder( +      getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));    DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);    // Widen the other operand of the compare, if necessary.    if (CastWidth < IVWidth) { -    Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); +    Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp);      DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);    }    return true;  } -/// WidenIVUse - Determine whether an individual user of the narrow IV can be -/// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { +/// Determine whether an individual user of the narrow IV can be widened. If so, +/// return the wide clone of the user. +Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {    // Stop traversing the def-use chain at inner-loop phis or post-loop phis.    if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { @@ -1145,13 +1238,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {        // After SimplifyCFG most loop exit targets have a single predecessor.        // Otherwise fall back to a truncate within the loop.        if (UsePhi->getNumOperands() != 1) -        truncateIVUse(DU, DT); +        truncateIVUse(DU, DT, LI);        else {          PHINode *WidePhi =            PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",                            UsePhi);          WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); -        IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); +        IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());          Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());          UsePhi->replaceAllUsesWith(Trunc);          DeadInsts.emplace_back(UsePhi); @@ -1200,20 +1293,20 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {    }    // Does this user itself evaluate to a recurrence after widening? -  const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); +  const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse);    if (!WideAddRec) -    WideAddRec = GetExtendedOperandRecurrence(DU); +    WideAddRec = getExtendedOperandRecurrence(DU);    if (!WideAddRec) {      // If use is a loop condition, try to promote the condition instead of      // truncating the IV first. -    if (WidenLoopCompare(DU)) +    if (widenLoopCompare(DU))        return nullptr;      // This user does not evaluate to a recurence after widening, so don't      // follow it. Instead insert a Trunc to kill off the original use,      // eventually isolating the original narrow IV so it can be removed. -    truncateIVUse(DU, DT); +    truncateIVUse(DU, DT, LI);      return nullptr;    }    // Assume block terminators cannot evaluate to a recurrence. 
We can't to @@ -1228,7 +1321,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {        && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))      WideUse = WideInc;    else { -    WideUse = CloneIVUser(DU); +    WideUse = cloneIVUser(DU, WideAddRec);      if (!WideUse)        return nullptr;    } @@ -1248,9 +1341,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {    return WideUse;  } -/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// Add eligible users of NarrowDef to NarrowIVUsers.  ///  void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { +  const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); +  bool NeverNegative = +      SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, +                           SE->getConstant(NarrowSCEV->getType(), 0));    for (User *U : NarrowDef->users()) {      Instruction *NarrowUser = cast<Instruction>(U); @@ -1258,21 +1355,21 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {      if (!Widened.insert(NarrowUser).second)        continue; -    NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); +    NarrowIVUsers.push_back( +        NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative));    }  } -/// CreateWideIV - Process a single induction variable. First use the -/// SCEVExpander to create a wide induction variable that evaluates to the same -/// recurrence as the original narrow IV. Then use a worklist to forward -/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all -/// interesting IV users, the narrow IV will be isolated for removal by -/// DeleteDeadPHIs. +/// Process a single induction variable. First use the SCEVExpander to create a +/// wide induction variable that evaluates to the same recurrence as the +/// original narrow IV. Then use a worklist to forward traverse the narrow IV's +/// def-use chain. After widenIVUse has processed all interesting IV users, the +/// narrow IV will be isolated for removal by DeleteDeadPHIs.  ///  /// It would be simpler to delete uses as they are processed, but we must avoid  /// invalidating SCEV expressions.  /// -PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { +PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {    // Is this phi an induction variable?    const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));    if (!AddRec) @@ -1302,11 +1399,11 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {    // either find an existing phi or materialize a new one. Either way, we    // expect a well-formed cyclic phi-with-increments. i.e. any operand not part    // of the phi-SCC dominates the loop entry. -  Instruction *InsertPt = L->getHeader()->begin(); +  Instruction *InsertPt = &L->getHeader()->front();    WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));    // Remembering the WideIV increment generated by SCEVExpander allows -  // WidenIVUse to reuse it when widening the narrow IV's increment. We don't +  // widenIVUse to reuse it when widening the narrow IV's increment. We don't    // employ a general reuse mechanism because the call above is the only call to    // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.    if (BasicBlock *LatchBlock = L->getLoopLatch()) { @@ -1329,13 +1426,13 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {      // Process a def-use edge. 
This may replace the use, so don't hold a      // use_iterator across it. -    Instruction *WideUse = WidenIVUse(DU, Rewriter); +    Instruction *WideUse = widenIVUse(DU, Rewriter);      // Follow all def-use edges from the previous narrow use.      if (WideUse)        pushNarrowIVUsers(DU.NarrowUse, WideUse); -    // WidenIVUse may have removed the def-use edge. +    // widenIVUse may have removed the def-use edge.      if (DU.NarrowDef->use_empty())        DeadInsts.emplace_back(DU.NarrowDef);    } @@ -1352,38 +1449,38 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {  //===----------------------------------------------------------------------===//  namespace { -  class IndVarSimplifyVisitor : public IVVisitor { -    ScalarEvolution *SE; -    const TargetTransformInfo *TTI; -    PHINode *IVPhi; - -  public: -    WideIVInfo WI; - -    IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, -                          const TargetTransformInfo *TTI, -                          const DominatorTree *DTree) -        : SE(SCEV), TTI(TTI), IVPhi(IV) { -      DT = DTree; -      WI.NarrowIV = IVPhi; -      if (ReduceLiveIVs) -        setSplitOverflowIntrinsics(); -    } +class IndVarSimplifyVisitor : public IVVisitor { +  ScalarEvolution *SE; +  const TargetTransformInfo *TTI; +  PHINode *IVPhi; -    // Implement the interface used by simplifyUsersOfIV. -    void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } -  }; +public: +  WideIVInfo WI; + +  IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, +                        const TargetTransformInfo *TTI, +                        const DominatorTree *DTree) +    : SE(SCEV), TTI(TTI), IVPhi(IV) { +    DT = DTree; +    WI.NarrowIV = IVPhi; +    if (ReduceLiveIVs) +      setSplitOverflowIntrinsics(); +  } + +  // Implement the interface used by simplifyUsersOfIV. +  void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } +};  } -/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV -/// users. Each successive simplification may push more users which may -/// themselves be candidates for simplification. +/// Iteratively perform simplification on a worklist of IV users. Each +/// successive simplification may push more users which may themselves be +/// candidates for simplification.  ///  /// Sign/Zero extend elimination is interleaved with IV simplification.  /// -void IndVarSimplify::SimplifyAndExtend(Loop *L, +void IndVarSimplify::simplifyAndExtend(Loop *L,                                         SCEVExpander &Rewriter, -                                       LPPassManager &LPM) { +                                       LoopInfo *LI) {    SmallVector<WideIVInfo, 8> WideIVs;    SmallVector<PHINode*, 8> LoopPhis; @@ -1400,14 +1497,14 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,      // extension. The first time SCEV attempts to normalize sign/zero extension,      // the result becomes final. So for the most predictable results, we delay      // evaluation of sign/zero extend evaluation until needed, and avoid running -    // other SCEV based analysis prior to SimplifyAndExtend. +    // other SCEV based analysis prior to simplifyAndExtend.      do {        PHINode *CurrIV = LoopPhis.pop_back_val();        // Information about sign/zero extensions of CurrIV.        
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); -      Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); +      Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor);        if (Visitor.WI.WidestNativeType) {          WideIVs.push_back(Visitor.WI); @@ -1416,7 +1513,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,      for (; !WideIVs.empty(); WideIVs.pop_back()) {        WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); -      if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { +      if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {          Changed = true;          LoopPhis.push_back(WidePhi);        } @@ -1425,12 +1522,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,  }  //===----------------------------------------------------------------------===// -//  LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +//  linearFunctionTestReplace and its kin. Rewrite the loop exit condition.  //===----------------------------------------------------------------------===// -/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken -/// count expression can be safely and cheaply expanded into an instruction -/// sequence that can be used by LinearFunctionTestReplace. +/// Return true if this loop's backedge taken count expression can be safely and +/// cheaply expanded into an instruction sequence that can be used by +/// linearFunctionTestReplace.  ///  /// TODO: This fails for pointer-type loop counters with greater than one byte  /// strides, consequently preventing LFTR from running. For the purpose of LFTR @@ -1461,8 +1558,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE,    return true;  } -/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop -/// invariant value to the phi. +/// Return the loop header phi IFF IncV adds a loop invariant value to the phi.  static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {    Instruction *IncI = dyn_cast<Instruction>(IncV);    if (!IncI) @@ -1513,8 +1609,8 @@ static ICmpInst *getLoopTest(Loop *L) {    return dyn_cast<ICmpInst>(BI->getCondition());  } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. +/// linearFunctionTestReplace policy. Return true unless we can show that the +/// current exit test is already sufficiently canonical.  static bool needsLFTR(Loop *L, DominatorTree *DT) {    // Do LFTR to simplify the exit condition to an ICMP.    ICmpInst *Cond = getLoopTest(L); @@ -1574,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,      return false;    // Optimistically handle other instructions. -  for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { -    if (!Visited.insert(*OI).second) +  for (Value *Op : I->operands()) { +    if (!Visited.insert(Op).second)        continue; -    if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) +    if (!hasConcreteDefImpl(Op, Visited, Depth+1))        return false;    }    return true; @@ -1594,8 +1690,8 @@ static bool hasConcreteDef(Value *V) {    return hasConcreteDefImpl(V, Visited, 0);  } -/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to -/// be rewritten) loop exit test. +/// Return true if this IV has any uses other than the (soon to be rewritten) +/// loop exit test.  
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {    int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);    Value *IncV = Phi->getIncomingValue(LatchIdx); @@ -1608,7 +1704,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {    return true;  } -/// FindLoopCounter - Find an affine IV in canonical form. +/// Find an affine IV in canonical form.  ///  /// BECount may be an i8* pointer type. The pointer difference is already  /// valid count without scaling the address stride, so it remains a pointer @@ -1702,8 +1798,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,    return BestPhi;  } -/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that -/// holds the RHS of the new loop test. +/// Help linearFunctionTestReplace by generating a value that holds the RHS of +/// the new loop test.  static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,                             SCEVExpander &Rewriter, ScalarEvolution *SE) {    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); @@ -1785,13 +1881,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,    }  } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable.  This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. +/// This method rewrites the exit condition of the loop to be a canonical != +/// comparison against the incremented loop induction variable.  This pass is +/// able to rewrite the exit tests of any loop where the SCEV analysis can +/// determine a loop-invariant trip count of the loop, which is actually a much +/// broader range than just linear tests.  Value *IndVarSimplify:: -LinearFunctionTestReplace(Loop *L, +linearFunctionTestReplace(Loop *L,                            const SCEV *BackedgeTakenCount,                            PHINode *IndVar,                            SCEVExpander &Rewriter) { @@ -1809,7 +1905,7 @@ LinearFunctionTestReplace(Loop *L,      // This addition may overflow, which is valid as long as the comparison is      // truncated to BackedgeTakenCount->getType().      IVCount = SE->getAddExpr(BackedgeTakenCount, -                             SE->getConstant(BackedgeTakenCount->getType(), 1)); +                             SE->getOne(BackedgeTakenCount->getType()));      // The BackedgeTaken expression contains the number of times that the      // backedge branches to the loop header.  This is one less than the      // number of times the loop executes, so use the incremented indvar. @@ -1847,8 +1943,8 @@ LinearFunctionTestReplace(Loop *L,      const SCEV *ARStep = AR->getStepRecurrence(*SE);      // For constant IVCount, avoid truncation.      if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { -      const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); -      APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); +      const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt(); +      APInt Count = cast<SCEVConstant>(IVCount)->getAPInt();        // Note that the post-inc value of BackedgeTakenCount may have overflowed        // above such that IVCount is now zero.        
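linearFunctionTestReplace, whose body begins above, rewrites the loop exit test into a canonical != comparison of the induction variable against a limit that genLoopLimit derives from the SCEV trip count. A source-level sketch of the shape of the rewrite; here the trip count is simply n, and n is assumed non-negative so the != form terminates exactly like the < form:

```cpp
#include <cassert>

// Before: the exit test is a signed < compare on the IV.
int sumBefore(int n) {            // assumes n >= 0
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += i;
  return s;
}

// After LFTR (conceptually): the test becomes i != limit against a precomputed limit.
int sumAfter(int n) {             // assumes n >= 0
  int s = 0;
  int limit = n;                  // genLoopLimit: start + trip-count * step
  for (int i = 0; i != limit; ++i)
    s += i;
  return s;
}

int main() {
  for (int n = 0; n < 10; ++n)
    assert(sumBefore(n) == sumAfter(n));
  return 0;
}
```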
if (IVCount != BackedgeTakenCount && Count == 0) { @@ -1886,21 +1982,21 @@ LinearFunctionTestReplace(Loop *L,  }  //===----------------------------------------------------------------------===// -//  SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders.  //===----------------------------------------------------------------------===//  /// If there's a single exit block, sink any loop-invariant values that  /// were defined in the preheader but not used inside the loop into the  /// exit block to reduce register pressure in the loop. -void IndVarSimplify::SinkUnusedInvariants(Loop *L) { +void IndVarSimplify::sinkUnusedInvariants(Loop *L) {    BasicBlock *ExitBlock = L->getExitBlock();    if (!ExitBlock) return;    BasicBlock *Preheader = L->getLoopPreheader();    if (!Preheader) return; -  Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); -  BasicBlock::iterator I = Preheader->getTerminator(); +  Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt(); +  BasicBlock::iterator I(Preheader->getTerminator());    while (I != Preheader->begin()) {      --I;      // New instructions were inserted at the end of the preheader. @@ -1920,8 +2016,8 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {      if (isa<DbgInfoIntrinsic>(I))        continue; -    // Skip landingpad instructions. -    if (isa<LandingPadInst>(I)) +    // Skip eh pad instructions. +    if (I->isEHPad())        continue;      // Don't sink alloca: we never want to sink static alloca's out of the @@ -1953,7 +2049,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {        continue;      // Otherwise, sink it to the exit block. -    Instruction *ToMove = I; +    Instruction *ToMove = &*I;      bool Done = false;      if (I != Preheader->begin()) { @@ -1994,7 +2090,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {      return false;    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  SE = &getAnalysis<ScalarEvolution>(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();    TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -2007,7 +2103,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {    // If there are any floating-point recurrences, attempt to    // transform them to use integer recurrences. -  RewriteNonIntegerIVs(L); +  rewriteNonIntegerIVs(L);    const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); @@ -2024,7 +2120,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {    // other expressions involving loop IVs have been evaluated. This helps SCEV    // set no-wrap flags before normalizing sign/zero extension.    Rewriter.disableCanonicalMode(); -  SimplifyAndExtend(L, Rewriter, LPM); +  simplifyAndExtend(L, Rewriter, LI);    // Check to see if this loop has a computable loop-invariant execution count.    // If so, this means that we can compute the final value of any expressions @@ -2034,7 +2130,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {    //    if (ReplaceExitValue != NeverRepl &&        !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) -    RewriteLoopExitValues(L, Rewriter); +    rewriteLoopExitValues(L, Rewriter);    // Eliminate redundant IV cycles.    
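The sinkUnusedInvariants subpass renamed in this hunk has a simple register-pressure motivation: values computed in the preheader but only consumed after the loop should not stay live across every iteration. A hedged source-level sketch with invented names (not taken from the patch):

```cpp
// Illustrative only: `t` is loop-invariant and unused inside the loop, so the
// pass moves it from the preheader into the single exit block.
int before(int *a, int n, int x, int y) {
  int t = x * y;              // computed before the loop, not used inside it
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];              // t must stay live across every iteration
  return sum + t;
}

int after(int *a, int n, int x, int y) {
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];
  int t = x * y;              // sunk to the exit, shortening its live range
  return sum + t;
}
```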
NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); @@ -2054,7 +2150,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {        // explicitly check any assumptions made by SCEV. Brittle.        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);        if (!AR || AR->getLoop()->getLoopPreheader()) -        (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, +        (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar,                                          Rewriter);      }    } @@ -2074,13 +2170,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {    // Loop-invariant instructions in the preheader that aren't used in the    // loop may be sunk below the loop to reduce register pressure. -  SinkUnusedInvariants(L); +  sinkUnusedInvariants(L);    // Clean up dead instructions.    Changed |= DeleteDeadPHIs(L->getHeader(), TLI); +    // Check a post-condition. -  assert(L->isLCSSAForm(*DT) && -         "Indvars did not leave the loop in lcssa form!"); +  assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");    // Verify that LFTR, and any other change have not interfered with SCEV's    // ability to compute trip count. diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cbdacad8f28b..dea61f6ff3d7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -214,8 +214,8 @@ public:      AU.addRequired<LoopInfoWrapperPass>();      AU.addRequiredID(LoopSimplifyID);      AU.addRequiredID(LCSSAID); -    AU.addRequired<ScalarEvolution>(); -    AU.addRequired<BranchProbabilityInfo>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addRequired<BranchProbabilityInfoWrapperPass>();    }    bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -224,8 +224,15 @@ public:  char InductiveRangeCheckElimination::ID = 0;  } -INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", -                "Inductive range check elimination", false, false) +INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", +                      "Inductive range check elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", +                    "Inductive range check elimination", false, false)  const char *InductiveRangeCheck::rangeCheckKindToStr(      InductiveRangeCheck::RangeCheckKind RCK) { @@ -1044,9 +1051,9 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(    auto BBInsertLocation = std::next(Function::iterator(LS.Latch));    RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", -                                        &F, BBInsertLocation); +                                        &F, &*BBInsertLocation);    RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, -                                      BBInsertLocation); +                                      &*BBInsertLocation);    BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());    bool Increasing = LS.IndVarIncreasing; @@ -1399,8 +1406,9 @@ bool 
InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {    LLVMContext &Context = Preheader->getContext();    InductiveRangeCheck::AllocatorTy IRCAlloc;    SmallVector<InductiveRangeCheck *, 16> RangeChecks; -  ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); -  BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); +  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +  BranchProbabilityInfo &BPI = +      getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();    for (auto BBI : L->getBlocks())      if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1130d228acb8..087ce8ac50d4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,15 +18,22 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h"  #include "llvm/Analysis/ConstantFolding.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LazyValueInfo.h"  #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h"  #include "llvm/IR/Metadata.h"  #include "llvm/IR/ValueHandle.h"  #include "llvm/Pass.h" @@ -36,6 +43,8 @@  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <memory>  using namespace llvm;  #define DEBUG_TYPE "jump-threading" @@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold",            cl::desc("Max block size to duplicate for jump threading"),            cl::init(6), cl::Hidden); +static cl::opt<unsigned> +ImplicationSearchThreshold( +  "jump-threading-implication-search-threshold", +  cl::desc("The number of predecessors to search for a stronger " +           "condition to use to thread over a weaker condition"), +  cl::init(3), cl::Hidden); +  namespace {    // These are at global scope so static functions can use them too.    
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; @@ -80,6 +96,9 @@ namespace {    class JumpThreading : public FunctionPass {      TargetLibraryInfo *TLI;      LazyValueInfo *LVI; +    std::unique_ptr<BlockFrequencyInfo> BFI; +    std::unique_ptr<BranchProbabilityInfo> BPI; +    bool HasProfileData;  #ifdef NDEBUG      SmallPtrSet<BasicBlock*, 16> LoopHeaders;  #else @@ -114,9 +133,15 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<LazyValueInfo>();        AU.addPreserved<LazyValueInfo>(); +      AU.addPreserved<GlobalsAAWrapperPass>();        AU.addRequired<TargetLibraryInfoWrapperPass>();      } +    void releaseMemory() override { +      BFI.reset(); +      BPI.reset(); +    } +      void FindLoopHeaders(Function &F);      bool ProcessBlock(BasicBlock *BB);      bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, @@ -134,9 +159,16 @@ namespace {      bool ProcessBranchOnPHI(PHINode *PN);      bool ProcessBranchOnXOR(BinaryOperator *BO); +    bool ProcessImpliedCondition(BasicBlock *BB);      bool SimplifyPartiallyRedundantLoad(LoadInst *LI);      bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); + +  private: +    BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, +                                const char *Suffix); +    void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, +                                      BasicBlock *NewBB, BasicBlock *SuccBB);    };  } @@ -160,11 +192,21 @@ bool JumpThreading::runOnFunction(Function &F) {    DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    LVI = &getAnalysis<LazyValueInfo>(); +  BFI.reset(); +  BPI.reset(); +  // When profile data is available, we need to update edge weights after +  // successful jump threading, which requires both BPI and BFI being available. +  HasProfileData = F.getEntryCount().hasValue(); +  if (HasProfileData) { +    LoopInfo LI{DominatorTree(F)}; +    BPI.reset(new BranchProbabilityInfo(F, LI)); +    BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); +  }    // Remove unreachable blocks from function as they may result in infinite    // loop. We do threading if we found something profitable. Jump threading a    // branch can create other opportunities. If these opportunities form a cycle -  // i.e. if any jump treading is undoing previous threading in the path, then +  // i.e. if any jump threading is undoing previous threading in the path, then    // we will loop forever. We take care of this issue by not jump threading for    // back edges. This works for normal cases but not for unreachable blocks as    // they may have cycle with no back edge. @@ -176,7 +218,7 @@ bool JumpThreading::runOnFunction(Function &F) {    do {      Changed = false;      for (Function::iterator I = F.begin(), E = F.end(); I != E;) { -      BasicBlock *BB = I; +      BasicBlock *BB = &*I;        // Thread all of the branches we can over this block.        while (ProcessBlock(BB))          Changed = true; @@ -239,11 +281,26 @@ bool JumpThreading::runOnFunction(Function &F) {  static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,                                               unsigned Threshold) {    /// Ignore PHI nodes, these will be flattened when duplication happens. 
-  BasicBlock::const_iterator I = BB->getFirstNonPHI(); +  BasicBlock::const_iterator I(BB->getFirstNonPHI());    // FIXME: THREADING will delete values that are just used to compute the    // branch, so they shouldn't count against the duplication cost. +  unsigned Bonus = 0; +  const TerminatorInst *BBTerm = BB->getTerminator(); +  // Threading through a switch statement is particularly profitable.  If this +  // block ends in a switch, decrease its cost to make it more likely to happen. +  if (isa<SwitchInst>(BBTerm)) +    Bonus = 6; + +  // The same holds for indirect branches, but slightly more so. +  if (isa<IndirectBrInst>(BBTerm)) +    Bonus = 8; + +  // Bump the threshold up so the early exit from the loop doesn't skip the +  // terminator-based Size adjustment at the end. +  Threshold += Bonus; +    // Sum up the cost of each instruction until we get to the terminator.  Don't    // include the terminator because the copy won't include it.    unsigned Size = 0; @@ -260,6 +317,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,      if (isa<BitCastInst>(I) && I->getType()->isPointerTy())        continue; +    // Bail out if this instruction gives back a token type, it is not possible +    // to duplicate it if it is used outside this BB. +    if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) +      return ~0U; +      // All other instructions count for at least one unit.      ++Size; @@ -268,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,      // as having cost of 2 total, and if they are a vector intrinsic, we model      // them as having cost 1.      if (const CallInst *CI = dyn_cast<CallInst>(I)) { -      if (CI->cannotDuplicate()) +      if (CI->cannotDuplicate() || CI->isConvergent())          // Blocks with NoDuplicate are modelled as having infinite cost, so they          // are never duplicated.          return ~0U; @@ -279,16 +341,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,      }    } -  // Threading through a switch statement is particularly profitable.  If this -  // block ends in a switch, decrease its cost to make it more likely to happen. -  if (isa<SwitchInst>(I)) -    Size = Size > 6 ? Size-6 : 0; - -  // The same holds for indirect branches, but slightly more so. -  if (isa<IndirectBrInst>(I)) -    Size = Size > 8 ? Size-8 : 0; - -  return Size; +  return Size > Bonus ? Size - Bonus : 0;  }  /// FindLoopHeaders - We do not want jump threading to turn proper loop @@ -669,7 +722,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {    // because now the condition in this block can be threaded through    // predecessors of our predecessor block.    if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { -    if (SinglePred->getTerminator()->getNumSuccessors() == 1 && +    const TerminatorInst *TI = SinglePred->getTerminator(); +    if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&          SinglePred != BB && !hasAddressTakenAndUsed(BB)) {        // If SinglePred was a loop header, BB becomes one.        if (LoopHeaders.erase(SinglePred)) @@ -761,7 +815,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {      // If we're branching on a conditional, LVI might be able to determine      // it's value at the branch instruction.  We only handle comparisons      // against a constant at this time. -    // TODO: This should be extended to handle switches as well.   +    // TODO: This should be extended to handle switches as well.      
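The reshuffled bonus handling in getJumpThreadDuplicationCost earlier in this hunk is easier to see with concrete numbers. The following is a toy model with made-up values, not the pass's actual code; it only mirrors the arithmetic of applying the switch/indirectbr discount at the end while widening the early-exit budget up front:

```cpp
#include <cstdio>

// Toy model of the reworked cost function: the bonus (6 for switch, 8 for
// indirectbr) is added to the threshold before scanning, so the early
// "Size > Threshold" bail-out no longer rejects a block that would end up
// cheap once the terminator-based discount is applied.
unsigned duplicationCost(unsigned numInsts, bool endsInSwitch, unsigned threshold) {
  unsigned bonus = endsInSwitch ? 6 : 0;   // indirect branches would use 8
  threshold += bonus;                      // widen the early-exit budget
  unsigned size = 0;
  for (unsigned i = 0; i != numInsts; ++i) {
    if (size > threshold)                  // early bail-out, as in the pass
      return size;
    ++size;                                // every instruction costs one unit here
  }
  return size > bonus ? size - bonus : 0;  // apply the discount at the end
}

int main() {
  // 8 countable instructions before a switch terminator, default threshold 6:
  // the effective cost is 8 - 6 = 2, and the early exit no longer fires at 7.
  std::printf("%u\n", duplicationCost(8, /*endsInSwitch=*/true, 6));
}
```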
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());      Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));      if (CondBr && CondConst && CondBr->isConditional()) { @@ -829,9 +883,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {        CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))      return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); +  // Search for a stronger dominating condition that can be used to simplify a +  // conditional branch leaving BB. +  if (ProcessImpliedCondition(BB)) +    return true; + +  return false; +} + +bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { +  auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); +  if (!BI || !BI->isConditional()) +    return false; + +  Value *Cond = BI->getCondition(); +  BasicBlock *CurrentBB = BB; +  BasicBlock *CurrentPred = BB->getSinglePredecessor(); +  unsigned Iter = 0; + +  auto &DL = BB->getModule()->getDataLayout(); + +  while (CurrentPred && Iter++ < ImplicationSearchThreshold) { +    auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); +    if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) +      return false; -  // TODO: If we have: "br (X > 0)"  and we have a predecessor where we know -  // "(X == 4)", thread through this block. +    if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { +      BI->getSuccessor(1)->removePredecessor(BB); +      BranchInst::Create(BI->getSuccessor(0), BI); +      BI->eraseFromParent(); +      return true; +    } +    CurrentBB = CurrentPred; +    CurrentPred = CurrentBB->getSinglePredecessor(); +  }    return false;  } @@ -850,10 +935,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {    if (LoadBB->getSinglePredecessor())      return false; -  // If the load is defined in a landing pad, it can't be partially redundant, -  // because the edges between the invoke and the landing pad cannot have other +  // If the load is defined in an EH pad, it can't be partially redundant, +  // because the edges between the invoke and the EH pad cannot have other    // instructions between them. -  if (LoadBB->isLandingPad()) +  if (LoadBB->isEHPad())      return false;    Value *LoadedPtr = LI->getOperand(0); @@ -866,11 +951,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {    // Scan a few instructions up from the load, to see if it is obviously live at    // the entry to its block. -  BasicBlock::iterator BBIt = LI; +  BasicBlock::iterator BBIt(LI);    if (Value *AvailableVal = -        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { -    // If the value if the load is locally available within the block, just use +        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { +    // If the value of the load is locally available within the block, just use      // it.  This frequently occurs for reg2mem'd allocas.      //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; @@ -914,7 +999,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {      // Scan the predecessor to see if the value is available in the pred.      
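ProcessImpliedCondition, added above, walks up through single predecessors (bounded by jump-threading-implication-search-threshold) looking for a dominating conditional branch whose condition implies the one in BB; when it finds one, the weaker branch is replaced with an unconditional jump. A source-level illustration with invented names of the case it targets:

```cpp
// Illustrative only: the predecessor already branched on x > 10, and that edge
// dominates the second test, so along this path x > 0 is known true and the
// inner branch folds to an unconditional jump to its true successor.
int before(int x) {
  if (x > 10) {        // dominating, stronger condition
    if (x > 0)         // implied by x > 10 on this path
      return 1;
    return 2;          // unreachable once the implied branch is folded
  }
  return 0;
}

int after(int x) {
  if (x > 10)
    return 1;          // the implied test and its false edge are gone
  return 0;
}
```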
BBIt = PredBB->end();      AAMDNodes ThisAATags; -    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, +    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, +                                                    DefMaxInstsToScan,                                                      nullptr, &ThisAATags);      if (!PredAvailable) {        OneUnavailablePred = PredBB; @@ -968,8 +1054,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {      }      // Split them out to their own block. -    UnavailablePred = -      SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); +    UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");    }    // If the value isn't available in all predecessors, then there will be @@ -995,7 +1080,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {    // Create a PHI node at the start of the block for the PRE'd load value.    pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);    PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", -                                LoadBB->begin()); +                                &LoadBB->front());    PN->takeName(LI);    PN->setDebugLoc(LI->getDebugLoc()); @@ -1262,7 +1347,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {    // Into:    //  BB':    //    %Y = icmp ne i32 %A, %B -  //    br i1 %Z, ... +  //    br i1 %Y, ...    PredValueInfoTy XorOpValues;    bool isLHS = true; @@ -1387,14 +1472,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,      return false;    } -  // And finally, do it!  Start by factoring the predecessors is needed. +  // And finally, do it!  Start by factoring the predecessors if needed.    BasicBlock *PredBB;    if (PredBBs.size() == 1)      PredBB = PredBBs[0];    else {      DEBUG(dbgs() << "  Factoring out " << PredBBs.size()            << " common predecessors.\n"); -    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); +    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");    }    // And finally, do it! @@ -1415,6 +1500,13 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,                                           BB->getParent(), BB);    NewBB->moveAfter(PredBB); +  // Set the block frequency of NewBB. +  if (HasProfileData) { +    auto NewBBFreq = +        BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); +    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); +  } +    BasicBlock::iterator BI = BB->begin();    for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)      ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1425,7 +1517,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,      Instruction *New = BI->clone();      New->setName(BI->getName());      NewBB->getInstList().push_back(New); -    ValueMapping[BI] = New; +    ValueMapping[&*BI] = New;      // Remap operands to patch up intra-block references.      for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) @@ -1438,7 +1530,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,    // We didn't copy the terminator from BB over to NewBB, because there is now    // an unconditional jump to SuccBB.  Insert the unconditional jump. -  BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); +  BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);    NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());    // Check to see if SuccBB has PHI nodes. 
If so, we need to add entries to the @@ -1475,8 +1567,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,      // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks      // with the two values we know.      SSAUpdate.Initialize(I->getType(), I->getName()); -    SSAUpdate.AddAvailableValue(BB, I); -    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); +    SSAUpdate.AddAvailableValue(BB, &*I); +    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&*I]);      while (!UsesToRename.empty())        SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1499,11 +1591,98 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,    // frequently happens because of phi translation.    SimplifyInstructionsInBlock(NewBB, TLI); +  // Update the edge weight from BB to SuccBB, which should be less than before. +  UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); +    // Threaded an edge!    ++NumThreads;    return true;  } +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is availble, update the frequency of +/// this new block. +BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, +                                           ArrayRef<BasicBlock *> Preds, +                                           const char *Suffix) { +  // Collect the frequencies of all predecessors of BB, which will be used to +  // update the edge weight on BB->SuccBB. +  BlockFrequency PredBBFreq(0); +  if (HasProfileData) +    for (auto Pred : Preds) +      PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + +  BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + +  // Set the block frequency of the newly created PredBB, which is the sum of +  // frequencies of Preds. +  if (HasProfileData) +    BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); +  return PredBB; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, +                                                 BasicBlock *BB, +                                                 BasicBlock *NewBB, +                                                 BasicBlock *SuccBB) { +  if (!HasProfileData) +    return; + +  assert(BFI && BPI && "BFI & BPI should have been created here"); + +  // As the edge from PredBB to BB is deleted, we have to update the block +  // frequency of BB. +  auto BBOrigFreq = BFI->getBlockFreq(BB); +  auto NewBBFreq = BFI->getBlockFreq(NewBB); +  auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); +  auto BBNewFreq = BBOrigFreq - NewBBFreq; +  BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + +  // Collect updated outgoing edges' frequencies from BB and use them to update +  // edge probabilities. +  SmallVector<uint64_t, 4> BBSuccFreq; +  for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { +    auto SuccFreq = (*I == SuccBB) +                        ? 
BB2SuccBBFreq - NewBBFreq +                        : BBOrigFreq * BPI->getEdgeProbability(BB, *I); +    BBSuccFreq.push_back(SuccFreq.getFrequency()); +  } + +  uint64_t MaxBBSuccFreq = +      *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + +  SmallVector<BranchProbability, 4> BBSuccProbs; +  if (MaxBBSuccFreq == 0) +    BBSuccProbs.assign(BBSuccFreq.size(), +                       {1, static_cast<unsigned>(BBSuccFreq.size())}); +  else { +    for (uint64_t Freq : BBSuccFreq) +      BBSuccProbs.push_back( +          BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); +    // Normalize edge probabilities so that they sum up to one. +    BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), +                                              BBSuccProbs.end()); +  } + +  // Update edge probabilities in BPI. +  for (int I = 0, E = BBSuccProbs.size(); I < E; I++) +    BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); + +  if (BBSuccProbs.size() >= 2) { +    SmallVector<uint32_t, 4> Weights; +    for (auto Prob : BBSuccProbs) +      Weights.push_back(Prob.getNumerator()); + +    auto TI = BB->getTerminator(); +    TI->setMetadata( +        LLVMContext::MD_prof, +        MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); +  } +} +  /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch  /// to BB which contains an i1 PHI node and a conditional branch on that PHI.  /// If we can duplicate the contents of BB up into PredBB do so now, this @@ -1530,14 +1709,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,      return false;    } -  // And finally, do it!  Start by factoring the predecessors is needed. +  // And finally, do it!  Start by factoring the predecessors if needed.    BasicBlock *PredBB;    if (PredBBs.size() == 1)      PredBB = PredBBs[0];    else {      DEBUG(dbgs() << "  Factoring out " << PredBBs.size()            << " common predecessors.\n"); -    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); +    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");    }    // Okay, we decided to do this!  Clone all the instructions in BB onto the end @@ -1581,12 +1760,12 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,      if (Value *IV =              SimplifyInstruction(New, BB->getModule()->getDataLayout())) {        delete New; -      ValueMapping[BI] = IV; +      ValueMapping[&*BI] = IV;      } else {        // Otherwise, insert the new instruction into the block.        New->setName(BI->getName()); -      PredBB->getInstList().insert(OldPredBranch, New); -      ValueMapping[BI] = New; +      PredBB->getInstList().insert(OldPredBranch->getIterator(), New); +      ValueMapping[&*BI] = New;      }    } @@ -1628,8 +1807,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,      // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks      // with the two values we know.      
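The frequency bookkeeping in UpdateBlockFreqAndEdgeWeight above is mostly arithmetic, and a small worked example may make it clearer. The numbers below are invented and this is not the pass's code; it only mirrors the update: the flow redirected through NewBB is subtracted from BB and from the BB->SuccBB edge, and the surviving edge flows are renormalized into probabilities:

```cpp
#include <cstdio>

int main() {
  double BBOrigFreq = 100.0;              // BB's frequency before threading
  double NewBBFreq  = 40.0;               // flow that now bypasses BB via NewBB
  double ProbToSucc = 0.7;                // old probability of the BB->SuccBB edge

  double BB2SuccBBFreq = BBOrigFreq * ProbToSucc;    // 70: old flow on BB->SuccBB
  double BBNewFreq     = BBOrigFreq - NewBBFreq;     // 60: BB's updated frequency

  // Updated flow on BB's outgoing edges: the SuccBB edge loses exactly the
  // flow that was redirected through NewBB, the other edge keeps its flow.
  double SuccEdgeFreq  = BB2SuccBBFreq - NewBBFreq;       // 30
  double OtherEdgeFreq = BBOrigFreq * (1 - ProbToSucc);   // 30

  // Renormalizing the surviving edge flows gives the probabilities that end up
  // in BPI and in the rewritten branch_weights metadata.
  std::printf("p(SuccBB)=%.2f p(other)=%.2f\n",
              SuccEdgeFreq / BBNewFreq, OtherEdgeFreq / BBNewFreq);
  return 0;
}
```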
SSAUpdate.Initialize(I->getType(), I->getName()); -    SSAUpdate.AddAvailableValue(BB, I); -    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); +    SSAUpdate.AddAvailableValue(BB, &*I); +    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&*I]);      while (!UsesToRename.empty())        SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 43fc50e588f8..6d70cdc3ade2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -34,10 +34,13 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h" @@ -118,9 +121,12 @@ namespace {        AU.addPreservedID(LoopSimplifyID);        AU.addRequiredID(LCSSAID);        AU.addPreservedID(LCSSAID); -      AU.addRequired<AliasAnalysis>(); -      AU.addPreserved<AliasAnalysis>(); -      AU.addPreserved<ScalarEvolution>(); +      AU.addRequired<AAResultsWrapperPass>(); +      AU.addPreserved<AAResultsWrapperPass>(); +      AU.addPreserved<BasicAAWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>(); +      AU.addPreserved<SCEVAAWrapperPass>();        AU.addRequired<TargetLibraryInfoWrapperPass>();      } @@ -164,9 +170,12 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)  INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)  Pass *llvm::createLICMPass() { return new LICM(); } @@ -183,7 +192,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {    // Get our Loop and Alias Analysis information...    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -264,9 +273,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {      // FIXME: This is really heavy handed. It would be a bit better to use an      // SSAUpdater strategy during promotion that was LCSSA aware and reformed      // it as it went. -    if (Changed) -      formLCSSARecursively(*L, *DT, LI, -                           getAnalysisIfAvailable<ScalarEvolution>()); +    if (Changed) { +      auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); +      formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr); +    }    }    // Check that neither this loop nor its parent have had LCSSA broken. 
LICM is @@ -402,7 +412,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,  }  /// Computes loop safety information, checks loop body & header -/// for the possiblity of may throw exception. +/// for the possibility of may throw exception.  ///  void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {    assert(CurLoop != nullptr && "CurLoop cant be null"); @@ -410,7 +420,7 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {    // Setting default safety values.    SafetyInfo->MayThrow = false;    SafetyInfo->HeaderMayThrow = false; -  // Iterate over header and compute dafety info. +  // Iterate over header and compute safety info.    for (BasicBlock::iterator I = Header->begin(), E = Header->end();         (I != E) && !SafetyInfo->HeaderMayThrow; ++I)      SafetyInfo->HeaderMayThrow |= I->mayThrow(); @@ -445,7 +455,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,      // Don't hoist loads which have may-aliased stores in loop.      uint64_t Size = 0;      if (LI->getType()->isSized()) -      Size = AA->getTypeStoreSize(LI->getType()); +      Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType());      AAMDNodes AAInfo;      LI->getAAMetadata(AAInfo); @@ -457,10 +467,21 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,        return false;      // Handle simple cases by querying alias analysis. -    AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); -    if (Behavior == AliasAnalysis::DoesNotAccessMemory) +    FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); +    if (Behavior == FMRB_DoesNotAccessMemory)        return true;      if (AliasAnalysis::onlyReadsMemory(Behavior)) { +      // A readonly argmemonly function only reads from memory pointed to by +      // it's arguments with arbitrary offsets.  If we can prove there are no +      // writes to this memory in the loop, we can hoist or sink. +      if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) { +        for (Value *Op : CI->arg_operands()) +          if (Op->getType()->isPointerTy() && +              pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize, +                                       AAMDNodes(), CurAST)) +            return false; +        return true; +      }        // If this call only reads from memory and there are no writes to memory        // in the loop, we can hoist or sink the call as appropriate.        bool FoundMod = false; @@ -566,7 +587,7 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I,          if (!OLoop->contains(&PN)) {            PHINode *OpPN =                PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), -                              OInst->getName() + ".lcssa", ExitBlock.begin()); +                              OInst->getName() + ".lcssa", &ExitBlock.front());            for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)              OpPN->addIncoming(OInst, PN.getIncomingBlock(i));            *OI = OpPN; @@ -651,6 +672,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) {    // Move the new node to the Preheader, before its terminator.    I.moveBefore(Preheader->getTerminator()); +  // Metadata can be dependent on the condition we are hoisting above. +  // Conservatively strip all metadata on the instruction. 
+  I.dropUnknownNonDebugMetadata(); +    if (isa<LoadInst>(I)) ++NumMovedLoads;    else if (isa<CallInst>(I)) ++NumMovedCalls;    ++NumHoisted; @@ -730,9 +755,9 @@ namespace {            if (!L->contains(BB)) {              // We need to create an LCSSA PHI node for the incoming value and              // store that. -            PHINode *PN = PHINode::Create( -                I->getType(), PredCache.size(BB), -                I->getName() + ".lcssa", BB->begin()); +            PHINode *PN = +                PHINode::Create(I->getType(), PredCache.size(BB), +                                I->getName() + ".lcssa", &BB->front());              for (BasicBlock *Pred : PredCache.get(BB))                PN->addIncoming(I, Pred);              return PN; @@ -942,7 +967,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,      CurLoop->getUniqueExitBlocks(ExitBlocks);      InsertPts.resize(ExitBlocks.size());      for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) -      InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); +      InsertPts[i] = &*ExitBlocks[i]->getFirstInsertionPt();    }    // We use the SSAUpdater interface to insert phi nodes as required. @@ -973,7 +998,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,    return Changed;  } -/// Simple Analysis hook. Clone alias set info. +/// Simple analysis hook. Clone alias set info.  ///  void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {    AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index c19cd19059b2..1648878b0628 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -16,6 +16,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/TargetFolder.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Function.h" @@ -56,7 +57,7 @@ class LoadCombine : public BasicBlockPass {  public:    LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { -    initializeSROAPass(*PassRegistry::getPassRegistry()); +    initializeLoadCombinePass(*PassRegistry::getPassRegistry());    }    using llvm::Pass::doInitialization; @@ -223,7 +224,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {    if (skipOptnoneFunction(BB))      return false; -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    IRBuilder<true, TargetFolder> TheBuilder(        BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); @@ -262,8 +263,8 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {  void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {    AU.setPreservesCFG(); -  AU.addRequired<AliasAnalysis>(); -  AU.addPreserved<AliasAnalysis>(); +  AU.addRequired<AAResultsWrapperPass>(); +  AU.addPreserved<GlobalsAAWrapperPass>();  }  char LoadCombine::ID = 0; @@ -274,7 +275,8 @@ BasicBlockPass *llvm::createLoadCombinePass() {  INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",                        false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",                      false, false) diff --git 
a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 98b068edf582..bc00ff3f3a42 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/IR/Dominators.h" @@ -35,18 +36,19 @@ namespace {      }      // Possibly eliminate loop L if it is dead. -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; +    bool runOnLoop(Loop *L, LPPassManager &) override;      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<DominatorTreeWrapperPass>();        AU.addRequired<LoopInfoWrapperPass>(); -      AU.addRequired<ScalarEvolution>(); +      AU.addRequired<ScalarEvolutionWrapperPass>();        AU.addRequiredID(LoopSimplifyID);        AU.addRequiredID(LCSSAID); -      AU.addPreserved<ScalarEvolution>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>();        AU.addPreserved<LoopInfoWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>();        AU.addPreservedID(LoopSimplifyID);        AU.addPreservedID(LCSSAID);      } @@ -64,7 +66,7 @@ INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",                  "Delete dead loops", false, false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA)  INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", @@ -130,7 +132,7 @@ bool LoopDeletion::isLoopDead(Loop *L,  /// so could change the halting/non-halting nature of a program.  /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA  /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {    if (skipOptnoneFunction(L))      return false; @@ -169,7 +171,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {    // Don't remove loops for which we can't solve the trip count.    // They could be infinite, in which case we'd be changing program behavior. -  ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); +  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();    const SCEV *S = SE.getMaxBackedgeTakenCount(L);    if (isa<SCEVCouldNotCompute>(S))      return Changed; @@ -242,9 +244,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {    for (BasicBlock *BB : blocks)      loopInfo.removeBlock(BB); -  // The last step is to inform the loop pass manager that we've -  // eliminated this loop. -  LPM.deleteLoopFromQueue(L); +  // The last step is to update LoopInfo now that we've eliminated this loop. 
+  loopInfo.updateUnloop(L);    Changed = true;    ++NumDeleted; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1b9859b57790..3d3cf3e2890b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@  #include "llvm/Support/Debug.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Utils/LoopVersioning.h"  #include <list> @@ -54,6 +55,11 @@ static cl::opt<bool> DistributeNonIfConvertible(               "if-convertible by the loop vectorizer"),      cl::init(false)); +static cl::opt<unsigned> DistributeSCEVCheckThreshold( +    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, +    cl::desc("The maximum number of SCEV checks allowed for Loop " +             "Distribution")); +  STATISTIC(NumLoopsDistributed, "Number of loops distributed");  namespace { @@ -164,9 +170,7 @@ public:      // Delete the instructions backwards, as it has a reduced likelihood of      // having to update as many def-use and use-def chains. -    for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { -      auto *Inst = *I; - +    for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {        if (!Inst->use_empty())          Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));        Inst->eraseFromParent(); @@ -373,7 +377,7 @@ public:    /// \brief This performs the main chunk of the work of cloning the loops for    /// the partitions. -  void cloneLoops(Pass *P) { +  void cloneLoops() {      BasicBlock *OrigPH = L->getLoopPreheader();      // At this point the predecessor of the preheader is either the memcheck      // block or the top part of the original preheader. @@ -547,11 +551,11 @@ public:    MemoryInstructionDependences(        const SmallVectorImpl<Instruction *> &Instructions, -      const SmallVectorImpl<Dependence> &InterestingDependences) { +      const SmallVectorImpl<Dependence> &Dependences) {      Accesses.append(Instructions.begin(), Instructions.end());      DEBUG(dbgs() << "Backward dependences:\n"); -    for (auto &Dep : InterestingDependences) +    for (auto &Dep : Dependences)        if (Dep.isPossiblyBackward()) {          // Note that the designations source and destination follow the program          // order, i.e. source is always first.  (The direction is given by the @@ -567,25 +571,6 @@ private:    AccessesType Accesses;  }; -/// \brief Returns the instructions that use values defined in the loop. -static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { -  SmallVector<Instruction *, 8> UsedOutside; - -  for (auto *Block : L->getBlocks()) -    // FIXME: I believe that this could use copy_if if the Inst reference could -    // be adapted into a pointer. -    for (auto &Inst : *Block) { -      auto Users = Inst.users(); -      if (std::any_of(Users.begin(), Users.end(), [&](User *U) { -            auto *Use = cast<Instruction>(U); -            return !L->contains(Use->getParent()); -          })) -        UsedOutside.push_back(&Inst); -    } - -  return UsedOutside; -} -  /// \brief The pass class.  
class LoopDistribute : public FunctionPass {  public: @@ -597,6 +582,7 @@ public:      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();      LAA = &getAnalysis<LoopAccessAnalysis>();      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();      // Build up a worklist of inner-loops to vectorize. This is necessary as the      // act of distributing a loop creates new loops and can invalidate iterators @@ -619,6 +605,7 @@ public:    }    void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>();      AU.addPreserved<LoopInfoWrapperPass>();      AU.addRequired<LoopAccessAnalysis>(); @@ -629,6 +616,45 @@ public:    static char ID;  private: +  /// \brief Filter out checks between pointers from the same partition. +  /// +  /// \p PtrToPartition contains the partition number for pointers.  Partition +  /// number -1 means that the pointer is used in multiple partitions.  In this +  /// case we can't safely omit the check. +  SmallVector<RuntimePointerChecking::PointerCheck, 4> +  includeOnlyCrossPartitionChecks( +      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, +      const SmallVectorImpl<int> &PtrToPartition, +      const RuntimePointerChecking *RtPtrChecking) { +    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + +    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), +                 [&](const RuntimePointerChecking::PointerCheck &Check) { +                   for (unsigned PtrIdx1 : Check.first->Members) +                     for (unsigned PtrIdx2 : Check.second->Members) +                       // Only include this check if there is a pair of pointers +                       // that require checking and the pointers fall into +                       // separate partitions. +                       // +                       // (Note that we already know at this point that the two +                       // pointer groups need checking but it doesn't follow +                       // that each pair of pointers within the two groups need +                       // checking as well. +                       // +                       // In other words we don't want to include a check just +                       // because there is a pair of pointers between the two +                       // pointer groups that require checks and a different +                       // pair whose pointers fall into different partitions.) +                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && +                           !RuntimePointerChecking::arePointersInSamePartition( +                               PtrToPartition, PtrIdx1, PtrIdx2)) +                         return true; +                   return false; +                 }); + +    return Checks; +  } +    /// \brief Try to distribute an inner-most loop.    
bool processLoop(Loop *L) {      assert(L->empty() && "Only process inner loops."); @@ -655,9 +681,8 @@ private:        DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");        return false;      } -    auto *InterestingDependences = -        LAI.getDepChecker().getInterestingDependences(); -    if (!InterestingDependences || InterestingDependences->empty()) { +    auto *Dependences = LAI.getDepChecker().getDependences(); +    if (!Dependences || Dependences->empty()) {        DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");        return false;      } @@ -685,7 +710,7 @@ private:      // NumUnsafeDependencesActive reaches 0.      const MemoryDepChecker &DepChecker = LAI.getDepChecker();      MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), -                                     *InterestingDependences); +                                     *Dependences);      int NumUnsafeDependencesActive = 0;      for (auto &InstDep : MID) { @@ -735,6 +760,13 @@ private:          return false;      } +    // Don't distribute the loop if we need too many SCEV run-time checks. +    const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); +    if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { +      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); +      return false; +    } +      DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");      // We're done forming the partitions set up the reverse mapping from      // instructions to partitions. @@ -746,20 +778,25 @@ private:      if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())        SplitBlock(PH, PH->getTerminator(), DT, LI); -    // If we need run-time checks to disambiguate pointers are run-time, version -    // the loop now. +    // If we need run-time checks, version the loop now.      auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); -    LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); -    if (LVer.needsRuntimeChecks()) { +    const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); +    const auto &AllChecks = RtPtrChecking->getChecks(); +    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, +                                                  RtPtrChecking); + +    if (!Pred.isAlwaysTrue() || !Checks.empty()) {        DEBUG(dbgs() << "\nPointers:\n"); -      DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); -      LVer.versionLoop(this); -      LVer.addPHINodes(DefsUsedOutside); +      DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); +      LoopVersioning LVer(LAI, L, LI, DT, SE, false); +      LVer.setAliasChecks(std::move(Checks)); +      LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); +      LVer.versionLoop(DefsUsedOutside);      }      // Create identical copies of the original loop for each partition and hook      // them up sequentially. -    Partitions.cloneLoops(this); +    Partitions.cloneLoops();      // Now, we remove the instruction from each loop that don't belong to that      // partition. 
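processLoop above decides whether the unsafe backward dependences justify splitting the loop, and the runtime pointer checks plus the new SCEV-predicate threshold guard the no-alias assumptions that make the split legal. At the source level the transformation looks roughly like the invented example below (illustration only, not from the patch):

```cpp
// Illustrative only: distribution isolates the partition that carries a
// backward dependence from the independent one, so the latter can be
// vectorized.  The run-time checks discussed above are what make the
// assumption that a/b and c/d do not alias safe.
void before(int *a, int *b, int *c, int *d, int n) {
  for (int i = 1; i < n; ++i) {
    a[i] = a[i - 1] + b[i];   // backward dependence: stays sequential
    c[i] = d[i] * 2;          // independent of the statement above
  }
}

void after(int *a, int *b, int *c, int *d, int n) {
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] + b[i];   // sequential partition
  for (int i = 1; i < n; ++i)
    c[i] = d[i] * 2;          // independent partition, now vectorizable
}
```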
@@ -780,6 +817,7 @@ private:    LoopInfo *LI;    LoopAccessAnalysis *LAA;    DominatorTree *DT; +  ScalarEvolution *SE;  };  } // anonymous namespace @@ -790,6 +828,7 @@ INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)  namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a21ca2417ca1..2d577de7c2b8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@  //   void foo(_Complex float *P)  //     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }  // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior.  Negative strides *do* happen for memset/memcpy loops. -//  // This could recognize common matrix multiplies and dot product idioms and  // replace them with calls to BLAS (if linked in??).  // @@ -44,7 +39,10 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/ScalarEvolutionExpander.h"  #include "llvm/Analysis/ScalarEvolutionExpressions.h"  #include "llvm/Analysis/TargetLibraryInfo.h" @@ -67,149 +65,85 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");  namespace { -  class LoopIdiomRecognize; +class LoopIdiomRecognize : public LoopPass { +  Loop *CurLoop; +  AliasAnalysis *AA; +  DominatorTree *DT; +  LoopInfo *LI; +  ScalarEvolution *SE; +  TargetLibraryInfo *TLI; +  const TargetTransformInfo *TTI; +  const DataLayout *DL; + +public: +  static char ID; +  explicit LoopIdiomRecognize() : LoopPass(ID) { +    initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); +  } -  /// This class defines some utility functions for loop idiom recognization. -  class LIRUtil { -  public: -    /// Return true iff the block contains nothing but an uncondition branch -    /// (aka goto instruction). -    static bool isAlmostEmpty(BasicBlock *); +  bool runOnLoop(Loop *L, LPPassManager &LPM) override; + +  /// This transformation requires natural loop information & requires that +  /// loop preheaders be inserted into the CFG. 
+  /// +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<LoopInfoWrapperPass>(); +    AU.addPreserved<LoopInfoWrapperPass>(); +    AU.addRequiredID(LoopSimplifyID); +    AU.addPreservedID(LoopSimplifyID); +    AU.addRequiredID(LCSSAID); +    AU.addPreservedID(LCSSAID); +    AU.addRequired<AAResultsWrapperPass>(); +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addPreserved<ScalarEvolutionWrapperPass>(); +    AU.addPreserved<SCEVAAWrapperPass>(); +    AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addPreserved<DominatorTreeWrapperPass>(); +    AU.addRequired<TargetLibraryInfoWrapperPass>(); +    AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addPreserved<BasicAAWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>(); +  } -    static BranchInst *getBranch(BasicBlock *BB) { -      return dyn_cast<BranchInst>(BB->getTerminator()); -    } +private: +  typedef SmallVector<StoreInst *, 8> StoreList; +  StoreList StoreRefs; -    /// Derive the precondition block (i.e the block that guards the loop -    /// preheader) from the given preheader. -    static BasicBlock *getPrecondBb(BasicBlock *PreHead); -  }; - -  /// This class is to recoginize idioms of population-count conducted in -  /// a noncountable loop. Currently it only recognizes this pattern: -  /// \code -  ///   while(x) {cnt++; ...; x &= x - 1; ...} -  /// \endcode -  class NclPopcountRecognize { -    LoopIdiomRecognize &LIR; -    Loop *CurLoop; -    BasicBlock *PreCondBB; - -    typedef IRBuilder<> IRBuilderTy; - -  public: -    explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); -    bool recognize(); - -  private: -    /// Take a glimpse of the loop to see if we need to go ahead recoginizing -    /// the idiom. -    bool preliminaryScreen(); - -    /// Check if the given conditional branch is based on the comparison -    /// between a variable and zero, and if the variable is non-zero, the -    /// control yields to the loop entry. If the branch matches the behavior, -    /// the variable involved in the comparion is returned. This function will -    /// be called to see if the precondition and postcondition of the loop -    /// are in desirable form. -    Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; - -    /// Return true iff the idiom is detected in the loop. and 1) \p CntInst -    /// is set to the instruction counting the population bit. 2) \p CntPhi -    /// is set to the corresponding phi node. 3) \p Var is set to the value -    /// whose population bits are being counted. -    bool detectIdiom -      (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; - -    /// Insert ctpop intrinsic function and some obviously dead instructions. -    void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); - -    /// Create llvm.ctpop.* intrinsic function. 
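recognizePopcount, declared above, targets the classic bit-clearing loop spelled out in the old NclPopcountRecognize comment. A hedged sketch of the idiom and of what the rewrite amounts to at the source level; the builtin below assumes a GCC/Clang-style compiler and only stands in for the llvm.ctpop intrinsic the pass actually emits:

```cpp
// Illustrative only: the non-countable loop shape the pass looks for, and the
// constant-time form it is conceptually rewritten into.
int popcount_loop(unsigned x) {
  int cnt = 0;
  while (x) {        // trip count depends on the value of x, not on a bound
    ++cnt;
    x &= x - 1;      // clears the lowest set bit each iteration
  }
  return cnt;
}

int popcount_after(unsigned x) {
  return __builtin_popcount(x);   // stands in for a single llvm.ctpop.i32 call
}
```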
-    CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); -  }; - -  class LoopIdiomRecognize : public LoopPass { -    Loop *CurLoop; -    DominatorTree *DT; -    ScalarEvolution *SE; -    TargetLibraryInfo *TLI; -    const TargetTransformInfo *TTI; -  public: -    static char ID; -    explicit LoopIdiomRecognize() : LoopPass(ID) { -      initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); -      DT = nullptr; -      SE = nullptr; -      TLI = nullptr; -      TTI = nullptr; -    } +  /// \name Countable Loop Idiom Handling +  /// @{ -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; -    bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, -                        SmallVectorImpl<BasicBlock*> &ExitBlocks); - -    bool processLoopStore(StoreInst *SI, const SCEV *BECount); -    bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - -    bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, -                                 unsigned StoreAlignment, -                                 Value *SplatValue, Instruction *TheStore, -                                 const SCEVAddRecExpr *Ev, -                                 const SCEV *BECount); -    bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, -                                    const SCEVAddRecExpr *StoreEv, -                                    const SCEVAddRecExpr *LoadEv, -                                    const SCEV *BECount); - -    /// This transformation requires natural loop information & requires that -    /// loop preheaders be inserted into the CFG. -    /// -    void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.addRequired<LoopInfoWrapperPass>(); -      AU.addPreserved<LoopInfoWrapperPass>(); -      AU.addRequiredID(LoopSimplifyID); -      AU.addPreservedID(LoopSimplifyID); -      AU.addRequiredID(LCSSAID); -      AU.addPreservedID(LCSSAID); -      AU.addRequired<AliasAnalysis>(); -      AU.addPreserved<AliasAnalysis>(); -      AU.addRequired<ScalarEvolution>(); -      AU.addPreserved<ScalarEvolution>(); -      AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addRequired<DominatorTreeWrapperPass>(); -      AU.addRequired<TargetLibraryInfoWrapperPass>(); -      AU.addRequired<TargetTransformInfoWrapperPass>(); -    } +  bool runOnCountableLoop(); +  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, +                      SmallVectorImpl<BasicBlock *> &ExitBlocks); -    DominatorTree *getDominatorTree() { -      return DT ? DT -                : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); -    } +  void collectStores(BasicBlock *BB); +  bool isLegalStore(StoreInst *SI); +  bool processLoopStore(StoreInst *SI, const SCEV *BECount); +  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); -    ScalarEvolution *getScalarEvolution() { -      return SE ? 
SE : (SE = &getAnalysis<ScalarEvolution>()); -    } +  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, +                               unsigned StoreAlignment, Value *SplatValue, +                               Instruction *TheStore, const SCEVAddRecExpr *Ev, +                               const SCEV *BECount, bool NegStride); +  bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, +                                  const SCEVAddRecExpr *StoreEv, +                                  const SCEV *BECount, bool NegStride); -    TargetLibraryInfo *getTargetLibraryInfo() { -      if (!TLI) -        TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +  /// @} +  /// \name Noncountable Loop Idiom Handling +  /// @{ -      return TLI; -    } +  bool runOnNoncountableLoop(); -    const TargetTransformInfo *getTargetTransformInfo() { -      return TTI ? TTI -                 : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( -                        *CurLoop->getHeader()->getParent())); -    } +  bool recognizePopcount(); +  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, +                               PHINode *CntPhi, Value *Var); -    Loop *getLoop() const { return CurLoop; } +  /// @} +}; -  private: -    bool runOnNoncountableLoop(); -    bool runOnCountableLoop(); -  }; -} +} // End anonymous namespace.  char LoopIdiomRecognize::ID = 0;  INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", @@ -218,9 +152,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",                      false, false) @@ -242,406 +179,64 @@ static void deleteDeadInstruction(Instruction *I,  //===----------------------------------------------------------------------===//  // -//          Implementation of LIRUtil -// -//===----------------------------------------------------------------------===// - -// This function will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader function is -// "almost" empty such that generated intrinsic functions can be moved across -// the preheader and be placed at the end of the precondition block without -// the concern of breaking data dependence. -bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { -  if (BranchInst *Br = getBranch(BB)) { -    return Br->isUnconditional() && Br == BB->begin(); -  } -  return false; -} - -BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { -  if (BasicBlock *BB = PreHead->getSinglePredecessor()) { -    BranchInst *Br = getBranch(BB); -    return Br && Br->isConditional() ? 
BB : nullptr; -  } -  return nullptr; -} - -//===----------------------------------------------------------------------===// -// -//          Implementation of NclPopcountRecognize +//          Implementation of LoopIdiomRecognize  //  //===----------------------------------------------------------------------===// -NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): -  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { -} - -bool NclPopcountRecognize::preliminaryScreen() { -  const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); -  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) -    return false; - -  // Counting population are usually conducted by few arithmetic instructions. -  // Such instructions can be easilly "absorbed" by vacant slots in a -  // non-compact loop. Therefore, recognizing popcount idiom only makes sense -  // in a compact loop. - -  // Give up if the loop has multiple blocks or multiple backedges. -  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) -    return false; - -  BasicBlock *LoopBody = *(CurLoop->block_begin()); -  if (LoopBody->size() >= 20) { -    // The loop is too big, bail out. -    return false; -  } - -  // It should have a preheader containing nothing but a goto instruction. -  BasicBlock *PreHead = CurLoop->getLoopPreheader(); -  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { +  if (skipOptnoneFunction(L))      return false; -  // It should have a precondition block where the generated popcount instrinsic -  // function will be inserted. -  PreCondBB = LIRUtil::getPrecondBb(PreHead); -  if (!PreCondBB) +  CurLoop = L; +  // If the loop could not be converted to canonical form, it must have an +  // indirectbr in it, just give up. +  if (!L->getLoopPreheader())      return false; -  return true; -} - -Value *NclPopcountRecognize::matchCondition(BranchInst *Br, -                                            BasicBlock *LoopEntry) const { -  if (!Br || !Br->isConditional()) -    return nullptr; - -  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); -  if (!Cond) -    return nullptr; - -  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); -  if (!CmpZero || !CmpZero->isZero()) -    return nullptr; - -  ICmpInst::Predicate Pred = Cond->getPredicate(); -  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || -      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) -    return Cond->getOperand(0); - -  return nullptr; -} - -bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, -                                       PHINode *&CntPhi, -                                       Value *&Var) const { -  // Following code tries to detect this idiom: -  // -  //    if (x0 != 0) -  //      goto loop-exit // the precondition of the loop -  //    cnt0 = init-val; -  //    do { -  //       x1 = phi (x0, x2); -  //       cnt1 = phi(cnt0, cnt2); -  // -  //       cnt2 = cnt1 + 1; -  //        ... -  //       x2 = x1 & (x1 - 1); -  //        ... -  //    } while(x != 0); -  // -  // loop-exit: -  // - -  // step 1: Check to see if the look-back branch match this pattern: -  //    "if (a!=0) goto loop-entry". 
-  BasicBlock *LoopEntry; -  Instruction *DefX2, *CountInst; -  Value *VarX1, *VarX0; -  PHINode *PhiX, *CountPhi; - -  DefX2 = CountInst = nullptr; -  VarX1 = VarX0 = nullptr; -  PhiX = CountPhi = nullptr; -  LoopEntry = *(CurLoop->block_begin()); - -  // step 1: Check if the loop-back branch is in desirable form. -  { -    if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) -      DefX2 = dyn_cast<Instruction>(T); -    else -      return false; -  } - -  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" -  { -    if (!DefX2 || DefX2->getOpcode() != Instruction::And) -      return false; - -    BinaryOperator *SubOneOp; - -    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) -      VarX1 = DefX2->getOperand(1); -    else { -      VarX1 = DefX2->getOperand(0); -      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); -    } -    if (!SubOneOp) -      return false; - -    Instruction *SubInst = cast<Instruction>(SubOneOp); -    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); -    if (!Dec || -        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || -          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { -      return false; -    } -  } - -  // step 3: Check the recurrence of variable X -  { -    PhiX = dyn_cast<PHINode>(VarX1); -    if (!PhiX || -        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { -      return false; -    } -  } - -  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 -  { -    CountInst = nullptr; -    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), -           IterE = LoopEntry->end(); Iter != IterE; Iter++) { -      Instruction *Inst = Iter; -      if (Inst->getOpcode() != Instruction::Add) -        continue; - -      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); -      if (!Inc || !Inc->isOne()) -        continue; - -      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); -      if (!Phi || Phi->getParent() != LoopEntry) -        continue; - -      // Check if the result of the instruction is live of the loop. 
-      bool LiveOutLoop = false; -      for (User *U : Inst->users()) { -        if ((cast<Instruction>(U))->getParent() != LoopEntry) { -          LiveOutLoop = true; break; -        } -      } - -      if (LiveOutLoop) { -        CountInst = Inst; -        CountPhi = Phi; -        break; -      } -    } - -    if (!CountInst) -      return false; -  } - -  // step 5: check if the precondition is in this form: -  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" -  { -    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); -    Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); -    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) -      return false; - -    CntInst = CountInst; -    CntPhi = CountPhi; -    Var = T; -  } - -  return true; -} - -void NclPopcountRecognize::transform(Instruction *CntInst, -                                     PHINode *CntPhi, Value *Var) { - -  ScalarEvolution *SE = LIR.getScalarEvolution(); -  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); -  BasicBlock *PreHead = CurLoop->getLoopPreheader(); -  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); -  const DebugLoc DL = CntInst->getDebugLoc(); - -  // Assuming before transformation, the loop is following: -  //  if (x) // the precondition -  //     do { cnt++; x &= x - 1; } while(x); - -  // Step 1: Insert the ctpop instruction at the end of the precondition block -  IRBuilderTy Builder(PreCondBr); -  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; -  { -    PopCnt = createPopcntIntrinsic(Builder, Var, DL); -    NewCount = PopCntZext = -      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); - -    if (NewCount != PopCnt) -      (cast<Instruction>(NewCount))->setDebugLoc(DL); - -    // TripCnt is exactly the number of iterations the loop has -    TripCnt = NewCount; - -    // If the population counter's initial value is not zero, insert Add Inst. -    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); -    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); -    if (!InitConst || !InitConst->isZero()) { -      NewCount = Builder.CreateAdd(NewCount, CntInitVal); -      (cast<Instruction>(NewCount))->setDebugLoc(DL); -    } -  } - -  // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to -  //   "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic -  //   function would be partial dead code, and downstream passes will drag -  //   it back from the precondition block to the preheader. -  { -    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); - -    Value *Opnd0 = PopCntZext; -    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); -    if (PreCond->getOperand(0) != Var) -      std::swap(Opnd0, Opnd1); - -    ICmpInst *NewPreCond = -      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); -    PreCondBr->setCondition(NewPreCond); - -    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); -  } - -  // Step 3: Note that the population count is exactly the trip count of the -  // loop in question, which enble us to to convert the loop from noncountable -  // loop into a countable one. The benefit is twofold: -  // -  //  - If the loop only counts population, the entire loop become dead after -  //    the transformation. It is lots easier to prove a countable loop dead -  //    than to prove a noncountable one. (In some C dialects, a infite loop -  //    isn't dead even if it computes nothing useful. 
In general, DCE needs -  //    to prove a noncountable loop finite before safely delete it.) -  // -  //  - If the loop also performs something else, it remains alive. -  //    Since it is transformed to countable form, it can be aggressively -  //    optimized by some optimizations which are in general not applicable -  //    to a noncountable loop. -  // -  // After this step, this loop (conceptually) would look like following: -  //   newcnt = __builtin_ctpop(x); -  //   t = newcnt; -  //   if (x) -  //     do { cnt++; x &= x-1; t--) } while (t > 0); -  BasicBlock *Body = *(CurLoop->block_begin()); -  { -    BranchInst *LbBr = LIRUtil::getBranch(Body); -    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); -    Type *Ty = TripCnt->getType(); - -    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); - -    Builder.SetInsertPoint(LbCond); -    Value *Opnd1 = cast<Value>(TcPhi); -    Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); -    Instruction *TcDec = -      cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); - -    TcPhi->addIncoming(TripCnt, PreHead); -    TcPhi->addIncoming(TcDec, Body); - -    CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? -      CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; -    LbCond->setPredicate(Pred); -    LbCond->setOperand(0, TcDec); -    LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); -  } - -  // Step 4: All the references to the original population counter outside -  //  the loop are replaced with the NewCount -- the value returned from -  //  __builtin_ctpop(). -  CntInst->replaceUsesOutsideBlock(NewCount, Body); - -  // step 5: Forget the "non-computable" trip-count SCEV associated with the -  //   loop. The loop would otherwise not be deleted even if it becomes empty. -  SE->forgetLoop(CurLoop); -} - -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, -                                                      Value *Val, DebugLoc DL) { -  Value *Ops[] = { Val }; -  Type *Tys[] = { Val->getType() }; - -  Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); -  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); -  CallInst *CI = IRBuilder.CreateCall(Func, Ops); -  CI->setDebugLoc(DL); - -  return CI; -} - -/// recognize - detect population count idiom in a non-countable loop. If -///   detected, transform the relevant code to popcount intrinsic function -///   call, and return true; otherwise, return false. -bool NclPopcountRecognize::recognize() { - -  if (!LIR.getTargetTransformInfo()) +  // Disable loop idiom recognition if the function's name is a common idiom. 
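// Illustrative sketch (hypothetical code, not part of the patch itself): the
// name check below guards against self-recursion. A libc-style definition such
// as
//
//   void *memset(void *Dst, int C, size_t N) {
//     unsigned char *P = (unsigned char *)Dst;
//     for (size_t I = 0; I != N; ++I)
//       P[I] = (unsigned char)C;     // this fill loop matches the memset idiom
//     return Dst;
//   }
//
// would otherwise have its loop rewritten into a call to memset itself, i.e.
// infinite recursion, so functions named "memset" or "memcpy" are skipped.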
+  StringRef Name = L->getHeader()->getParent()->getName(); +  if (Name == "memset" || Name == "memcpy")      return false; -  LIR.getScalarEvolution(); - -  if (!preliminaryScreen()) -    return false; +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); +  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( +      *CurLoop->getHeader()->getParent()); +  DL = &CurLoop->getHeader()->getModule()->getDataLayout(); -  Instruction *CntInst; -  PHINode *CntPhi; -  Value *Val; -  if (!detectIdiom(CntInst, CntPhi, Val)) -    return false; +  if (SE->hasLoopInvariantBackedgeTakenCount(L)) +    return runOnCountableLoop(); -  transform(CntInst, CntPhi, Val); -  return true; +  return runOnNoncountableLoop();  } -//===----------------------------------------------------------------------===// -// -//          Implementation of LoopIdiomRecognize -// -//===----------------------------------------------------------------------===// -  bool LoopIdiomRecognize::runOnCountableLoop() {    const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);    assert(!isa<SCEVCouldNotCompute>(BECount) && -    "runOnCountableLoop() called on a loop without a predictable" -    "backedge-taken count"); +         "runOnCountableLoop() called on a loop without a predictable" +         "backedge-taken count");    // If this loop executes exactly one time, then it should be peeled, not    // optimized by this pass.    if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) -    if (BECst->getValue()->getValue() == 0) +    if (BECst->getAPInt() == 0)        return false; -  // set DT -  (void)getDominatorTree(); - -  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - -  // set TLI -  (void)getTargetLibraryInfo(); - -  SmallVector<BasicBlock*, 8> ExitBlocks; +  SmallVector<BasicBlock *, 8> ExitBlocks;    CurLoop->getUniqueExitBlocks(ExitBlocks);    DEBUG(dbgs() << "loop-idiom Scanning: F[" -               << CurLoop->getHeader()->getParent()->getName() -               << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); +               << CurLoop->getHeader()->getParent()->getName() << "] Loop %" +               << CurLoop->getHeader()->getName() << "\n");    bool MadeChange = false;    // Scan all the blocks in the loop that are not in subloops.    for (auto *BB : CurLoop->getBlocks()) {      // Ignore blocks in subloops. 
-    if (LI.getLoopFor(BB) != CurLoop) +    if (LI->getLoopFor(BB) != CurLoop)        continue;      MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); @@ -649,41 +244,109 @@ bool LoopIdiomRecognize::runOnCountableLoop() {    return MadeChange;  } -bool LoopIdiomRecognize::runOnNoncountableLoop() { -  NclPopcountRecognize Popcount(*this); -  if (Popcount.recognize()) -    return true; +static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { +  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); +  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && +         "Don't overflow unsigned."); +  return (unsigned)SizeInBits >> 3; +} -  return false; +static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { +  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); +  return ConstStride->getAPInt().getZExtValue();  } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { -  if (skipOptnoneFunction(L)) +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in.  Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { +  // If the value isn't a constant, we can't promote it to being in a constant +  // array.  We could theoretically do a store to an alloca or something, but +  // that doesn't seem worthwhile. +  Constant *C = dyn_cast<Constant>(V); +  if (!C) +    return nullptr; + +  // Only handle simple values that are a power of two bytes in size. +  uint64_t Size = DL->getTypeSizeInBits(V->getType()); +  if (Size == 0 || (Size & 7) || (Size & (Size - 1))) +    return nullptr; + +  // Don't care enough about darwin/ppc to implement this. +  if (DL->isBigEndian()) +    return nullptr; + +  // Convert to size in bytes. +  Size /= 8; + +  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see +  // if the top and bottom are the same (e.g. for vectors and large integers). +  if (Size > 16) +    return nullptr; + +  // If the constant is exactly 16 bytes, just use it. +  if (Size == 16) +    return C; + +  // Otherwise, we'll use an array of the constants. +  unsigned ArraySize = 16 / Size; +  ArrayType *AT = ArrayType::get(V->getType(), ArraySize); +  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); +} + +bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { +  // Don't touch volatile stores. +  if (!SI->isSimple())      return false; -  CurLoop = L; +  Value *StoredVal = SI->getValueOperand(); +  Value *StorePtr = SI->getPointerOperand(); -  // If the loop could not be converted to canonical form, it must have an -  // indirectbr in it, just give up. -  if (!L->getLoopPreheader()) +  // Reject stores that are so large that they overflow an unsigned. +  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); +  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)      return false; -  // Disable loop idiom recognition if the function's name is a common idiom. -  StringRef Name = L->getHeader()->getParent()->getName(); -  if (Name == "memset" || Name == "memcpy") +  // See if the pointer expression is an AddRec like {base,+,1} on the current +  // loop, which indicates a strided store.  
If we have something else, it's a +  // random store we can't handle. +  const SCEVAddRecExpr *StoreEv = +      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); +  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())      return false; -  SE = &getAnalysis<ScalarEvolution>(); -  if (SE->hasLoopInvariantBackedgeTakenCount(L)) -    return runOnCountableLoop(); -  return runOnNoncountableLoop(); +  // Check to see if we have a constant stride. +  if (!isa<SCEVConstant>(StoreEv->getOperand(1))) +    return false; + +  return true; +} + +void LoopIdiomRecognize::collectStores(BasicBlock *BB) { +  StoreRefs.clear(); +  for (Instruction &I : *BB) { +    StoreInst *SI = dyn_cast<StoreInst>(&I); +    if (!SI) +      continue; + +    // Make sure this is a strided store with a constant stride. +    if (!isLegalStore(SI)) +      continue; + +    // Save the store locations. +    StoreRefs.push_back(SI); +  }  }  /// runOnLoopBlock - Process the specified block, which lives in a counted loop  /// with the specified backedge count.  This block is known to be in the current  /// loop and not in any subloops. -bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, -                                     SmallVectorImpl<BasicBlock*> &ExitBlocks) { +bool LoopIdiomRecognize::runOnLoopBlock( +    BasicBlock *BB, const SCEV *BECount, +    SmallVectorImpl<BasicBlock *> &ExitBlocks) {    // We can only promote stores in this block if they are unconditionally    // executed in the loop.  For a block to be unconditionally executed, it has    // to dominate all the exit blocks of the loop.  Verify this now. @@ -692,25 +355,18 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,        return false;    bool MadeChange = false; -  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { -    Instruction *Inst = I++; -    // Look for store instructions, which may be optimized to memset/memcpy. -    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))  { -      WeakVH InstPtr(I); -      if (!processLoopStore(SI, BECount)) continue; -      MadeChange = true; - -      // If processing the store invalidated our iterator, start over from the -      // top of the block. -      if (!InstPtr) -        I = BB->begin(); -      continue; -    } +  // Look for store instructions, which may be optimized to memset/memcpy. +  collectStores(BB); +  for (auto &SI : StoreRefs) +    MadeChange |= processLoopStore(SI, BECount); +  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { +    Instruction *Inst = &*I++;      // Look for memset instructions, which may be optimized to a larger memset. -    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst))  { -      WeakVH InstPtr(I); -      if (!processLoopMemSet(MSI, BECount)) continue; +    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { +      WeakVH InstPtr(&*I); +      if (!processLoopMemSet(MSI, BECount)) +        continue;        MadeChange = true;        // If processing the memset invalidated our iterator, start over from the @@ -724,71 +380,38 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,    return MadeChange;  } -  /// processLoopStore - See if this store can be promoted to a memset or memcpy.  
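/// Illustrative sketches (hypothetical loops, not part of the patch itself),
/// assuming A and B are byte arrays with at least n elements:
///
///   for (i = 0; i < n; ++i) A[i] = 0;       // -> memset(A, 0, n)
///   for (i = 0; i < n; ++i) A[i] = B[i];    // -> memcpy(A, B, n)
///
/// With this change a negative stride is also accepted, e.g.
///
///   for (i = n; i > 0; --i) A[i - 1] = 0;
///
/// in which case the base pointer handed to memset is recomputed as the
/// AddRec start minus BECount * StoreSize (see getStartForNegStride below).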
bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { -  if (!SI->isSimple()) return false; +  assert(SI->isSimple() && "Expected only non-volatile stores.");    Value *StoredVal = SI->getValueOperand();    Value *StorePtr = SI->getPointerOperand(); -  // Reject stores that are so large that they overflow an unsigned. -  auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); -  uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType()); -  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) -    return false; - -  // See if the pointer expression is an AddRec like {base,+,1} on the current -  // loop, which indicates a strided store.  If we have something else, it's a -  // random store we can't handle. -  const SCEVAddRecExpr *StoreEv = -    dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); -  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) -    return false; -    // Check to see if the stride matches the size of the store.  If so, then we    // know that every byte is touched in the loop. -  unsigned StoreSize = (unsigned)SizeInBits >> 3; -  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - -  if (!Stride || StoreSize != Stride->getValue()->getValue()) { -    // TODO: Could also handle negative stride here someday, that will require -    // the validity check in mayLoopAccessLocation to be updated though. -    // Enable this to print exact negative strides. -    if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { -      dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; -      dbgs() << "BB: " << *SI->getParent(); -    } - +  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); +  unsigned Stride = getStoreStride(StoreEv); +  unsigned StoreSize = getStoreSizeInBytes(SI, DL); +  if (StoreSize != Stride && StoreSize != -Stride)      return false; -  } + +  bool NegStride = StoreSize == -Stride;    // See if we can optimize just this store in isolation.    if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), -                              StoredVal, SI, StoreEv, BECount)) +                              StoredVal, SI, StoreEv, BECount, NegStride))      return true; -  // If the stored value is a strided load in the same loop with the same stride -  // this this may be transformable into a memcpy.  This kicks in for stuff like -  //   for (i) A[i] = B[i]; -  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { -    const SCEVAddRecExpr *LoadEv = -      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); -    if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && -        StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) -      if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) -        return true; -  } -  //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; - -  return false; +  // Optimize the store into a memcpy, if it feeds an similarly strided load. +  return processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, BECount, NegStride);  }  /// processLoopMemSet - See if this memset can be promoted to a large memset. -bool LoopIdiomRecognize:: -processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, +                                           const SCEV *BECount) {    // We can only handle non-volatile memsets with a constant size. 
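  // Illustrative sketch (hypothetical code, not part of the patch itself): the
  // shape being promoted is a memset whose size equals the pointer's stride,
  // so consecutive iterations cover adjacent memory exactly once, e.g.
  //
  //   char A[100][100];
  //   for (int i = 0; i < 100; ++i)
  //     memset(A[i], 0, 100);          // -> one memset(A, 0, 100 * 100)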
-  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; +  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) +    return false;    // If we're not allowed to hack on memset, we fail.    if (!TLI->has(LibFunc::memset)) @@ -818,17 +441,16 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {      return false;    return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, -                                 MSI->getAlignment(), MSI->getValue(), -                                 MSI, Ev, BECount); +                                 MSI->getAlignment(), MSI->getValue(), MSI, Ev, +                                 BECount, /*NegStride=*/false);  } -  /// mayLoopAccessLocation - Return true if the specified loop might access the  /// specified pointer location, which is a loop-strided access.  The 'Access'  /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, -                                  Loop *L, const SCEV *BECount, -                                  unsigned StoreSize, AliasAnalysis &AA, +static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, +                                  const SCEV *BECount, unsigned StoreSize, +                                  AliasAnalysis &AA,                                    Instruction *IgnoredStore) {    // Get the location that may be stored across the loop.  Since the access is    // strided positively through memory, we say that the modified location starts @@ -838,7 +460,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,    // If the loop iterates a fixed number of times, we can refine the access size    // to be exactly the size of the memset, which is (BECount+1)*StoreSize    if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) -    AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; +    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;    // TODO: For this to be really effective, we have to dive into the pointer    // operand in the store.  Store to &A[i] of 100 will always return may alias @@ -849,59 +471,31 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,    for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;         ++BI)      for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) -      if (&*I != IgnoredStore && -          (AA.getModRefInfo(I, StoreLoc) & Access)) +      if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access))          return true;    return false;  } -/// getMemSetPatternValue - If a strided store of the specified value is safe to -/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should -/// be passed in.  Otherwise, return null. -/// -/// Note that we don't ever attempt to use memset_pattern8 or 4, because these -/// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { -  // If the value isn't a constant, we can't promote it to being in a constant -  // array.  We could theoretically do a store to an alloca or something, but -  // that doesn't seem worthwhile. -  Constant *C = dyn_cast<Constant>(V); -  if (!C) return nullptr; - -  // Only handle simple values that are a power of two bytes in size. 
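// Illustrative sketch (hypothetical code, not part of the patch itself): for a
// store whose value is constant but not byte-splattable, e.g.
//
//   for (i = 0; i < n; ++i) A[i] = 0x01020304;   // A is an array of int
//
// the 4-byte constant is replicated into a 16-byte ConstantArray and, on
// targets whose runtime provides it (e.g. Darwin), the loop is emitted as a
// memset_pattern16 call instead of a plain memset.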
-  uint64_t Size = DL.getTypeSizeInBits(V->getType()); -  if (Size == 0 || (Size & 7) || (Size & (Size-1))) -    return nullptr; - -  // Don't care enough about darwin/ppc to implement this. -  if (DL.isBigEndian()) -    return nullptr; - -  // Convert to size in bytes. -  Size /= 8; - -  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see -  // if the top and bottom are the same (e.g. for vectors and large integers). -  if (Size > 16) return nullptr; - -  // If the constant is exactly 16 bytes, just use it. -  if (Size == 16) return C; - -  // Otherwise, we'll use an array of the constants. -  unsigned ArraySize = 16/Size; -  ArrayType *AT = ArrayType::get(V->getType(), ArraySize); -  return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +// If we have a negative stride, Start refers to the end of the memory location +// we're trying to memset.  Therefore, we need to recompute the base pointer, +// which is just Start - BECount*Size. +static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, +                                        Type *IntPtr, unsigned StoreSize, +                                        ScalarEvolution *SE) { +  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); +  if (StoreSize != 1) +    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), +                           SCEV::FlagNUW); +  return SE->getMinusSCEV(Start, Index);  } -  /// processLoopStridedStore - We see a strided store of some value.  If we can  /// transform this into a memset or memset_pattern in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStridedStore(Value *DestPtr, unsigned StoreSize, -                        unsigned StoreAlignment, Value *StoredVal, -                        Instruction *TheStore, const SCEVAddRecExpr *Ev, -                        const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopStridedStore( +    Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, +    Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, +    const SCEV *BECount, bool NegStride) {    // If the stored value is a byte-wise value (like i32 -1), then it may be    // turned into a memset of i8 -1, assuming that all the consecutive bytes @@ -909,7 +503,6 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,    // but it can be turned into memset_pattern if the target supports it.    Value *SplatValue = isBytewiseValue(StoredVal);    Constant *PatternValue = nullptr; -  auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();    unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();    // If we're allowed to form a memset, and the stored value would be acceptable @@ -936,9 +529,15 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,    // header.  This allows us to insert code for it in the preheader.    BasicBlock *Preheader = CurLoop->getLoopPreheader();    IRBuilder<> Builder(Preheader->getTerminator()); -  SCEVExpander Expander(*SE, DL, "loop-idiom"); +  SCEVExpander Expander(*SE, *DL, "loop-idiom");    Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); +  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + +  const SCEV *Start = Ev->getStart(); +  // Handle negative strided loops. +  if (NegStride) +    Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);    // Okay, we have a strided store "p[i]" of a splattable value.  We can turn    // this into a memset in the loop preheader now if we want.  
However, this @@ -946,12 +545,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,    // or write to the aliased location.  Check for any overlap by generating the    // base pointer and checking the region.    Value *BasePtr = -    Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, -                           Preheader->getTerminator()); - -  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, -                            CurLoop, BECount, -                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { +      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); +  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, +                            *AA, TheStore)) {      Expander.clear();      // If we generated new code for the base pointer, clean up.      RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -962,36 +558,30 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,    // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to    // pointer size if it isn't already. -  Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);    BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); -  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), -                                         SCEV::FlagNUW); +  const SCEV *NumBytesS = +      SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);    if (StoreSize != 1) {      NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),                                 SCEV::FlagNUW);    }    Value *NumBytes = -    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); +      Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());    CallInst *NewCall;    if (SplatValue) { -    NewCall = Builder.CreateMemSet(BasePtr, -                                   SplatValue, -                                   NumBytes, -                                   StoreAlignment); +    NewCall = +        Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);    } else {      // Everything is emitted in default address space      Type *Int8PtrTy = DestInt8PtrTy; -    Module *M = TheStore->getParent()->getParent()->getParent(); -    Value *MSP = M->getOrInsertFunction("memset_pattern16", -                                        Builder.getVoidTy(), -                                        Int8PtrTy, -                                        Int8PtrTy, -                                        IntPtr, -                                        (void*)nullptr); +    Module *M = TheStore->getModule(); +    Value *MSP = +        M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), +                               Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);      // Otherwise we should form a memset_pattern16.  PatternValue is known to be      // an constant array of 16-bytes.  Plop the value into a mergable global. @@ -1015,26 +605,47 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,    return true;  } -/// processLoopStoreOfLoopLoad - We see a strided store whose value is a -/// same-strided load. 
-bool LoopIdiomRecognize:: -processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, -                           const SCEVAddRecExpr *StoreEv, -                           const SCEVAddRecExpr *LoadEv, -                           const SCEV *BECount) { +/// If the stored value is a strided load in the same loop with the same stride +/// this may be transformable into a memcpy.  This kicks in for stuff like +///   for (i) A[i] = B[i]; +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( +    StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv, +    const SCEV *BECount, bool NegStride) {    // If we're not allowed to form memcpy, we fail.    if (!TLI->has(LibFunc::memcpy))      return false; -  LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); +  // The store must be feeding a non-volatile load. +  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); +  if (!LI || !LI->isSimple()) +    return false; + +  // See if the pointer expression is an AddRec like {base,+,1} on the current +  // loop, which indicates a strided load.  If we have something else, it's a +  // random load we can't handle. +  const SCEVAddRecExpr *LoadEv = +      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); +  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) +    return false; + +  // The store and load must share the same stride. +  if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) +    return false;    // The trip count of the loop and the base pointer of the addrec SCEV is    // guaranteed to be loop invariant, which means that it should dominate the    // header.  This allows us to insert code for it in the preheader.    BasicBlock *Preheader = CurLoop->getLoopPreheader();    IRBuilder<> Builder(Preheader->getTerminator()); -  const DataLayout &DL = Preheader->getModule()->getDataLayout(); -  SCEVExpander Expander(*SE, DL, "loop-idiom"); +  SCEVExpander Expander(*SE, *DL, "loop-idiom"); + +  const SCEV *StrStart = StoreEv->getStart(); +  unsigned StrAS = SI->getPointerAddressSpace(); +  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + +  // Handle negative strided loops. +  if (NegStride) +    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);    // Okay, we have a strided store "p[i]" of a loaded value.  We can turn    // this into a memcpy in the loop preheader now if we want.  However, this @@ -1042,29 +653,31 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,    // or write the memory region we're storing to.  This includes the load that    // feeds the stores.  Check for an alias by generating the base address and    // checking everything. -  Value *StoreBasePtr = -    Expander.expandCodeFor(StoreEv->getStart(), -                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()), -                           Preheader->getTerminator()); - -  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, -                            CurLoop, BECount, StoreSize, -                            getAnalysis<AliasAnalysis>(), SI)) { +  Value *StoreBasePtr = Expander.expandCodeFor( +      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + +  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, +                            StoreSize, *AA, SI)) {      Expander.clear();      // If we generated new code for the base pointer, clean up.      
RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);      return false;    } +  const SCEV *LdStart = LoadEv->getStart(); +  unsigned LdAS = LI->getPointerAddressSpace(); + +  // Handle negative strided loops. +  if (NegStride) +    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); +    // For a memcpy, we have to make sure that the input array is not being    // mutated by the loop. -  Value *LoadBasePtr = -    Expander.expandCodeFor(LoadEv->getStart(), -                           Builder.getInt8PtrTy(LI->getPointerAddressSpace()), -                           Preheader->getTerminator()); +  Value *LoadBasePtr = Expander.expandCodeFor( +      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); -  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, -                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) { +  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, +                            *AA, SI)) {      Expander.clear();      // If we generated new code for the base pointer, clean up.      RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1074,34 +687,368 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,    // Okay, everything is safe, we can transform this! -    // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to    // pointer size if it isn't already. -  Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());    BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); -  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), -                                         SCEV::FlagNUW); +  const SCEV *NumBytesS = +      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);    if (StoreSize != 1)      NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),                                 SCEV::FlagNUW);    Value *NumBytes = -    Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); +      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());    CallInst *NewCall = -    Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, -                         std::min(SI->getAlignment(), LI->getAlignment())); +      Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, +                           std::min(SI->getAlignment(), LI->getAlignment()));    NewCall->setDebugLoc(SI->getDebugLoc());    DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"                 << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"                 << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - -  // Okay, the memset has been formed.  Zap the original store and anything that +  // Okay, the memcpy has been formed.  Zap the original store and anything that    // feeds into it.    deleteDeadInstruction(SI, TLI);    ++NumMemCpy;    return true;  } + +bool LoopIdiomRecognize::runOnNoncountableLoop() { +  return recognizePopcount(); +} + +/// Check if the given conditional branch is based on the comparison between +/// a variable and zero, and if the variable is non-zero, the control yields to +/// the loop entry. If the branch matches the behavior, the variable involved +/// in the comparion is returned. This function will be called to see if the +/// precondition and postcondition of the loop are in desirable form. 
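/// For example (illustrative, not part of the patch itself), given
///
///   %cmp = icmp ne i32 %x, 0
///   br i1 %cmp, label %loop.body, label %exit
///
/// and LoopEntry == %loop.body, this returns %x; it returns null when the
/// compare is not against zero or the non-zero edge does not enter the loop.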
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { +  if (!BI || !BI->isConditional()) +    return nullptr; + +  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); +  if (!Cond) +    return nullptr; + +  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); +  if (!CmpZero || !CmpZero->isZero()) +    return nullptr; + +  ICmpInst::Predicate Pred = Cond->getPredicate(); +  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || +      (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) +    return Cond->getOperand(0); + +  return nullptr; +} + +/// Return true iff the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting the population bit. +/// 2) \p CntPhi is set to the corresponding phi node. +/// 3) \p Var is set to the value whose population bits are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +///    if (x0 != 0) +///      goto loop-exit // the precondition of the loop +///    cnt0 = init-val; +///    do { +///       x1 = phi (x0, x2); +///       cnt1 = phi(cnt0, cnt2); +/// +///       cnt2 = cnt1 + 1; +///        ... +///       x2 = x1 & (x1 - 1); +///        ... +///    } while(x != 0); +/// +/// loop-exit: +/// \endcode +static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, +                                Instruction *&CntInst, PHINode *&CntPhi, +                                Value *&Var) { +  // step 1: Check to see if the look-back branch match this pattern: +  //    "if (a!=0) goto loop-entry". +  BasicBlock *LoopEntry; +  Instruction *DefX2, *CountInst; +  Value *VarX1, *VarX0; +  PHINode *PhiX, *CountPhi; + +  DefX2 = CountInst = nullptr; +  VarX1 = VarX0 = nullptr; +  PhiX = CountPhi = nullptr; +  LoopEntry = *(CurLoop->block_begin()); + +  // step 1: Check if the loop-back branch is in desirable form. 
+  { +    if (Value *T = matchCondition( +            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) +      DefX2 = dyn_cast<Instruction>(T); +    else +      return false; +  } + +  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" +  { +    if (!DefX2 || DefX2->getOpcode() != Instruction::And) +      return false; + +    BinaryOperator *SubOneOp; + +    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) +      VarX1 = DefX2->getOperand(1); +    else { +      VarX1 = DefX2->getOperand(0); +      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); +    } +    if (!SubOneOp) +      return false; + +    Instruction *SubInst = cast<Instruction>(SubOneOp); +    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); +    if (!Dec || +        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || +          (SubInst->getOpcode() == Instruction::Add && +           Dec->isAllOnesValue()))) { +      return false; +    } +  } + +  // step 3: Check the recurrence of variable X +  { +    PhiX = dyn_cast<PHINode>(VarX1); +    if (!PhiX || +        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { +      return false; +    } +  } + +  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 +  { +    CountInst = nullptr; +    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), +                              IterE = LoopEntry->end(); +         Iter != IterE; Iter++) { +      Instruction *Inst = &*Iter; +      if (Inst->getOpcode() != Instruction::Add) +        continue; + +      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); +      if (!Inc || !Inc->isOne()) +        continue; + +      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); +      if (!Phi || Phi->getParent() != LoopEntry) +        continue; + +      // Check if the result of the instruction is live of the loop. +      bool LiveOutLoop = false; +      for (User *U : Inst->users()) { +        if ((cast<Instruction>(U))->getParent() != LoopEntry) { +          LiveOutLoop = true; +          break; +        } +      } + +      if (LiveOutLoop) { +        CountInst = Inst; +        CountPhi = Phi; +        break; +      } +    } + +    if (!CountInst) +      return false; +  } + +  // step 5: check if the precondition is in this form: +  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" +  { +    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); +    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); +    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) +      return false; + +    CntInst = CountInst; +    CntPhi = CountPhi; +    Var = T; +  } + +  return true; +} + +/// Recognizes a population count idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the popcount intrinsic +/// function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizePopcount() { +  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) +    return false; + +  // Counting population are usually conducted by few arithmetic instructions. +  // Such instructions can be easily "absorbed" by vacant slots in a +  // non-compact loop. Therefore, recognizing popcount idiom only makes sense +  // in a compact loop. + +  // Give up if the loop has multiple blocks or multiple backedges. 
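  // Illustrative sketch (hypothetical code, not part of the patch itself): the
  // compact, single-block loop being matched is essentially
  //
  //   int popcount(unsigned x) {
  //     int cnt = 0;
  //     while (x) {       // the precondition "if (x)" guards the rotated loop
  //       ++cnt;
  //       x &= x - 1;     // clears the lowest set bit each iteration
  //     }
  //     return cnt;
  //   }
  //
  // which is rewritten to use the llvm.ctpop intrinsic plus a countable loop.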
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) +    return false; + +  BasicBlock *LoopBody = *(CurLoop->block_begin()); +  if (LoopBody->size() >= 20) { +    // The loop is too big, bail out. +    return false; +  } + +  // It should have a preheader containing nothing but an unconditional branch. +  BasicBlock *PH = CurLoop->getLoopPreheader(); +  if (!PH) +    return false; +  if (&PH->front() != PH->getTerminator()) +    return false; +  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); +  if (!EntryBI || EntryBI->isConditional()) +    return false; + +  // It should have a precondition block where the generated popcount instrinsic +  // function can be inserted. +  auto *PreCondBB = PH->getSinglePredecessor(); +  if (!PreCondBB) +    return false; +  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); +  if (!PreCondBI || PreCondBI->isUnconditional()) +    return false; + +  Instruction *CntInst; +  PHINode *CntPhi; +  Value *Val; +  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) +    return false; + +  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); +  return true; +} + +static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, +                                       DebugLoc DL) { +  Value *Ops[] = {Val}; +  Type *Tys[] = {Val->getType()}; + +  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); +  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); +  CallInst *CI = IRBuilder.CreateCall(Func, Ops); +  CI->setDebugLoc(DL); + +  return CI; +} + +void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, +                                                 Instruction *CntInst, +                                                 PHINode *CntPhi, Value *Var) { +  BasicBlock *PreHead = CurLoop->getLoopPreheader(); +  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); +  const DebugLoc DL = CntInst->getDebugLoc(); + +  // Assuming before transformation, the loop is following: +  //  if (x) // the precondition +  //     do { cnt++; x &= x - 1; } while(x); + +  // Step 1: Insert the ctpop instruction at the end of the precondition block +  IRBuilder<> Builder(PreCondBr); +  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; +  { +    PopCnt = createPopcntIntrinsic(Builder, Var, DL); +    NewCount = PopCntZext = +        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); + +    if (NewCount != PopCnt) +      (cast<Instruction>(NewCount))->setDebugLoc(DL); + +    // TripCnt is exactly the number of iterations the loop has +    TripCnt = NewCount; + +    // If the population counter's initial value is not zero, insert Add Inst. +    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); +    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); +    if (!InitConst || !InitConst->isZero()) { +      NewCount = Builder.CreateAdd(NewCount, CntInitVal); +      (cast<Instruction>(NewCount))->setDebugLoc(DL); +    } +  } + +  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to +  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic +  //   function would be partial dead code, and downstream passes will drag +  //   it back from the precondition block to the preheader. 
+  { +    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); + +    Value *Opnd0 = PopCntZext; +    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); +    if (PreCond->getOperand(0) != Var) +      std::swap(Opnd0, Opnd1); + +    ICmpInst *NewPreCond = cast<ICmpInst>( +        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); +    PreCondBr->setCondition(NewPreCond); + +    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); +  } + +  // Step 3: Note that the population count is exactly the trip count of the +  // loop in question, which enable us to to convert the loop from noncountable +  // loop into a countable one. The benefit is twofold: +  // +  //  - If the loop only counts population, the entire loop becomes dead after +  //    the transformation. It is a lot easier to prove a countable loop dead +  //    than to prove a noncountable one. (In some C dialects, an infinite loop +  //    isn't dead even if it computes nothing useful. In general, DCE needs +  //    to prove a noncountable loop finite before safely delete it.) +  // +  //  - If the loop also performs something else, it remains alive. +  //    Since it is transformed to countable form, it can be aggressively +  //    optimized by some optimizations which are in general not applicable +  //    to a noncountable loop. +  // +  // After this step, this loop (conceptually) would look like following: +  //   newcnt = __builtin_ctpop(x); +  //   t = newcnt; +  //   if (x) +  //     do { cnt++; x &= x-1; t--) } while (t > 0); +  BasicBlock *Body = *(CurLoop->block_begin()); +  { +    auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); +    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); +    Type *Ty = TripCnt->getType(); + +    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + +    Builder.SetInsertPoint(LbCond); +    Instruction *TcDec = cast<Instruction>( +        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), +                          "tcdec", false, true)); + +    TcPhi->addIncoming(TripCnt, PreHead); +    TcPhi->addIncoming(TcDec, Body); + +    CmpInst::Predicate Pred = +        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; +    LbCond->setPredicate(Pred); +    LbCond->setOperand(0, TcDec); +    LbCond->setOperand(1, ConstantInt::get(Ty, 0)); +  } + +  // Step 4: All the references to the original population counter outside +  //  the loop are replaced with the NewCount -- the value returned from +  //  __builtin_ctpop(). +  CntInst->replaceUsesOutsideBlock(NewCount, Body); + +  // step 5: Forget the "non-computable" trip-count SCEV associated with the +  //   loop. The loop would otherwise not be deleted even if it becomes empty. +  SE->forgetLoop(CurLoop); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index e12502654751..b4102fe9ba34 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace {        AU.addRequiredID(LoopSimplifyID);        AU.addPreservedID(LoopSimplifyID);        AU.addPreservedID(LCSSAID); -      AU.addPreserved<ScalarEvolution>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>();        AU.addRequired<TargetLibraryInfoWrapperPass>();      }    }; @@ -112,7 +112,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {        // Simplify instructions in the current basic block.        
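        // The explicit "&*" added below reflects the iterator changes in this
        // import: a BasicBlock::iterator no longer converts implicitly to
        // Instruction *, so the element is dereferenced and its address taken
        // while the iterator is advanced separately, e.g.:
        //
        //   Instruction *I = &*BI++;  // pointer to *BI, then advance BI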
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { -        Instruction *I = BI++; +        Instruction *I = &*BI++;          // The first time through the loop ToSimplify is empty and we try to          // simplify all instructions. On later iterations ToSimplify is not diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9d7e57ffebac..4295235a3f36 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -99,7 +99,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,          return false;        if (St && !St->isSimple())          return false; -      MemInstr.push_back(I); +      MemInstr.push_back(&*I);      }    } @@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,      }    } -  // We don't have a DepMatrix to check legality return false +  // We don't have a DepMatrix to check legality return false.    if (DepMatrix.size() == 0)      return false;    return true; @@ -331,9 +331,9 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {  class LoopInterchangeLegality {  public:    LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, -                          LoopInterchange *Pass) -      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), -        InnerLoopHasReduction(false) {} +                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) +      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), +        PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}    /// Check if the loops can be interchanged.    bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -357,9 +357,10 @@ private:    Loop *OuterLoop;    Loop *InnerLoop; -  /// Scev analysis.    ScalarEvolution *SE; -  LoopInterchange *CurrentPass; +  LoopInfo *LI; +  DominatorTree *DT; +  bool PreserveLCSSA;    bool InnerLoopHasReduction;  }; @@ -371,7 +372,7 @@ public:    LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)        : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} -  /// Check if the loop interchange is profitable +  /// Check if the loop interchange is profitable.    bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,                      CharMatrix &DepMatrix); @@ -385,12 +386,12 @@ private:    ScalarEvolution *SE;  }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop.  class LoopInterchangeTransform {  public:    LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,                             LoopInfo *LI, DominatorTree *DT, -                           LoopInterchange *Pass, BasicBlock *LoopNestExit, +                           BasicBlock *LoopNestExit,                             bool InnerLoopContainsReductions)        : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),          LoopExit(LoopNestExit), @@ -424,21 +425,22 @@ private:    bool InnerLoopHasReduction;  }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass.  
struct LoopInterchange : public FunctionPass {    static char ID;    ScalarEvolution *SE;    LoopInfo *LI;    DependenceAnalysis *DA;    DominatorTree *DT; +  bool PreserveLCSSA;    LoopInterchange()        : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {      initializeLoopInterchangePass(*PassRegistry::getPassRegistry());    }    void getAnalysisUsage(AnalysisUsage &AU) const override { -    AU.addRequired<ScalarEvolution>(); -    AU.addRequired<AliasAnalysis>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addRequired<AAResultsWrapperPass>();      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>();      AU.addRequired<DependenceAnalysis>(); @@ -447,11 +449,13 @@ struct LoopInterchange : public FunctionPass {    }    bool runOnFunction(Function &F) override { -    SE = &getAnalysis<ScalarEvolution>(); +    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();      DA = &getAnalysis<DependenceAnalysis>();      auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();      DT = DTWP ? &DTWP->getDomTree() : nullptr; +    PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); +      // Build up a worklist of loop pairs to analyze.      SmallVector<LoopVector, 8> Worklist; @@ -489,7 +493,7 @@ struct LoopInterchange : public FunctionPass {    unsigned selectLoopForInterchange(LoopVector LoopList) {      // TODO: Add a better heuristic to select the loop to be interchanged based -    // on the dependece matrix. Currently we select the innermost loop. +    // on the dependence matrix. Currently we select the innermost loop.      return LoopList.size() - 1;    } @@ -544,7 +548,7 @@ struct LoopInterchange : public FunctionPass {      }      unsigned SelecLoopId = selectLoopForInterchange(LoopList); -    // Move the selected loop outwards to the best posible position. +    // Move the selected loop outwards to the best possible position.      for (unsigned i = SelecLoopId; i > 0; i--) {        bool Interchanged =            processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); @@ -574,7 +578,8 @@ struct LoopInterchange : public FunctionPass {      Loop *InnerLoop = LoopList[InnerLoopId];      Loop *OuterLoop = LoopList[OuterLoopId]; -    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); +    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, +                                PreserveLCSSA);      if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {        DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");        return false; @@ -586,7 +591,7 @@ struct LoopInterchange : public FunctionPass {        return false;      } -    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, +    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,                                   LoopNestExit, LIL.hasInnerLoopReduction());      LIT.transform();      DEBUG(dbgs() << "Loops interchanged\n"); @@ -655,7 +660,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {    DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");    // We do not have any basic block in between now make sure the outer header -  // and outer loop latch doesnt contain any unsafe instructions. +  // and outer loop latch doesn't contain any unsafe instructions.    
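Dependence legality is the other half of what canInterchangeLoops() establishes alongside structural checks like this one; a hedged source-level example of a dependence that forbids the swap is shown below (not from the patch).

    // Illustrative only: interchanging these loops would change the result,
    // because iteration (i, j) reads the value written by iteration (i-1, j+1);
    // the dependence has direction (<, >), which interchange would violate.
    #include <cstddef>

    constexpr std::size_t N = 64;

    void illegalToInterchange(double (&A)[N][N]) {
      for (std::size_t i = 1; i < N; ++i)
        for (std::size_t j = 0; j + 1 < N; ++j)
          A[i][j] = A[i - 1][j + 1];
    }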
if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||        containsUnsafeInstructionsInLatch(OuterLoopLatch))      return false; @@ -698,9 +703,9 @@ bool LoopInterchangeLegality::findInductionAndReductions(      return false;    for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {      RecurrenceDescriptor RD; +    InductionDescriptor ID;      PHINode *PHI = cast<PHINode>(I); -    ConstantInt *StepValue = nullptr; -    if (isInductionPHI(PHI, SE, StepValue)) +    if (InductionDescriptor::isInductionPHI(PHI, SE, ID))        Inductions.push_back(PHI);      else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))        Reductions.push_back(PHI); @@ -836,7 +841,7 @@ bool LoopInterchangeLegality::currentLimitations() {      else        FoundInduction = true;    } -  // The loop latch ended and we didnt find the induction variable return as +  // The loop latch ended and we didn't find the induction variable return as    // current limitation.    if (!FoundInduction)      return true; @@ -867,12 +872,14 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,    if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||        isa<PHINode>(OuterLoopPreHeader->begin()) ||        !OuterLoopPreHeader->getUniquePredecessor()) { -    OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); +    OuterLoopPreHeader = +        InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);    }    if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||        InnerLoopPreHeader == OuterLoop->getHeader()) { -    InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); +    InnerLoopPreHeader = +        InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);    }    // TODO: The loops could not be interchanged due to current limitations in the @@ -966,7 +973,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,                                                  unsigned OuterLoopId,                                                  CharMatrix &DepMatrix) { -  // TODO: Add Better Profitibility checks. +  // TODO: Add better profitability checks.    // e.g    // 1) Construct dependency matrix and move the one with no loop carried dep    //    inside to enable vectorization. @@ -980,7 +987,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,    if (Cost < 0)      return true; -  // It is not profitable as per current cache profitibility model. But check if +  // It is not profitable as per current cache profitability model. But check if    // we can move this loop outside to improve parallelism.    bool ImprovesPar =        isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix); @@ -996,7 +1003,7 @@ void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,        return;      }    } -  assert(false && "Couldn't find loop"); +  llvm_unreachable("Couldn't find loop");  }  void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, @@ -1045,7 +1052,7 @@ bool LoopInterchangeTransform::transform() {      splitInnerLoopLatch(InnerIndexVar);      DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); -    // Splits the inner loops phi nodes out into a seperate basic block. +    // Splits the inner loops phi nodes out into a separate basic block.      
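Zooming out from these mechanics, the end result the transform aims for is the classic cache-reuse interchange; the sketch below is illustrative only and assumes a row-major C++ array.

    #include <cstddef>

    constexpr std::size_t N = 1024;
    static double A[N][N], B[N][N];

    // Before: the inner loop strides by N doubles on every access.
    void columnWiseWalk() {
      for (std::size_t j = 0; j < N; ++j)
        for (std::size_t i = 0; i < N; ++i)
          A[i][j] += B[i][j];
    }

    // After interchange: the inner loop walks each row contiguously, which is
    // what "Interchanges loops for cache reuse" refers to.
    void rowWiseWalk() {
      for (std::size_t i = 0; i < N; ++i)
        for (std::size_t j = 0; j < N; ++j)
          A[i][j] += B[i][j];
    }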
splitInnerLoopHeader();      DEBUG(dbgs() << "splitInnerLoopHeader Done\n");    } @@ -1113,8 +1120,8 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {    auto &ToList = InsertBefore->getParent()->getInstList();    auto &FromList = FromBB->getInstList(); -  ToList.splice(InsertBefore, FromList, FromList.begin(), -                FromBB->getTerminator()); +  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), +                FromBB->getTerminator()->getIterator());  }  void LoopInterchangeTransform::adjustOuterLoopPreheader() { @@ -1181,8 +1188,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {    if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)      return false; -  BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor(); -  if (!InnerLoopHeaderSucessor) +  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); +  if (!InnerLoopHeaderSuccessor)      return false;    // Adjust Loop Preheader and headers @@ -1198,11 +1205,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() {      if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)        OuterLoopHeaderBI->setSuccessor(i, LoopExit);      else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) -      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor); +      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);    }    // Adjust reduction PHI's now that the incoming block has changed. -  updateIncomingBlock(InnerLoopHeaderSucessor, InnerLoopHeader, +  updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,                        OuterLoopHeader);    BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); @@ -1286,10 +1293,10 @@ bool LoopInterchangeTransform::adjustLoopLinks() {  char LoopInterchange::ID = 0;  INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",                        "Interchanges loops for cache reuse", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp new file mode 100644 index 000000000000..1064d088514d --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -0,0 +1,566 @@ +//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implement a loop-aware load elimination pass. +// +// It uses LoopAccessAnalysis to identify loop-carried dependences with a +// distance of one between stores and loads.  These form the candidates for the +// transformation.  The source value of each store then propagated to the user +// of the corresponding load.  This makes the load dead. 
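The distance-of-one candidates described above have a simple source-level shape; the sketch below is illustrative and not taken from the patch.

    // Candidate: the value stored to A[i+1] in iteration i is exactly the value
    // the load of A[i] reads in the next iteration (distance == one element).
    #include <cstddef>

    void forwardingCandidate(double *A, const double *B, std::size_t n) {
      for (std::size_t i = 0; i + 1 < n; ++i)
        A[i + 1] = A[i] + B[i];
    }

    // Not a candidate: the distance is two elements, so the stored value is not
    // the one read on the very next iteration.
    void notACandidate(double *A, const double *B, std::size_t n) {
      for (std::size_t i = 0; i + 2 < n; ++i)
        A[i + 2] = A[i] + B[i];
    }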
+// +// The pass can also version the loop and add memchecks in order to prove that +// may-aliasing stores can't change the value in memory before it's read by the +// load. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include <forward_list> + +#define LLE_OPTION "loop-load-elim" +#define DEBUG_TYPE LLE_OPTION + +using namespace llvm; + +static cl::opt<unsigned> CheckPerElim( +    "runtime-check-per-loop-load-elim", cl::Hidden, +    cl::desc("Max number of memchecks allowed per eliminated load on average"), +    cl::init(1)); + +static cl::opt<unsigned> LoadElimSCEVCheckThreshold( +    "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, +    cl::desc("The maximum number of SCEV checks allowed for Loop " +             "Load Elimination")); + + +STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); + +namespace { + +/// \brief Represent a store-to-forwarding candidate. +struct StoreToLoadForwardingCandidate { +  LoadInst *Load; +  StoreInst *Store; + +  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) +      : Load(Load), Store(Store) {} + +  /// \brief Return true if the dependence from the store to the load has a +  /// distance of one.  E.g. A[i+1] = A[i] +  bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { +    Value *LoadPtr = Load->getPointerOperand(); +    Value *StorePtr = Store->getPointerOperand(); +    Type *LoadPtrType = LoadPtr->getType(); +    Type *LoadType = LoadPtrType->getPointerElementType(); + +    assert(LoadPtrType->getPointerAddressSpace() == +               StorePtr->getType()->getPointerAddressSpace() && +           LoadType == StorePtr->getType()->getPointerElementType() && +           "Should be a known dependence"); + +    auto &DL = Load->getParent()->getModule()->getDataLayout(); +    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); + +    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); +    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); + +    // We don't need to check non-wrapping here because forward/backward +    // dependence wouldn't be valid if these weren't monotonic accesses. +    auto *Dist = cast<SCEVConstant>( +        PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); +    const APInt &Val = Dist->getAPInt(); +    return Val.abs() == TypeByteSize; +  } + +  Value *getLoadPtr() const { return Load->getPointerOperand(); } + +#ifndef NDEBUG +  friend raw_ostream &operator<<(raw_ostream &OS, +                                 const StoreToLoadForwardingCandidate &Cand) { +    OS << *Cand.Store << " -->\n"; +    OS.indent(2) << *Cand.Load << "\n"; +    return OS; +  } +#endif +}; + +/// \brief Check if the store dominates all latches, so as long as there is no +/// intervening store this value will be loaded in the next iteration. 
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, +                                  DominatorTree *DT) { +  SmallVector<BasicBlock *, 8> Latches; +  L->getLoopLatches(Latches); +  return std::all_of(Latches.begin(), Latches.end(), +                     [&](const BasicBlock *Latch) { +                       return DT->dominates(StoreBlock, Latch); +                     }); +} + +/// \brief The per-loop class that does most of the work. +class LoadEliminationForLoop { +public: +  LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, +                         DominatorTree *DT) +      : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} + +  /// \brief Look through the loop-carried and loop-independent dependences in +  /// this loop and find store->load dependences. +  /// +  /// Note that no candidate is returned if LAA has failed to analyze the loop +  /// (e.g. if it's not bottom-tested, contains volatile memops, etc.) +  std::forward_list<StoreToLoadForwardingCandidate> +  findStoreToLoadDependences(const LoopAccessInfo &LAI) { +    std::forward_list<StoreToLoadForwardingCandidate> Candidates; + +    const auto *Deps = LAI.getDepChecker().getDependences(); +    if (!Deps) +      return Candidates; + +    // Find store->load dependences (consequently true dep).  Both lexically +    // forward and backward dependences qualify.  Disqualify loads that have +    // other unknown dependences. + +    SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; + +    for (const auto &Dep : *Deps) { +      Instruction *Source = Dep.getSource(LAI); +      Instruction *Destination = Dep.getDestination(LAI); + +      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) { +        if (isa<LoadInst>(Source)) +          LoadsWithUnknownDepedence.insert(Source); +        if (isa<LoadInst>(Destination)) +          LoadsWithUnknownDepedence.insert(Destination); +        continue; +      } + +      if (Dep.isBackward()) +        // Note that the designations source and destination follow the program +        // order, i.e. source is always first.  (The direction is given by the +        // DepType.) +        std::swap(Source, Destination); +      else +        assert(Dep.isForward() && "Needs to be a forward dependence"); + +      auto *Store = dyn_cast<StoreInst>(Source); +      if (!Store) +        continue; +      auto *Load = dyn_cast<LoadInst>(Destination); +      if (!Load) +        continue; +      Candidates.emplace_front(Load, Store); +    } + +    if (!LoadsWithUnknownDepedence.empty()) +      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) { +        return LoadsWithUnknownDepedence.count(C.Load); +      }); + +    return Candidates; +  } + +  /// \brief Return the index of the instruction according to program order. +  unsigned getInstrIndex(Instruction *Inst) { +    auto I = InstOrder.find(Inst); +    assert(I != InstOrder.end() && "No index for instruction"); +    return I->second; +  } + +  /// \brief If a load has multiple candidates associated (i.e. different +  /// stores), it means that it could be forwarding from multiple stores +  /// depending on control flow.  Remove these candidates. +  /// +  /// Here, we rely on LAA to include the relevant loop-independent dependences. +  /// LAA is known to omit these in the very simple case when the read and the +  /// write within an alias set always takes place using the *same* pointer. +  /// +  /// However, we know that this is not the case here, i.e. 
we can rely on LAA +  /// to provide us with loop-independent dependences for the cases we're +  /// interested.  Consider the case for example where a loop-independent +  /// dependece S1->S2 invalidates the forwarding S3->S2. +  /// +  ///         A[i]   = ...   (S1) +  ///         ...    = A[i]  (S2) +  ///         A[i+1] = ...   (S3) +  /// +  /// LAA will perform dependence analysis here because there are two +  /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]). +  void removeDependencesFromMultipleStores( +      std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { +    // If Store is nullptr it means that we have multiple stores forwarding to +    // this store. +    typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> +        LoadToSingleCandT; +    LoadToSingleCandT LoadToSingleCand; + +    for (const auto &Cand : Candidates) { +      bool NewElt; +      LoadToSingleCandT::iterator Iter; + +      std::tie(Iter, NewElt) = +          LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand)); +      if (!NewElt) { +        const StoreToLoadForwardingCandidate *&OtherCand = Iter->second; +        // Already multiple stores forward to this load. +        if (OtherCand == nullptr) +          continue; + +        // Handle the very basic of case when the two stores are in the same +        // block so deciding which one forwards is easy.  The later one forwards +        // as long as they both have a dependence distance of one to the load. +        if (Cand.Store->getParent() == OtherCand->Store->getParent() && +            Cand.isDependenceDistanceOfOne(PSE) && +            OtherCand->isDependenceDistanceOfOne(PSE)) { +          // They are in the same block, the later one will forward to the load. +          if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) +            OtherCand = &Cand; +        } else +          OtherCand = nullptr; +      } +    } + +    Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { +      if (LoadToSingleCand[Cand.Load] != &Cand) { +        DEBUG(dbgs() << "Removing from candidates: \n" << Cand +                     << "  The load may have multiple stores forwarding to " +                     << "it\n"); +        return true; +      } +      return false; +    }); +  } + +  /// \brief Given two pointers operations by their RuntimePointerChecking +  /// indices, return true if they require an alias check. +  /// +  /// We need a check if one is a pointer for a candidate load and the other is +  /// a pointer for a possibly intervening store. +  bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, +                     const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, +                     const std::set<Value *> &CandLoadPtrs) { +    Value *Ptr1 = +        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue; +    Value *Ptr2 = +        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue; +    return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) || +            (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1))); +  } + +  /// \brief Return pointers that are possibly written to on the path from a +  /// forwarding store to a load. +  /// +  /// These pointers need to be alias-checked against the forwarding candidates. 
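The need for these alias checks is easiest to see with a possibly aliasing store inside the loop; the sketch below is illustrative and not from the patch.

    // Illustrative only: if C may alias A, the store through C could clobber
    // A[i+1] after it is written and before the next iteration reads it, so a
    // blindly forwarded value would be stale.  This is what the run-time
    // memchecks, applied by versioning the loop, are meant to rule out.
    #include <cstddef>

    void maybeAliasing(double *A, double *C, const double *B, std::size_t n) {
      for (std::size_t i = 0; i + 1 < n; ++i) {
        A[i + 1] = A[i] + B[i];   // store->load forwarding candidate
        C[i] = 0.0;               // possibly intervening store if C aliases A
      }
    }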
+  SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( +      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { +    // From FirstStore to LastLoad neither of the elimination candidate loads +    // should overlap with any of the stores. +    // +    // E.g.: +    // +    // st1 C[i] +    // ld1 B[i] <-------, +    // ld0 A[i] <----,  |              * LastLoad +    // ...           |  | +    // st2 E[i]      |  | +    // st3 B[i+1] -- | -'              * FirstStore +    // st0 A[i+1] ---' +    // st4 D[i] +    // +    // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with +    // ld0. + +    LoadInst *LastLoad = +        std::max_element(Candidates.begin(), Candidates.end(), +                         [&](const StoreToLoadForwardingCandidate &A, +                             const StoreToLoadForwardingCandidate &B) { +                           return getInstrIndex(A.Load) < getInstrIndex(B.Load); +                         }) +            ->Load; +    StoreInst *FirstStore = +        std::min_element(Candidates.begin(), Candidates.end(), +                         [&](const StoreToLoadForwardingCandidate &A, +                             const StoreToLoadForwardingCandidate &B) { +                           return getInstrIndex(A.Store) < +                                  getInstrIndex(B.Store); +                         }) +            ->Store; + +    // We're looking for stores after the first forwarding store until the end +    // of the loop, then from the beginning of the loop until the last +    // forwarded-to load.  Collect the pointer for the stores. +    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; + +    auto InsertStorePtr = [&](Instruction *I) { +      if (auto *S = dyn_cast<StoreInst>(I)) +        PtrsWrittenOnFwdingPath.insert(S->getPointerOperand()); +    }; +    const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions(); +    std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1, +                  MemInstrs.end(), InsertStorePtr); +    std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)], +                  InsertStorePtr); + +    return PtrsWrittenOnFwdingPath; +  } + +  /// \brief Determine the pointer alias checks to prove that there are no +  /// intervening stores. +  SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( +      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + +    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = +        findPointersWrittenOnForwardingPath(Candidates); + +    // Collect the pointers of the candidate loads. +    // FIXME: SmallSet does not work with std::inserter. 
+    std::set<Value *> CandLoadPtrs; +    std::transform(Candidates.begin(), Candidates.end(), +                   std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), +                   std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr)); + +    const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); +    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + +    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), +                 [&](const RuntimePointerChecking::PointerCheck &Check) { +                   for (auto PtrIdx1 : Check.first->Members) +                     for (auto PtrIdx2 : Check.second->Members) +                       if (needsChecking(PtrIdx1, PtrIdx2, +                                         PtrsWrittenOnFwdingPath, CandLoadPtrs)) +                         return true; +                   return false; +                 }); + +    DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); +    DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + +    return Checks; +  } + +  /// \brief Perform the transformation for a candidate. +  void +  propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, +                                  SCEVExpander &SEE) { +    // +    // loop: +    //      %x = load %gep_i +    //         = ... %x +    //      store %y, %gep_i_plus_1 +    // +    // => +    // +    // ph: +    //      %x.initial = load %gep_0 +    // loop: +    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop] +    //      %x = load %gep_i            <---- now dead +    //         = ... %x.storeforward +    //      store %y, %gep_i_plus_1 + +    Value *Ptr = Cand.Load->getPointerOperand(); +    auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr)); +    auto *PH = L->getLoopPreheader(); +    Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), +                                          PH->getTerminator()); +    Value *Initial = +        new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); +    PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", +                                   &L->getHeader()->front()); +    PHI->addIncoming(Initial, PH); +    PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch()); + +    Cand.Load->replaceAllUsesWith(PHI); +  } + +  /// \brief Top-level driver for each loop: find store->load forwarding +  /// candidates, add run-time checks and perform transformation. +  bool processLoop() { +    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() +                 << "\" checking " << *L << "\n"); +    // Look for store-to-load forwarding cases across the +    // backedge. E.g.: +    // +    // loop: +    //      %x = load %gep_i +    //         = ... %x +    //      store %y, %gep_i_plus_1 +    // +    // => +    // +    // ph: +    //      %x.initial = load %gep_0 +    // loop: +    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop] +    //      %x = load %gep_i            <---- now dead +    //         = ... %x.storeforward +    //      store %y, %gep_i_plus_1 + +    // First start with store->load dependences. +    auto StoreToLoadDependences = findStoreToLoadDependences(LAI); +    if (StoreToLoadDependences.empty()) +      return false; + +    // Generate an index for each load and store according to the original +    // program order.  This will be used later. 
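Mirroring the IR sketch in the comments above, the overall effect of propagateStoredValueToLoadUsers() can be written out at the source level roughly as follows; the sketch is illustrative only.

    // Before: each iteration re-loads the value the previous iteration stored.
    #include <cstddef>

    void beforeForwarding(double *A, const double *B, std::size_t n) {
      for (std::size_t i = 0; i + 1 < n; ++i)
        A[i + 1] = A[i] + B[i];
    }

    // After: the initial value is loaded once (in the preheader) and the stored
    // value is carried across the backedge in a scalar, so the load of A[i] is
    // dead; in IR the carried scalar is the "store_forwarded" PHI shown above.
    void afterForwarding(double *A, const double *B, std::size_t n) {
      if (n < 2)
        return;
      double carried = A[0];            // corresponds to %x.initial
      for (std::size_t i = 0; i + 1 < n; ++i) {
        double next = carried + B[i];   // uses the forwarded value, no load of A[i]
        A[i + 1] = next;
        carried = next;                 // the value flowing around the backedge
      }
    }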
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap(); + +    // To keep things simple for now, remove those where the load is potentially +    // fed by multiple stores. +    removeDependencesFromMultipleStores(StoreToLoadDependences); +    if (StoreToLoadDependences.empty()) +      return false; + +    // Filter the candidates further. +    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; +    unsigned NumForwarding = 0; +    for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { +      DEBUG(dbgs() << "Candidate " << Cand); +      // Make sure that the stored values is available everywhere in the loop in +      // the next iteration. +      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) +        continue; + +      // Check whether the SCEV difference is the same as the induction step, +      // thus we load the value in the next iteration. +      if (!Cand.isDependenceDistanceOfOne(PSE)) +        continue; + +      ++NumForwarding; +      DEBUG(dbgs() +            << NumForwarding +            << ". Valid store-to-load forwarding across the loop backedge\n"); +      Candidates.push_back(Cand); +    } +    if (Candidates.empty()) +      return false; + +    // Check intervening may-alias stores.  These need runtime checks for alias +    // disambiguation. +    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks = +        collectMemchecks(Candidates); + +    // Too many checks are likely to outweigh the benefits of forwarding. +    if (Checks.size() > Candidates.size() * CheckPerElim) { +      DEBUG(dbgs() << "Too many run-time checks needed.\n"); +      return false; +    } + +    if (LAI.PSE.getUnionPredicate().getComplexity() > +        LoadElimSCEVCheckThreshold) { +      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); +      return false; +    } + +    // Point of no-return, start the transformation.  First, version the loop if +    // necessary. +    if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { +      LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); +      LV.setAliasChecks(std::move(Checks)); +      LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); +      LV.versionLoop(); +    } + +    // Next, propagate the value stored by the store to the users of the load. +    // Also for the first iteration, generate the initial value of the load. +    SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), +                     "storeforward"); +    for (const auto &Cand : Candidates) +      propagateStoredValueToLoadUsers(Cand, SEE); +    NumLoopLoadEliminted += NumForwarding; + +    return true; +  } + +private: +  Loop *L; + +  /// \brief Maps the load/store instructions to their index according to +  /// program order. +  DenseMap<Instruction *, unsigned> InstOrder; + +  // Analyses used. +  LoopInfo *LI; +  const LoopAccessInfo &LAI; +  DominatorTree *DT; +  PredicatedScalarEvolution PSE; +}; + +/// \brief The pass.  Most of the work is delegated to the per-loop +/// LoadEliminationForLoop class. 
+class LoopLoadElimination : public FunctionPass { +public: +  LoopLoadElimination() : FunctionPass(ID) { +    initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnFunction(Function &F) override { +    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +    auto *LAA = &getAnalysis<LoopAccessAnalysis>(); +    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + +    // Build up a worklist of inner-loops to vectorize. This is necessary as the +    // act of distributing a loop creates new loops and can invalidate iterators +    // across the loops. +    SmallVector<Loop *, 8> Worklist; + +    for (Loop *TopLevelLoop : *LI) +      for (Loop *L : depth_first(TopLevelLoop)) +        // We only handle inner-most loops. +        if (L->empty()) +          Worklist.push_back(L); + +    // Now walk the identified inner loops. +    bool Changed = false; +    for (Loop *L : Worklist) { +      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); +      // The actual work is performed by LoadEliminationForLoop. +      LoadEliminationForLoop LEL(L, LI, LAI, DT); +      Changed |= LEL.processLoop(); +    } + +    // Process each loop nest in the function. +    return Changed; +  } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<LoopInfoWrapperPass>(); +    AU.addPreserved<LoopInfoWrapperPass>(); +    AU.addRequired<LoopAccessAnalysis>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addPreserved<DominatorTreeWrapperPass>(); +  } + +  static char ID; +}; +} + +char LoopLoadElimination::ID; +static const char LLE_name[] = "Loop Load Elimination"; + +INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) + +namespace llvm { +FunctionPass *createLoopLoadEliminationPass() { +  return new LoopLoadElimination(); +} +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index ed103e6b8ed6..27c2d8824df0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -147,12 +147,12 @@ namespace {      bool runOnLoop(Loop *L, LPPassManager &LPM) override;      void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addRequired<LoopInfoWrapperPass>();        AU.addPreserved<LoopInfoWrapperPass>();        AU.addRequired<DominatorTreeWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addRequired<ScalarEvolution>(); +      AU.addRequired<ScalarEvolutionWrapperPass>();        AU.addRequired<TargetLibraryInfoWrapperPass>();      } @@ -162,11 +162,15 @@ namespace {      ScalarEvolution *SE;      TargetLibraryInfo *TLI;      DominatorTree *DT; +    bool PreserveLCSSA;      typedef SmallVector<Instruction *, 16> SmallInstructionVector;      typedef SmallSet<Instruction *, 16>   SmallInstructionSet; -    // A chain of isomorphic instructions, indentified by a single-use PHI, +    // Map between induction variable and its increment +    DenseMap<Instruction *, int64_t> IVToIncMap; + +    // A 
chain of isomorphic instructions, identified by a single-use PHI      // representing a reduction. Only the last value may be used outside the      // loop.      struct SimpleLoopReduction { @@ -300,22 +304,6 @@ namespace {        // The functions below can be called after we've finished processing all        // instructions in the loop, and we know which reductions were selected. -      // Is the provided instruction the PHI of a reduction selected for -      // rerolling? -      bool isSelectedPHI(Instruction *J) { -        if (!isa<PHINode>(J)) -          return false; - -        for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); -             RI != RIE; ++RI) { -          int i = *RI; -          if (cast<Instruction>(J) == PossibleReds[i].getPHI()) -            return true; -        } - -        return false; -      } -        bool validateSelected();        void replaceSelected(); @@ -335,7 +323,7 @@ namespace {      //   x[i*3+1] = y2      //   x[i*3+2] = y3      // -    //   Base instruction -> i*3                +    //   Base instruction -> i*3      //                    +---+----+      //                   /    |     \      //               ST[y1]  +1     +2  <-- Roots @@ -366,8 +354,11 @@ namespace {      struct DAGRootTracker {        DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,                       ScalarEvolution *SE, AliasAnalysis *AA, -                     TargetLibraryInfo *TLI) -          : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} +                     TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, +                     bool PreserveLCSSA, +                     DenseMap<Instruction *, int64_t> &IncrMap) +          : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), +            PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {}        /// Stage 1: Find all the DAG roots for the induction variable.        bool findRoots(); @@ -413,11 +404,14 @@ namespace {        ScalarEvolution *SE;        AliasAnalysis *AA;        TargetLibraryInfo *TLI; +      DominatorTree *DT; +      LoopInfo *LI; +      bool PreserveLCSSA;        // The loop induction variable.        Instruction *IV;        // Loop step amount. -      uint64_t Inc; +      int64_t Inc;        // Loop reroll count; if Inc == 1, this records the scaling applied        // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;        // If Inc is not 1, Scale = Inc. @@ -430,6 +424,8 @@ namespace {        // they are used in (or specially, IL_All for instructions        // used in the loop increment mechanism).        
UsesTy Uses; +      // Map between induction variable and its increment +      DenseMap<Instruction *, int64_t> &IVToIncMap;      };      void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); @@ -442,10 +438,10 @@ namespace {  char LoopReroll::ID = 0;  INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -477,21 +473,20 @@ void LoopReroll::collectPossibleIVs(Loop *L,        continue;      if (const SCEVAddRecExpr *PHISCEV = -        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { +            dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {        if (PHISCEV->getLoop() != L)          continue;        if (!PHISCEV->isAffine())          continue;        if (const SCEVConstant *IncSCEV =            dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { -        if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) +        const APInt &AInt = IncSCEV->getAPInt().abs(); +        if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))            continue; -        if (IncSCEV->getValue()->uge(MaxInc)) -          continue; - -        DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << -              *PHISCEV << "\n"); -        PossibleIVs.push_back(I); +        IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); +        DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV +                     << "\n"); +        PossibleIVs.push_back(&*I);        }      }    } @@ -552,7 +547,7 @@ void LoopReroll::collectPossibleReductions(Loop *L,      if (!I->getType()->isSingleValueType())        continue; -    SimpleLoopReduction SLR(I, L); +    SimpleLoopReduction SLR(&*I, L);      if (!SLR.valid())        continue; @@ -699,17 +694,11 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {        }      } -    int64_t V = CI->getValue().getSExtValue(); +    int64_t V = std::abs(CI->getValue().getSExtValue());      if (Roots.find(V) != Roots.end())        // No duplicates, please.        return false; -    // FIXME: Add support for negative values. -    if (V < 0) { -      DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); -      return false; -    } -      Roots[V] = cast<Instruction>(I);    } @@ -731,7 +720,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {    unsigned NumBaseUses = BaseUsers.size();    if (NumBaseUses == 0)      NumBaseUses = Roots.begin()->second->getNumUses(); -   +    // Check that every node has the same number of users.    for (auto &KV : Roots) {      if (KV.first == 0) @@ -744,7 +733,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {      }    } -  return true;  +  return true;  }  bool LoopReroll::DAGRootTracker:: @@ -787,7 +776,7 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {    if (!collectPossibleRoots(IVU, V))      return false; -  // If we didn't get a root for index zero, then IVU must be  +  // If we didn't get a root for index zero, then IVU must be    // subsumed.    
if (V.find(0) == V.end())      SubsumedInsts.insert(IVU); @@ -818,13 +807,10 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {  }  bool LoopReroll::DAGRootTracker::findRoots() { - -  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); -  Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> -    getValue()->getZExtValue(); +  Inc = IVToIncMap[IV];    assert(RootSets.empty() && "Unclean state!"); -  if (Inc == 1) { +  if (std::abs(Inc) == 1) {      for (auto *IVU : IV->users()) {        if (isLoopIncrement(IVU, IV))          LoopIncs.push_back(cast<Instruction>(IVU)); @@ -996,6 +982,25 @@ bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,    return false;  } +static bool isIgnorableInst(const Instruction *I) { +  if (isa<DbgInfoIntrinsic>(I)) +    return true; +  const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); +  if (!II) +    return false; +  switch (II->getIntrinsicID()) { +    default: +      return false; +    case llvm::Intrinsic::annotation: +    case Intrinsic::ptr_annotation: +    case Intrinsic::var_annotation: +    // TODO: the following intrinsics may also be whitelisted: +    //   lifetime_start, lifetime_end, invariant_start, invariant_end +      return true; +  } +  return false; +} +  bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {    // We now need to check for equivalence of the use graph of each root with    // that of the primary induction variable (excluding the roots). Our goal @@ -1029,7 +1034,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {    // Make sure all instructions in the loop are in one and only one    // set.    for (auto &KV : Uses) { -    if (KV.second.count() != 1) { +    if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {        DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "              << *KV.first << " (#uses=" << KV.second.count() << ")\n");        return false; @@ -1103,15 +1108,15 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {                  " vs. " << *RootInst << "\n");            return false;          } -         +          RootIt = TryIt;          RootInst = TryIt->first;        }        // All instructions between the last root and this root -      // may belong to some other iteration. If they belong to a  +      // may belong to some other iteration. If they belong to a        // future iteration, then they're dangerous to alias with. -      //  +      //        // Note that because we allow a limited amount of flexibility in the order        // that we visit nodes, LastRootIt might be *before* RootIt, in which        // case we've already checked this set of instructions so we shouldn't @@ -1267,6 +1272,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {      ++J;    } +  bool Negative = IVToIncMap[IV] < 0;    const DataLayout &DL = Header->getModule()->getDataLayout();    // We need to create a new induction variable for each different BaseInst. 
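For context on what the root tracking above is matching, loop rerolling undoes manual unrolling; the pair below is an illustrative source-level sketch, not code from the patch.

    // The first function is the shape LoopReroll recognizes: a base access at
    // i*3 with roots at +1 and +2, all doing isomorphic work.  The second is
    // the rerolled equivalent it aims to produce.
    #include <cstddef>

    void unrolledByThree(double *x, const double *y, std::size_t n3) {
      for (std::size_t i = 0; i < n3; ++i) {    // n3 unrolled iterations
        x[i * 3 + 0] = y[i * 3 + 0];
        x[i * 3 + 1] = y[i * 3 + 1];
        x[i * 3 + 2] = y[i * 3 + 2];
      }
    }

    void rerolled(double *x, const double *y, std::size_t n3) {
      for (std::size_t j = 0; j < 3 * n3; ++j)  // scale folded into the trip count
        x[j] = y[j];
    }

With the Negative flag introduced above, the same matching now also covers induction variables that step downwards, which previously made the pass bail out.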
@@ -1275,13 +1281,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {      const SCEVAddRecExpr *RealIVSCEV =        cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));      const SCEV *Start = RealIVSCEV->getStart(); -    const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> -      (SE->getAddRecExpr(Start, -                         SE->getConstant(RealIVSCEV->getType(), 1), -                         L, SCEV::FlagAnyWrap)); +    const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( +        Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, +        SCEV::FlagAnyWrap));      { // Limit the lifetime of SCEVExpander.        SCEVExpander Expander(*SE, DL, "reroll"); -      Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); +      Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());        for (auto &KV : Uses) {          if (KV.second.find_first() == 0) @@ -1294,8 +1299,8 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {            const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);            // Iteration count SCEV minus 1 -          const SCEV *ICMinus1SCEV = -            SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); +          const SCEV *ICMinus1SCEV = SE->getMinusSCEV( +              ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));            Value *ICMinus1; // Iteration count minus 1            if (isa<SCEVConstant>(ICMinus1SCEV)) { @@ -1303,7 +1308,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {            } else {              BasicBlock *Preheader = L->getLoopPreheader();              if (!Preheader) -              Preheader = InsertPreheaderForLoop(L, Parent); +              Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);              ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),                                                Preheader->getTerminator()); @@ -1444,13 +1449,14 @@ void LoopReroll::ReductionTracker::replaceSelected() {  bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,                          const SCEV *IterCount,                          ReductionTracker &Reductions) { -  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); +  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, +                          IVToIncMap);    if (!DAGRoots.findRoots())      return false;    DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<                    *IV << "\n"); -   +    if (!DAGRoots.validate(Reductions))      return false;    if (!Reductions.validateSelected()) @@ -1469,11 +1475,12 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {    if (skipOptnoneFunction(L))      return false; -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  SE = &getAnalysis<ScalarEvolution>(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);    BasicBlock *Header = L->getHeader();    DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << @@ -1490,13 +1497,13 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {      return Changed;    const SCEV *LIBETC = SE->getBackedgeTakenCount(L); -  const SCEV *IterCount = -    
SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); +  const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));    DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");    // First, we need to find the induction variable with respect to which we can    // reroll (there may be several possible options).    SmallInstructionVector PossibleIVs; +  IVToIncMap.clear();    collectPossibleIVs(L, PossibleIVs);    if (PossibleIVs.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a675e1289baf..5e6c2da08cc3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,11 +13,15 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CodeMetrics.h"  #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h" @@ -41,95 +45,6 @@ DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden,         cl::desc("The default maximum header size for automatic loop rotation"));  STATISTIC(NumRotated, "Number of loops rotated"); -namespace { - -  class LoopRotate : public LoopPass { -  public: -    static char ID; // Pass ID, replacement for typeid -    LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { -      initializeLoopRotatePass(*PassRegistry::getPassRegistry()); -      if (SpecifiedMaxHeaderSize == -1) -        MaxHeaderSize = DefaultRotationThreshold; -      else -        MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); -    } - -    // LCSSA form makes instruction renaming easier. -    void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.addRequired<AssumptionCacheTracker>(); -      AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addRequired<LoopInfoWrapperPass>(); -      AU.addPreserved<LoopInfoWrapperPass>(); -      AU.addRequiredID(LoopSimplifyID); -      AU.addPreservedID(LoopSimplifyID); -      AU.addRequiredID(LCSSAID); -      AU.addPreservedID(LCSSAID); -      AU.addPreserved<ScalarEvolution>(); -      AU.addRequired<TargetTransformInfoWrapperPass>(); -    } - -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; -    bool simplifyLoopLatch(Loop *L); -    bool rotateLoop(Loop *L, bool SimplifiedLatch); - -  private: -    unsigned MaxHeaderSize; -    LoopInfo *LI; -    const TargetTransformInfo *TTI; -    AssumptionCache *AC; -    DominatorTree *DT; -  }; -} - -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { -  return new LoopRotate(MaxHeaderSize); -} - -/// Rotate Loop L as many times as possible. Return true if -/// the loop is rotated at least once. 
-bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { -  if (skipOptnoneFunction(L)) -    return false; - -  // Save the loop metadata. -  MDNode *LoopMD = L->getLoopID(); - -  Function &F = *L->getHeader()->getParent(); - -  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); -  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); -  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); -  DT = DTWP ? &DTWP->getDomTree() : nullptr; - -  // Simplify the loop latch before attempting to rotate the header -  // upward. Rotation may not be needed if the loop tail can be folded into the -  // loop exit. -  bool SimplifiedLatch = simplifyLoopLatch(L); - -  // One loop can be rotated multiple times. -  bool MadeChange = false; -  while (rotateLoop(L, SimplifiedLatch)) { -    MadeChange = true; -    SimplifiedLatch = false; -  } - -  // Restore the loop metadata. -  // NB! We presume LoopRotation DOESN'T ADD its own metadata. -  if ((MadeChange || SimplifiedLatch) && LoopMD) -    L->setLoopID(LoopMD); - -  return MadeChange; -}  /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the  /// old header into the preheader.  If there were uses of the values produced by @@ -147,7 +62,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,    // as necessary.    SSAUpdater SSA;    for (I = OrigHeader->begin(); I != E; ++I) { -    Value *OrigHeaderVal = I; +    Value *OrigHeaderVal = &*I;      // If there are no uses of the value (e.g. because it returns void), there      // is nothing to rewrite. @@ -196,127 +111,6 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,    }  } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, -                                  BasicBlock::iterator End, Loop *L) { -  bool seenIncrement = false; -  bool MultiExitLoop = false; - -  if (!L->getExitingBlock()) -    MultiExitLoop = true; - -  for (BasicBlock::iterator I = Begin; I != End; ++I) { - -    if (!isSafeToSpeculativelyExecute(I)) -      return false; - -    if (isa<DbgInfoIntrinsic>(I)) -      continue; - -    switch (I->getOpcode()) { -    default: -      return false; -    case Instruction::GetElementPtr: -      // GEPs are cheap if all indices are constant. -      if (!cast<GEPOperator>(I)->hasAllConstantIndices()) -        return false; -      // fall-thru to increment case -    case Instruction::Add: -    case Instruction::Sub: -    case Instruction::And: -    case Instruction::Or: -    case Instruction::Xor: -    case Instruction::Shl: -    case Instruction::LShr: -    case Instruction::AShr: { -      Value *IVOpnd = !isa<Constant>(I->getOperand(0)) -                          ? I->getOperand(0) -                          : !isa<Constant>(I->getOperand(1)) -                                ? I->getOperand(1) -                                : nullptr; -      if (!IVOpnd) -        return false; - -      // If increment operand is used outside of the loop, this speculation -      // could cause extra live range interference. 
-      if (MultiExitLoop) { -        for (User *UseI : IVOpnd->users()) { -          auto *UserInst = cast<Instruction>(UseI); -          if (!L->contains(UserInst)) -            return false; -        } -      } - -      if (seenIncrement) -        return false; -      seenIncrement = true; -      break; -    } -    case Instruction::Trunc: -    case Instruction::ZExt: -    case Instruction::SExt: -      // ignore type conversions -      break; -    } -  } -  return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { -  BasicBlock *Latch = L->getLoopLatch(); -  if (!Latch || Latch->hasAddressTaken()) -    return false; - -  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); -  if (!Jmp || !Jmp->isUnconditional()) -    return false; - -  BasicBlock *LastExit = Latch->getSinglePredecessor(); -  if (!LastExit || !L->isLoopExiting(LastExit)) -    return false; - -  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); -  if (!BI) -    return false; - -  if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) -    return false; - -  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " -        << LastExit->getName() << "\n"); - -  // Hoist the instructions from Latch into LastExit. -  LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); - -  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; -  BasicBlock *Header = Jmp->getSuccessor(0); -  assert(Header == L->getHeader() && "expected a backward branch"); - -  // Remove Latch from the CFG so that LastExit becomes the new Latch. -  BI->setSuccessor(FallThruPath, Header); -  Latch->replaceSuccessorsPhiUsesWith(LastExit); -  Jmp->eraseFromParent(); - -  // Nuke the Latch block. -  assert(Latch->empty() && "unable to evacuate Latch"); -  LI->removeBlock(Latch); -  if (DT) -    DT->eraseNode(Latch); -  Latch->eraseFromParent(); -  return true; -} -  /// Rotate loop LP. Return true if the loop is rotated.  ///  /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -327,7 +121,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {  /// rotation. LoopRotate should be repeatable and converge to a canonical  /// form. This property is satisfied because simplifying the loop latch can only  /// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, +                       const TargetTransformInfo *TTI, AssumptionCache *AC, +                       DominatorTree *DT, ScalarEvolution *SE, +                       bool SimplifiedLatch) {    // If the loop has only one block then there is not much to rotate.    if (L->getBlocks().size() == 1)      return false; @@ -382,7 +179,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {    // Anything ScalarEvolution may know about this loop or the PHI nodes    // in its header will soon be invalidated. 
-  if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) +  if (SE)      SE->forgetLoop(L);    DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); @@ -420,7 +217,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {    // possible or create a clone in the OldPreHeader if not.    TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();    while (I != E) { -    Instruction *Inst = I++; +    Instruction *Inst = &*I++;      // If the instruction's operands are invariant and it doesn't read or write      // memory, then it is safe to hoist.  Doing this doesn't change the order of @@ -465,8 +262,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {    // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's    // successors by duplicating their incoming values for OrigHeader.    TerminatorInst *TI = OrigHeader->getTerminator(); -  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) -    for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); +  for (BasicBlock *SuccBB : TI->successors()) +    for (BasicBlock::iterator BI = SuccBB->begin();           PHINode *PN = dyn_cast<PHINode>(BI); ++BI)        PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); @@ -607,3 +404,221 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {    ++NumRotated;    return true;  } + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, +                                  BasicBlock::iterator End, Loop *L) { +  bool seenIncrement = false; +  bool MultiExitLoop = false; + +  if (!L->getExitingBlock()) +    MultiExitLoop = true; + +  for (BasicBlock::iterator I = Begin; I != End; ++I) { + +    if (!isSafeToSpeculativelyExecute(&*I)) +      return false; + +    if (isa<DbgInfoIntrinsic>(I)) +      continue; + +    switch (I->getOpcode()) { +    default: +      return false; +    case Instruction::GetElementPtr: +      // GEPs are cheap if all indices are constant. +      if (!cast<GEPOperator>(I)->hasAllConstantIndices()) +        return false; +      // fall-thru to increment case +    case Instruction::Add: +    case Instruction::Sub: +    case Instruction::And: +    case Instruction::Or: +    case Instruction::Xor: +    case Instruction::Shl: +    case Instruction::LShr: +    case Instruction::AShr: { +      Value *IVOpnd = !isa<Constant>(I->getOperand(0)) +                          ? I->getOperand(0) +                          : !isa<Constant>(I->getOperand(1)) +                                ? I->getOperand(1) +                                : nullptr; +      if (!IVOpnd) +        return false; + +      // If increment operand is used outside of the loop, this speculation +      // could cause extra live range interference. 
+      if (MultiExitLoop) { +        for (User *UseI : IVOpnd->users()) { +          auto *UserInst = cast<Instruction>(UseI); +          if (!L->contains(UserInst)) +            return false; +        } +      } + +      if (seenIncrement) +        return false; +      seenIncrement = true; +      break; +    } +    case Instruction::Trunc: +    case Instruction::ZExt: +    case Instruction::SExt: +      // ignore type conversions +      break; +    } +  } +  return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { +  BasicBlock *Latch = L->getLoopLatch(); +  if (!Latch || Latch->hasAddressTaken()) +    return false; + +  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); +  if (!Jmp || !Jmp->isUnconditional()) +    return false; + +  BasicBlock *LastExit = Latch->getSinglePredecessor(); +  if (!LastExit || !L->isLoopExiting(LastExit)) +    return false; + +  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); +  if (!BI) +    return false; + +  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) +    return false; + +  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " +        << LastExit->getName() << "\n"); + +  // Hoist the instructions from Latch into LastExit. +  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), +                                 Latch->begin(), Jmp->getIterator()); + +  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; +  BasicBlock *Header = Jmp->getSuccessor(0); +  assert(Header == L->getHeader() && "expected a backward branch"); + +  // Remove Latch from the CFG so that LastExit becomes the new Latch. +  BI->setSuccessor(FallThruPath, Header); +  Latch->replaceSuccessorsPhiUsesWith(LastExit); +  Jmp->eraseFromParent(); + +  // Nuke the Latch block. +  assert(Latch->empty() && "unable to evacuate Latch"); +  LI->removeBlock(Latch); +  if (DT) +    DT->eraseNode(Latch); +  Latch->eraseFromParent(); +  return true; +} + +/// Rotate \c L as many times as possible. Return true if the loop is rotated +/// at least once. +static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, +                                  const TargetTransformInfo *TTI, +                                  AssumptionCache *AC, DominatorTree *DT, +                                  ScalarEvolution *SE) { +  // Save the loop metadata. +  MDNode *LoopMD = L->getLoopID(); + +  // Simplify the loop latch before attempting to rotate the header +  // upward. Rotation may not be needed if the loop tail can be folded into the +  // loop exit. +  bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + +  // One loop can be rotated multiple times. +  bool MadeChange = false; +  while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { +    MadeChange = true; +    SimplifiedLatch = false; +  } + +  // Restore the loop metadata. +  // NB! We presume LoopRotation DOESN'T ADD its own metadata. 
+  if ((MadeChange || SimplifiedLatch) && LoopMD) +    L->setLoopID(LoopMD); + +  return MadeChange; +} + +namespace { + +class LoopRotate : public LoopPass { +  unsigned MaxHeaderSize; + +public: +  static char ID; // Pass ID, replacement for typeid +  LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { +    initializeLoopRotatePass(*PassRegistry::getPassRegistry()); +    if (SpecifiedMaxHeaderSize == -1) +      MaxHeaderSize = DefaultRotationThreshold; +    else +      MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); +  } + +  // LCSSA form makes instruction renaming easier. +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addRequired<AssumptionCacheTracker>(); +    AU.addPreserved<DominatorTreeWrapperPass>(); +    AU.addRequired<LoopInfoWrapperPass>(); +    AU.addPreserved<LoopInfoWrapperPass>(); +    AU.addRequiredID(LoopSimplifyID); +    AU.addPreservedID(LoopSimplifyID); +    AU.addRequiredID(LCSSAID); +    AU.addPreservedID(LCSSAID); +    AU.addPreserved<ScalarEvolutionWrapperPass>(); +    AU.addPreserved<SCEVAAWrapperPass>(); +    AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addPreserved<BasicAAWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>(); +  } + +  bool runOnLoop(Loop *L, LPPassManager &LPM) override { +    if (skipOptnoneFunction(L)) +      return false; +    Function &F = *L->getHeader()->getParent(); + +    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); +    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); +    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); +    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; +    auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); +    auto *SE = SEWP ? &SEWP->getSE() : nullptr; + +    return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); +  } +}; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { +  return new LoopRotate(MaxHeaderSize); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4b59f3d2f6cc..2101225ed9f7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -105,10 +105,33 @@ static bool StressIVChain = false;  namespace { -/// RegSortData - This class holds data which is used to order reuse candidates. +struct MemAccessTy { +  /// Used in situations where the accessed memory type is unknown. 
+  static const unsigned UnknownAddressSpace = ~0u; + +  Type *MemTy; +  unsigned AddrSpace; + +  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} + +  MemAccessTy(Type *Ty, unsigned AS) : +    MemTy(Ty), AddrSpace(AS) {} + +  bool operator==(MemAccessTy Other) const { +    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; +  } + +  bool operator!=(MemAccessTy Other) const { return !(*this == Other); } + +  static MemAccessTy getUnknown(LLVMContext &Ctx) { +    return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace); +  } +}; + +/// This class holds data which is used to order reuse candidates.  class RegSortData {  public: -  /// UsedByIndices - This represents the set of LSRUse indices which reference +  /// This represents the set of LSRUse indices which reference    /// a particular register.    SmallBitVector UsedByIndices; @@ -122,16 +145,14 @@ void RegSortData::print(raw_ostream &OS) const {    OS << "[NumUses=" << UsedByIndices.count() << ']';  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void RegSortData::dump() const {    print(errs()); errs() << '\n';  } -#endif  namespace { -/// RegUseTracker - Map register candidates to information about how they are -/// used. +/// Map register candidates to information about how they are used.  class RegUseTracker {    typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; @@ -139,9 +160,9 @@ class RegUseTracker {    SmallVector<const SCEV *, 16> RegSequence;  public: -  void CountRegister(const SCEV *Reg, size_t LUIdx); -  void DropRegister(const SCEV *Reg, size_t LUIdx); -  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); +  void countRegister(const SCEV *Reg, size_t LUIdx); +  void dropRegister(const SCEV *Reg, size_t LUIdx); +  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);    bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -160,7 +181,7 @@ public:  }  void -RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {    std::pair<RegUsesTy::iterator, bool> Pair =      RegUsesMap.insert(std::make_pair(Reg, RegSortData()));    RegSortData &RSD = Pair.first->second; @@ -171,7 +192,7 @@ RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {  }  void -RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {    RegUsesTy::iterator It = RegUsesMap.find(Reg);    assert(It != RegUsesMap.end());    RegSortData &RSD = It->second; @@ -180,7 +201,7 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {  }  void -RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { +RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {    assert(LUIdx <= LastLUIdx);    // Update RegUses. The data structure is not optimized for this purpose; @@ -219,9 +240,8 @@ void RegUseTracker::clear() {  namespace { -/// Formula - This class holds information that describes a formula for -/// computing satisfying a use. It may include broken-out immediates and scaled -/// registers. +/// This class holds information that describes a formula for computing +/// satisfying a use. It may include broken-out immediates and scaled registers.  struct Formula {    /// Global base address used for complex addressing.    GlobalValue *BaseGV; @@ -235,8 +255,8 @@ struct Formula {    /// The scale of any complex addressing.    int64_t Scale; -  /// BaseRegs - The list of "base" registers for this use. When this is -  /// non-empty. 
The canonical representation of a formula is +  /// The list of "base" registers for this use. When this is non-empty. The +  /// canonical representation of a formula is    /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and    /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().    /// #1 enforces that the scaled register is always used when at least two @@ -247,31 +267,31 @@ struct Formula {    /// form.    SmallVector<const SCEV *, 4> BaseRegs; -  /// ScaledReg - The 'scaled' register for this use. This should be non-null -  /// when Scale is not zero. +  /// The 'scaled' register for this use. This should be non-null when Scale is +  /// not zero.    const SCEV *ScaledReg; -  /// UnfoldedOffset - An additional constant offset which added near the -  /// use. This requires a temporary register, but the offset itself can -  /// live in an add immediate field rather than a register. +  /// An additional constant offset which added near the use. This requires a +  /// temporary register, but the offset itself can live in an add immediate +  /// field rather than a register.    int64_t UnfoldedOffset;    Formula()        : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),          ScaledReg(nullptr), UnfoldedOffset(0) {} -  void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); +  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);    bool isCanonical() const; -  void Canonicalize(); +  void canonicalize(); -  bool Unscale(); +  bool unscale();    size_t getNumRegs() const;    Type *getType() const; -  void DeleteBaseReg(const SCEV *&S); +  void deleteBaseReg(const SCEV *&S);    bool referencesReg(const SCEV *S) const;    bool hasRegsUsedByUsesOtherThan(size_t LUIdx, @@ -283,7 +303,7 @@ struct Formula {  } -/// DoInitialMatch - Recursion helper for InitialMatch. +/// Recursion helper for initialMatch.  static void DoInitialMatch(const SCEV *S, Loop *L,                             SmallVectorImpl<const SCEV *> &Good,                             SmallVectorImpl<const SCEV *> &Bad, @@ -336,10 +356,9 @@ static void DoInitialMatch(const SCEV *S, Loop *L,    Bad.push_back(S);  } -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { +/// Incorporate loop-variant parts of S into this Formula, attempting to keep +/// all loop-invariant and loop-computable values in a single base register. +void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {    SmallVector<const SCEV *, 4> Good;    SmallVector<const SCEV *, 4> Bad;    DoInitialMatch(S, L, Good, Bad, SE); @@ -355,7 +374,7 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {        BaseRegs.push_back(Sum);      HasBaseReg = true;    } -  Canonicalize(); +  canonicalize();  }  /// \brief Check whether or not this formula statisfies the canonical @@ -373,7 +392,7 @@ bool Formula::isCanonical() const {  /// field. Otherwise, we would have to do special cases everywhere in LSR  /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...  /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::Canonicalize() { +void Formula::canonicalize() {    if (isCanonical())      return;    // So far we did not need this case. 
This is easy to implement but it is @@ -394,7 +413,7 @@ void Formula::Canonicalize() {  /// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.  /// \return true if it was possible to get rid of the scale, false otherwise.  /// \note After this operation the formula may not be in the canonical form. -bool Formula::Unscale() { +bool Formula::unscale() {    if (Scale != 1)      return false;    Scale = 0; @@ -403,15 +422,14 @@ bool Formula::Unscale() {    return true;  } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. +/// Return the total number of register operands used by this formula. This does +/// not include register uses implied by non-constant addrec strides.  size_t Formula::getNumRegs() const {    return !!ScaledReg + BaseRegs.size();  } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. +/// Return the type of this formula, if it has one, or null otherwise. This type +/// is meaningless except for the bit size.  Type *Formula::getType() const {    return !BaseRegs.empty() ? BaseRegs.front()->getType() :           ScaledReg ? ScaledReg->getType() : @@ -419,21 +437,21 @@ Type *Formula::getType() const {           nullptr;  } -/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. -void Formula::DeleteBaseReg(const SCEV *&S) { +/// Delete the given base reg from the BaseRegs list. +void Formula::deleteBaseReg(const SCEV *&S) {    if (&S != &BaseRegs.back())      std::swap(S, BaseRegs.back());    BaseRegs.pop_back();  } -/// referencesReg - Test if this formula references the given register. +/// Test if this formula references the given register.  bool Formula::referencesReg(const SCEV *S) const {    return S == ScaledReg ||           std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();  } -/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers -/// which are used by uses other than the use with the given index. +/// Test whether this formula uses registers which are used by uses other than +/// the use with the given index.  bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,                                           const RegUseTracker &RegUses) const {    if (ScaledReg) @@ -481,30 +499,29 @@ void Formula::print(raw_ostream &OS) const {    }  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void Formula::dump() const {    print(errs()); errs() << '\n';  } -#endif -/// isAddRecSExtable - Return true if the given addrec can be sign-extended -/// without changing its value. +/// Return true if the given addrec can be sign-extended without changing its +/// value.  static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {    Type *WideTy =      IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);    return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));  } -/// isAddSExtable - Return true if the given add can be sign-extended -/// without changing its value. +/// Return true if the given add can be sign-extended without changing its +/// value.  
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {    Type *WideTy =      IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);    return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));  } -/// isMulSExtable - Return true if the given mul can be sign-extended -/// without changing its value. +/// Return true if the given mul can be sign-extended without changing its +/// value.  static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {    Type *WideTy =      IntegerType::get(SE.getContext(), @@ -512,12 +529,11 @@ static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {    return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));  } -/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined -/// and if the remainder is known to be zero,  or null otherwise. If -/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified -/// to Y, ignoring that the multiplication may overflow, which is useful when -/// the result will be used in a context where the most significant bits are -/// ignored. +/// Return an expression for LHS /s RHS, if it can be determined and if the +/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits +/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that +/// the multiplication may overflow, which is useful when the result will be +/// used in a context where the most significant bits are ignored.  static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,                                  ScalarEvolution &SE,                                  bool IgnoreSignificantBits = false) { @@ -528,7 +544,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,    // Handle a few RHS special cases.    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);    if (RC) { -    const APInt &RA = RC->getValue()->getValue(); +    const APInt &RA = RC->getAPInt();      // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do      // some folding.      if (RA.isAllOnesValue()) @@ -542,8 +558,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {      if (!RC)        return nullptr; -    const APInt &LA = C->getValue()->getValue(); -    const APInt &RA = RC->getValue()->getValue(); +    const APInt &LA = C->getAPInt(); +    const APInt &RA = RC->getAPInt();      if (LA.srem(RA) != 0)        return nullptr;      return SE.getConstant(LA.sdiv(RA)); @@ -603,12 +619,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,    return nullptr;  } -/// ExtractImmediate - If S involves the addition of a constant integer value, -/// return that integer value, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a constant integer value, return that integer +/// value, and mutate S to point to a new SCEV with that value excluded.  
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { -    if (C->getValue()->getValue().getMinSignedBits() <= 64) { +    if (C->getAPInt().getMinSignedBits() <= 64) {        S = SE.getConstant(C->getType(), 0);        return C->getValue()->getSExtValue();      } @@ -630,9 +645,8 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {    return 0;  } -/// ExtractSymbol - If S involves the addition of a GlobalValue address, -/// return that symbol, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a GlobalValue address, return that symbol, and +/// mutate S to point to a new SCEV with that value excluded.  static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {    if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {      if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { @@ -657,8 +671,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {    return nullptr;  } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. +/// Returns true if the specified instruction is using the specified value as an +/// address.  static bool isAddressUse(Instruction *Inst, Value *OperandVal) {    bool isAddress = isa<LoadInst>(Inst);    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -682,12 +696,15 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {    return isAddress;  } -/// getAccessType - Return the type of the memory being accessed. -static Type *getAccessType(const Instruction *Inst) { -  Type *AccessTy = Inst->getType(); -  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) -    AccessTy = SI->getOperand(0)->getType(); -  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { +/// Return the type of the memory being accessed. +static MemAccessTy getAccessType(const Instruction *Inst) { +  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); +  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { +    AccessTy.MemTy = SI->getOperand(0)->getType(); +    AccessTy.AddrSpace = SI->getPointerAddressSpace(); +  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { +    AccessTy.AddrSpace = LI->getPointerAddressSpace(); +  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {      // Addressing modes can also be folded into prefetches and a variety      // of intrinsics.      switch (II->getIntrinsicID()) { @@ -696,21 +713,21 @@ static Type *getAccessType(const Instruction *Inst) {      case Intrinsic::x86_sse2_storeu_pd:      case Intrinsic::x86_sse2_storeu_dq:      case Intrinsic::x86_sse2_storel_dq: -      AccessTy = II->getArgOperand(0)->getType(); +      AccessTy.MemTy = II->getArgOperand(0)->getType();        break;      }    }    // All pointers have the same requirements, so canonicalize them to an    // arbitrary pointer type to minimize variation. -  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) -    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), -                                PTy->getAddressSpace()); +  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) +    AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), +                                      PTy->getAddressSpace());    return AccessTy;  } -/// isExistingPhi - Return true if this AddRec is already a phi in its loop. 
+/// Return true if this AddRec is already a phi in its loop.  static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {    for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();         PHINode *PN = dyn_cast<PHINode>(I); ++I) { @@ -793,9 +810,8 @@ static bool isHighCostExpansion(const SCEV *S,    return true;  } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. +/// If any of the instructions is the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead.  static bool  DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {    bool Changed = false; @@ -842,7 +858,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,  namespace { -/// Cost - This class is used to measure and compare candidate formulae. +/// This class is used to measure and compare candidate formulae.  class Cost {    /// TODO: Some of these could be merged. Also, a lexical ordering    /// isn't always optimal. @@ -905,7 +921,7 @@ private:  } -/// RateRegister - Tally up interesting quantities from the given register. +/// Tally up interesting quantities from the given register.  void Cost::RateRegister(const SCEV *Reg,                          SmallPtrSetImpl<const SCEV *> &Regs,                          const Loop *L, @@ -951,9 +967,9 @@ void Cost::RateRegister(const SCEV *Reg,                   SE.hasComputableLoopEvolution(Reg, L);  } -/// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. Optional LoserRegs provides a way to declare any formula -/// that refers to one of those regs an instant loser. +/// Record this register in the set. If we haven't seen it before, rate +/// it. Optional LoserRegs provides a way to declare any formula that refers to +/// one of those regs an instant loser.  void Cost::RatePrimaryRegister(const SCEV *Reg,                                 SmallPtrSetImpl<const SCEV *> &Regs,                                 const Loop *L, @@ -1024,7 +1040,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,    assert(isValid() && "invalid cost");  } -/// Lose - Set this cost to a losing value. +/// Set this cost to a losing value.  void Cost::Lose() {    NumRegs = ~0u;    AddRecCost = ~0u; @@ -1035,7 +1051,7 @@ void Cost::Lose() {    ScaleCost = ~0u;  } -/// operator< - Choose the lower cost. +/// Choose the lower cost.  bool Cost::operator<(const Cost &Other) const {    return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,                    ImmCost, SetupCost) < @@ -1061,37 +1077,35 @@ void Cost::print(raw_ostream &OS) const {      OS << ", plus " << SetupCost << " setup cost";  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void Cost::dump() const {    print(errs()); errs() << '\n';  } -#endif  namespace { -/// LSRFixup - An operand value in an instruction which is to be replaced -/// with some equivalent, possibly strength-reduced, replacement. +/// An operand value in an instruction which is to be replaced with some +/// equivalent, possibly strength-reduced, replacement.  struct LSRFixup { -  /// UserInst - The instruction which will be updated. +  /// The instruction which will be updated.    Instruction *UserInst; -  /// OperandValToReplace - The operand of the instruction which will -  /// be replaced. 
The operand may be used more than once; every instance -  /// will be replaced. +  /// The operand of the instruction which will be replaced. The operand may be +  /// used more than once; every instance will be replaced.    Value *OperandValToReplace; -  /// PostIncLoops - If this user is to use the post-incremented value of an -  /// induction variable, this variable is non-null and holds the loop -  /// associated with the induction variable. +  /// If this user is to use the post-incremented value of an induction +  /// variable, this variable is non-null and holds the loop associated with the +  /// induction variable.    PostIncLoopSet PostIncLoops; -  /// LUIdx - The index of the LSRUse describing the expression which -  /// this fixup needs, minus an offset (below). +  /// The index of the LSRUse describing the expression which this fixup needs, +  /// minus an offset (below).    size_t LUIdx; -  /// Offset - A constant offset to be added to the LSRUse expression. -  /// This allows multiple fixups to share the same LSRUse with different -  /// offsets, for example in an unrolled loop. +  /// A constant offset to be added to the LSRUse expression.  This allows +  /// multiple fixups to share the same LSRUse with different offsets, for +  /// example in an unrolled loop.    int64_t Offset;    bool isUseFullyOutsideLoop(const Loop *L) const; @@ -1108,8 +1122,7 @@ LSRFixup::LSRFixup()    : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),      Offset(0) {} -/// isUseFullyOutsideLoop - Test whether this fixup always uses its -/// value outside of the given loop. +/// Test whether this fixup always uses its value outside of the given loop.  bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {    // PHI nodes use their value in their incoming blocks.    if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { @@ -1149,16 +1162,15 @@ void LSRFixup::print(raw_ostream &OS) const {      OS << ", Offset=" << Offset;  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void LSRFixup::dump() const {    print(errs()); errs() << '\n';  } -#endif  namespace { -/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. +/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted +/// SmallVectors of const SCEV*.  struct UniquifierDenseMapInfo {    static SmallVector<const SCEV *, 4> getEmptyKey() {      SmallVector<const SCEV *, 4>  V; @@ -1182,17 +1194,17 @@ struct UniquifierDenseMapInfo {    }  }; -/// LSRUse - This class holds the state that LSR keeps for each use in -/// IVUsers, as well as uses invented by LSR itself. It includes information -/// about what kinds of things can be folded into the user, information about -/// the user itself, and information about how the use may be satisfied. -/// TODO: Represent multiple users of the same expression in common? +/// This class holds the state that LSR keeps for each use in IVUsers, as well +/// as uses invented by LSR itself. It includes information about what kinds of +/// things can be folded into the user, information about the user itself, and +/// information about how the use may be satisfied.  TODO: Represent multiple +/// users of the same expression in common?  class LSRUse {    DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;  public: -  /// KindType - An enum for a kind of use, indicating what types of -  /// scaled and immediate operands it might support. 
+  /// An enum for a kind of use, indicating what types of scaled and immediate +  /// operands it might support.    enum KindType {      Basic,   ///< A normal use, with no folding.      Special, ///< A special case of basic, allowing -1 scales. @@ -1204,15 +1216,14 @@ public:    typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;    KindType Kind; -  Type *AccessTy; +  MemAccessTy AccessTy;    SmallVector<int64_t, 8> Offsets;    int64_t MinOffset;    int64_t MaxOffset; -  /// AllFixupsOutsideLoop - This records whether all of the fixups using this -  /// LSRUse are outside of the loop, in which case some special-case heuristics -  /// may be used. +  /// This records whether all of the fixups using this LSRUse are outside of +  /// the loop, in which case some special-case heuristics may be used.    bool AllFixupsOutsideLoop;    /// RigidFormula is set to true to guarantee that this use will be associated @@ -1222,26 +1233,24 @@ public:    /// changing the formula.    bool RigidFormula; -  /// WidestFixupType - This records the widest use type for any fixup using -  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different -  /// max fixup widths to be equivalent, because the narrower one may be relying -  /// on the implicit truncation to truncate away bogus bits. +  /// This records the widest use type for any fixup using this +  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max +  /// fixup widths to be equivalent, because the narrower one may be relying on +  /// the implicit truncation to truncate away bogus bits.    Type *WidestFixupType; -  /// Formulae - A list of ways to build a value that can satisfy this user. -  /// After the list is populated, one of these is selected heuristically and -  /// used to formulate a replacement for OperandValToReplace in UserInst. +  /// A list of ways to build a value that can satisfy this user.  After the +  /// list is populated, one of these is selected heuristically and used to +  /// formulate a replacement for OperandValToReplace in UserInst.    SmallVector<Formula, 12> Formulae; -  /// Regs - The set of register candidates used by all formulae in this LSRUse. +  /// The set of register candidates used by all formulae in this LSRUse.    SmallPtrSet<const SCEV *, 4> Regs; -  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), -                                      MinOffset(INT64_MAX), -                                      MaxOffset(INT64_MIN), -                                      AllFixupsOutsideLoop(true), -                                      RigidFormula(false), -                                      WidestFixupType(nullptr) {} +  LSRUse(KindType K, MemAccessTy AT) +      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), +        AllFixupsOutsideLoop(true), RigidFormula(false), +        WidestFixupType(nullptr) {}    bool HasFormulaWithSameRegs(const Formula &F) const;    bool InsertFormula(const Formula &F); @@ -1254,8 +1263,8 @@ public:  } -/// HasFormula - Test whether this use as a formula which has the same -/// registers as the given formula. +/// Test whether this use as a formula which has the same registers as the given +/// formula.  
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {    SmallVector<const SCEV *, 4> Key = F.BaseRegs;    if (F.ScaledReg) Key.push_back(F.ScaledReg); @@ -1264,9 +1273,8 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {    return Uniquifier.count(Key);  } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. -/// The formula must be in canonical form. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise.  The formula must be in canonical form.  bool LSRUse::InsertFormula(const Formula &F) {    assert(F.isCanonical() && "Invalid canonical representation"); @@ -1300,14 +1308,14 @@ bool LSRUse::InsertFormula(const Formula &F) {    return true;  } -/// DeleteFormula - Remove the given formula from this use's list. +/// Remove the given formula from this use's list.  void LSRUse::DeleteFormula(Formula &F) {    if (&F != &Formulae.back())      std::swap(F, Formulae.back());    Formulae.pop_back();  } -/// RecomputeRegs - Recompute the Regs field, and update RegUses. +/// Recompute the Regs field, and update RegUses.  void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {    // Now that we've filtered out some formulae, recompute the Regs set.    SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); @@ -1320,7 +1328,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {    // Update the RegTracker.    for (const SCEV *S : OldRegs)      if (!Regs.count(S)) -      RegUses.DropRegister(S, LUIdx); +      RegUses.dropRegister(S, LUIdx);  }  void LSRUse::print(raw_ostream &OS) const { @@ -1331,10 +1339,13 @@ void LSRUse::print(raw_ostream &OS) const {    case ICmpZero: OS << "ICmpZero"; break;    case Address:      OS << "Address of "; -    if (AccessTy->isPointerTy()) +    if (AccessTy.MemTy->isPointerTy())        OS << "pointer"; // the full pointer type could be really verbose -    else -      OS << *AccessTy; +    else { +      OS << *AccessTy.MemTy; +    } + +    OS << " in addrspace(" << AccessTy.AddrSpace << ')';    }    OS << ", Offsets={"; @@ -1353,19 +1364,19 @@ void LSRUse::print(raw_ostream &OS) const {      OS << ", widest fixup type: " << *WidestFixupType;  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void LSRUse::dump() const {    print(errs()); errs() << '\n';  } -#endif  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, -                                 LSRUse::KindType Kind, Type *AccessTy, +                                 LSRUse::KindType Kind, MemAccessTy AccessTy,                                   GlobalValue *BaseGV, int64_t BaseOffset,                                   bool HasBaseReg, int64_t Scale) {    switch (Kind) {    case LSRUse::Address: -    return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); +    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, +                                     HasBaseReg, Scale, AccessTy.AddrSpace);    case LSRUse::ICmpZero:      // There's not even a target hook for querying whether it would be legal to @@ -1412,7 +1423,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,                                   int64_t MinOffset, int64_t MaxOffset, -                                 LSRUse::KindType Kind, Type *AccessTy, +                                 LSRUse::KindType Kind, 
MemAccessTy AccessTy,                                   GlobalValue *BaseGV, int64_t BaseOffset,                                   bool HasBaseReg, int64_t Scale) {    // Check for overflow. @@ -1433,7 +1444,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,  static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,                                   int64_t MinOffset, int64_t MaxOffset, -                                 LSRUse::KindType Kind, Type *AccessTy, +                                 LSRUse::KindType Kind, MemAccessTy AccessTy,                                   const Formula &F) {    // For the purpose of isAMCompletelyFolded either having a canonical formula    // or a scale not equal to zero is correct. @@ -1447,11 +1458,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,                                F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);  } -/// isLegalUse - Test whether we know how to expand the current formula. +/// Test whether we know how to expand the current formula.  static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, -                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, -                       GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, -                       int64_t Scale) { +                       int64_t MaxOffset, LSRUse::KindType Kind, +                       MemAccessTy AccessTy, GlobalValue *BaseGV, +                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {    // We know how to expand completely foldable formulae.    return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,                                BaseOffset, HasBaseReg, Scale) || @@ -1463,8 +1474,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,  }  static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, -                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, -                       const Formula &F) { +                       int64_t MaxOffset, LSRUse::KindType Kind, +                       MemAccessTy AccessTy, const Formula &F) {    return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,                      F.BaseOffset, F.HasBaseReg, F.Scale);  } @@ -1490,14 +1501,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,    switch (LU.Kind) {    case LSRUse::Address: {      // Check the scaling factor cost with both the min and max offsets. 
-    int ScaleCostMinOffset = -      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, -                               F.BaseOffset + LU.MinOffset, -                               F.HasBaseReg, F.Scale); -    int ScaleCostMaxOffset = -      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, -                               F.BaseOffset + LU.MaxOffset, -                               F.HasBaseReg, F.Scale); +    int ScaleCostMinOffset = TTI.getScalingFactorCost( +        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, +        F.Scale, LU.AccessTy.AddrSpace); +    int ScaleCostMaxOffset = TTI.getScalingFactorCost( +        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, +        F.Scale, LU.AccessTy.AddrSpace);      assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&             "Legal addressing mode has an illegal cost!"); @@ -1515,7 +1524,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,  }  static bool isAlwaysFoldable(const TargetTransformInfo &TTI, -                             LSRUse::KindType Kind, Type *AccessTy, +                             LSRUse::KindType Kind, MemAccessTy AccessTy,                               GlobalValue *BaseGV, int64_t BaseOffset,                               bool HasBaseReg) {    // Fast-path: zero is always foldable. @@ -1539,7 +1548,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,  static bool isAlwaysFoldable(const TargetTransformInfo &TTI,                               ScalarEvolution &SE, int64_t MinOffset,                               int64_t MaxOffset, LSRUse::KindType Kind, -                             Type *AccessTy, const SCEV *S, bool HasBaseReg) { +                             MemAccessTy AccessTy, const SCEV *S, +                             bool HasBaseReg) {    // Fast-path: zero is always foldable.    if (S->isZero()) return true; @@ -1564,9 +1574,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,  namespace { -/// IVInc - An individual increment in a Chain of IV increments. -/// Relate an IV user to an expression that computes the IV it uses from the IV -/// used by the previous link in the Chain. +/// An individual increment in a Chain of IV increments.  Relate an IV user to +/// an expression that computes the IV it uses from the IV used by the previous +/// link in the Chain.  ///  /// For the head of a chain, IncExpr holds the absolute SCEV expression for the  /// original IVOperand. The head of the chain's IVOperand is only valid during @@ -1582,8 +1592,8 @@ struct IVInc {      UserInst(U), IVOperand(O), IncExpr(E) {}  }; -// IVChain - The list of IV increments in program order. -// We typically add the head of a chain without finding subsequent links. +// The list of IV increments in program order.  We typically add the head of a +// chain without finding subsequent links.  struct IVChain {    SmallVector<IVInc,1> Incs;    const SCEV *ExprBase; @@ -1595,7 +1605,7 @@ struct IVChain {    typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; -  // begin - return the first increment in the chain. +  // Return the first increment in the chain.    const_iterator begin() const {      assert(!Incs.empty());      return std::next(Incs.begin()); @@ -1604,32 +1614,30 @@ struct IVChain {      return Incs.end();    } -  // hasIncs - Returns true if this chain contains any increments. +  // Returns true if this chain contains any increments.    
bool hasIncs() const { return Incs.size() >= 2; } -  // add - Add an IVInc to the end of this chain. +  // Add an IVInc to the end of this chain.    void add(const IVInc &X) { Incs.push_back(X); } -  // tailUserInst - Returns the last UserInst in the chain. +  // Returns the last UserInst in the chain.    Instruction *tailUserInst() const { return Incs.back().UserInst; } -  // isProfitableIncrement - Returns true if IncExpr can be profitably added to -  // this chain. +  // Returns true if IncExpr can be profitably added to this chain.    bool isProfitableIncrement(const SCEV *OperExpr,                               const SCEV *IncExpr,                               ScalarEvolution&);  }; -/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. -/// Distinguish between FarUsers that definitely cross IV increments and -/// NearUsers that may be used between IV increments. +/// Helper for CollectChains to track multiple IV increment uses.  Distinguish +/// between FarUsers that definitely cross IV increments and NearUsers that may +/// be used between IV increments.  struct ChainUsers {    SmallPtrSet<Instruction*, 4> FarUsers;    SmallPtrSet<Instruction*, 4> NearUsers;  }; -/// LSRInstance - This class holds state for the main loop strength reduction -/// logic. +/// This class holds state for the main loop strength reduction logic.  class LSRInstance {    IVUsers &IU;    ScalarEvolution &SE; @@ -1639,25 +1647,25 @@ class LSRInstance {    Loop *const L;    bool Changed; -  /// IVIncInsertPos - This is the insert position that the current loop's -  /// induction variable increment should be placed. In simple loops, this is -  /// the latch block's terminator. But in more complicated cases, this is a -  /// position which will dominate all the in-loop post-increment users. +  /// This is the insert position that the current loop's induction variable +  /// increment should be placed. In simple loops, this is the latch block's +  /// terminator. But in more complicated cases, this is a position which will +  /// dominate all the in-loop post-increment users.    Instruction *IVIncInsertPos; -  /// Factors - Interesting factors between use strides. +  /// Interesting factors between use strides.    SmallSetVector<int64_t, 8> Factors; -  /// Types - Interesting use types, to facilitate truncation reuse. +  /// Interesting use types, to facilitate truncation reuse.    SmallSetVector<Type *, 4> Types; -  /// Fixups - The list of operands which are to be replaced. +  /// The list of operands which are to be replaced.    SmallVector<LSRFixup, 16> Fixups; -  /// Uses - The list of interesting uses. +  /// The list of interesting uses.    SmallVector<LSRUse, 16> Uses; -  /// RegUses - Track which uses use which register candidates. +  /// Track which uses use which register candidates.    RegUseTracker RegUses;    // Limit the number of chains to avoid quadratic behavior. We don't expect to @@ -1665,10 +1673,10 @@ class LSRInstance {    // back to normal LSR behavior for those uses.    static const unsigned MaxChains = 8; -  /// IVChainVec - IV users can form a chain of IV increments. +  /// IV users can form a chain of IV increments.    SmallVector<IVChain, MaxChains> IVChainVec; -  /// IVIncSet - IV users that belong to profitable IVChains. +  /// IV users that belong to profitable IVChains.    
SmallPtrSet<Use*, MaxChains> IVIncSet;    void OptimizeShadowIV(); @@ -1696,11 +1704,10 @@ class LSRInstance {    UseMapTy UseMap;    bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, -                          LSRUse::KindType Kind, Type *AccessTy); +                          LSRUse::KindType Kind, MemAccessTy AccessTy); -  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, -                                    LSRUse::KindType Kind, -                                    Type *AccessTy); +  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, +                                    MemAccessTy AccessTy);    void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1769,18 +1776,16 @@ class LSRInstance {    void RewriteForPHI(PHINode *PN, const LSRFixup &LF,                       const Formula &F,                       SCEVExpander &Rewriter, -                     SmallVectorImpl<WeakVH> &DeadInsts, -                     Pass *P) const; +                     SmallVectorImpl<WeakVH> &DeadInsts) const;    void Rewrite(const LSRFixup &LF,                 const Formula &F,                 SCEVExpander &Rewriter, -               SmallVectorImpl<WeakVH> &DeadInsts, -               Pass *P) const; -  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, -                         Pass *P); +               SmallVectorImpl<WeakVH> &DeadInsts) const; +  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);  public: -  LSRInstance(Loop *L, Pass *P); +  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, +              LoopInfo &LI, const TargetTransformInfo &TTI);    bool getChanged() const { return Changed; } @@ -1793,8 +1798,8 @@ public:  } -/// OptimizeShadowIV - If IV is used in a int-to-float cast -/// inside the loop then try to eliminate the cast operation. +/// If IV is used in a int-to-float cast inside the loop then try to eliminate +/// the cast operation.  void LSRInstance::OptimizeShadowIV() {    const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);    if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) @@ -1902,9 +1907,8 @@ void LSRInstance::OptimizeShadowIV() {    }  } -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. +/// If Cond has an operand that is an expression of an IV, set the IV user and +/// stride information and return true, otherwise return false.  bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {    for (IVStrideUse &U : IU)      if (U.getUser() == Cond) { @@ -1917,8 +1921,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {    return false;  } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. +/// Rewrite the loop's terminating condition if it uses a max computation.  ///  /// This is a narrow solution to a specific, but acute, problem. For loops  /// like this: @@ -2076,8 +2079,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {    return NewCond;  } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. +/// Change loop terminating condition to use the postinc iv when possible.  
void  LSRInstance::OptimizeLoopTermCond() {    SmallPtrSet<Instruction *, 4> PostIncs; @@ -2152,16 +2154,18 @@ LSRInstance::OptimizeLoopTermCond() {                  C->getValue().isMinSignedValue())                goto decline_post_inc;              // Check for possible scaled-address reuse. -            Type *AccessTy = getAccessType(UI->getUser()); +            MemAccessTy AccessTy = getAccessType(UI->getUser());              int64_t Scale = C->getSExtValue(); -            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, -                                          /*BaseOffset=*/ 0, -                                          /*HasBaseReg=*/ false, Scale)) +            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, +                                          /*BaseOffset=*/0, +                                          /*HasBaseReg=*/false, Scale, +                                          AccessTy.AddrSpace))                goto decline_post_inc;              Scale = -Scale; -            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, -                                          /*BaseOffset=*/ 0, -                                          /*HasBaseReg=*/ false, Scale)) +            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, +                                          /*BaseOffset=*/0, +                                          /*HasBaseReg=*/false, Scale, +                                          AccessTy.AddrSpace))                goto decline_post_inc;            }          } @@ -2180,7 +2184,7 @@ LSRInstance::OptimizeLoopTermCond() {          ICmpInst *OldCond = Cond;          Cond = cast<ICmpInst>(Cond->clone());          Cond->setName(L->getHeader()->getName() + ".termcond"); -        ExitingBlock->getInstList().insert(TermBr, Cond); +        ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);          // Clone the IVUse, as the old use still exists!          CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); @@ -2213,15 +2217,14 @@ LSRInstance::OptimizeLoopTermCond() {    }  } -/// reconcileNewOffset - Determine if the given use can accommodate a fixup -/// at the given offset and other details. If so, update the use and -/// return true. -bool -LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, -                                LSRUse::KindType Kind, Type *AccessTy) { +/// Determine if the given use can accommodate a fixup at the given offset and +/// other details. If so, update the use and return true. +bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, +                                     bool HasBaseReg, LSRUse::KindType Kind, +                                     MemAccessTy AccessTy) {    int64_t NewMinOffset = LU.MinOffset;    int64_t NewMaxOffset = LU.MaxOffset; -  Type *NewAccessTy = AccessTy; +  MemAccessTy NewAccessTy = AccessTy;    // Check for a mismatched kind. It's tempting to collapse mismatched kinds to    // something conservative, however this can pessimize in the case that one of @@ -2232,8 +2235,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,    // Check for a mismatched access type, and fall back conservatively as needed.    // TODO: Be less conservative when the type is similar and can use the same    // addressing modes. 
-  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) -    NewAccessTy = Type::getVoidTy(AccessTy->getContext()); +  if (Kind == LSRUse::Address) { +    if (AccessTy != LU.AccessTy) +      NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext()); +  }    // Conservatively assume HasBaseReg is true for now.    if (NewOffset < LU.MinOffset) { @@ -2257,12 +2262,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,    return true;  } -/// getUse - Return an LSRUse index and an offset value for a fixup which -/// needs the given expression, with the given kind and optional access type. -/// Either reuse an existing use or create a new one, as needed. -std::pair<size_t, int64_t> -LSRInstance::getUse(const SCEV *&Expr, -                    LSRUse::KindType Kind, Type *AccessTy) { +/// Return an LSRUse index and an offset value for a fixup which needs the given +/// expression, with the given kind and optional access type.  Either reuse an +/// existing use or create a new one, as needed. +std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, +                                               LSRUse::KindType Kind, +                                               MemAccessTy AccessTy) {    const SCEV *Copy = Expr;    int64_t Offset = ExtractImmediate(Expr, SE); @@ -2300,18 +2305,18 @@ LSRInstance::getUse(const SCEV *&Expr,    return std::make_pair(LUIdx, Offset);  } -/// DeleteUse - Delete the given use from the Uses list. +/// Delete the given use from the Uses list.  void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {    if (&LU != &Uses.back())      std::swap(LU, Uses.back());    Uses.pop_back();    // Update RegUses. -  RegUses.SwapAndDropUse(LUIdx, Uses.size()); +  RegUses.swapAndDropUse(LUIdx, Uses.size());  } -/// FindUseWithFormula - Look for a use distinct from OrigLU which is has -/// a formula that has the same registers as the given formula. +/// Look for a use distinct from OrigLU which is has a formula that has the same +/// registers as the given formula.  LSRUse *  LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,                                         const LSRUse &OrigLU) { @@ -2396,14 +2401,14 @@ void LSRInstance::CollectInterestingTypesAndFactors() {        if (const SCEVConstant *Factor =              dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,                                                          SE, true))) { -        if (Factor->getValue()->getValue().getMinSignedBits() <= 64) -          Factors.insert(Factor->getValue()->getValue().getSExtValue()); +        if (Factor->getAPInt().getMinSignedBits() <= 64) +          Factors.insert(Factor->getAPInt().getSExtValue());        } else if (const SCEVConstant *Factor =                     dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,                                                                 NewStride,                                                                 SE, true))) { -        if (Factor->getValue()->getValue().getMinSignedBits() <= 64) -          Factors.insert(Factor->getValue()->getValue().getSExtValue()); +        if (Factor->getAPInt().getMinSignedBits() <= 64) +          Factors.insert(Factor->getAPInt().getSExtValue());        }      } @@ -2415,9 +2420,9 @@ void LSRInstance::CollectInterestingTypesAndFactors() {    DEBUG(print_factors_and_types(dbgs()));  } -/// findIVOperand - Helper for CollectChains that finds an IV operand (computed -/// by an AddRec in this loop) within [OI,OE) or returns OE. 
If IVUsers mapped -/// Instructions to IVStrideUses, we could partially skip this. +/// Helper for CollectChains that finds an IV operand (computed by an AddRec in +/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to +/// IVStrideUses, we could partially skip this.  static User::op_iterator  findIVOperand(User::op_iterator OI, User::op_iterator OE,                Loop *L, ScalarEvolution &SE) { @@ -2436,29 +2441,28 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE,    return OI;  } -/// getWideOperand - IVChain logic must consistenctly peek base TruncInst -/// operands, so wrap it in a convenient helper. +/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in +/// a convenient helper.  static Value *getWideOperand(Value *Oper) {    if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))      return Trunc->getOperand(0);    return Oper;  } -/// isCompatibleIVType - Return true if we allow an IV chain to include both -/// types. +/// Return true if we allow an IV chain to include both types.  static bool isCompatibleIVType(Value *LVal, Value *RVal) {    Type *LType = LVal->getType();    Type *RType = RVal->getType();    return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());  } -/// getExprBase - Return an approximation of this SCEV expression's "base", or -/// NULL for any constant. Returning the expression itself is -/// conservative. Returning a deeper subexpression is more precise and valid as -/// long as it isn't less complex than another subexpression. For expressions -/// involving multiple unscaled values, we need to return the pointer-type -/// SCEVUnknown. This avoids forming chains across objects, such as: -/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// Return an approximation of this SCEV expression's "base", or NULL for any +/// constant. Returning the expression itself is conservative. Returning a +/// deeper subexpression is more precise and valid as long as it isn't less +/// complex than another subexpression. For expressions involving multiple +/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids +/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i], +/// IVInc==b-a.  ///  /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost  /// SCEVUnknown, we simply return the rightmost SCEV operand. @@ -2601,8 +2605,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,    return cost < 0;  } -/// ChainInstruction - Add this IV user to an existing chain or make it the head -/// of a new chain. +/// Add this IV user to an existing chain or make it the head of a new chain.  void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,                                     SmallVectorImpl<ChainUsers> &ChainUsersVec) {    // When IVs are used as types of varying widths, they are generally converted @@ -2714,7 +2717,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,    ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);  } -/// CollectChains - Populate the vector of Chains. +/// Populate the vector of Chains.  ///  /// This decreases ILP at the architecture level. 
Targets with ample registers,  /// multiple memory ports, and no register renaming probably don't want @@ -2755,19 +2758,19 @@ void LSRInstance::CollectChains() {      for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();           I != E; ++I) {        // Skip instructions that weren't seen by IVUsers analysis. -      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) +      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))          continue;        // Ignore users that are part of a SCEV expression. This way we only        // consider leaf IV Users. This effectively rediscovers a portion of        // IVUsers analysis but in program order this time. -      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) +      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))          continue;        // Remove this instruction from any NearUsers set it may be in.        for (unsigned ChainIdx = 0, NChains = IVChainVec.size();             ChainIdx < NChains; ++ChainIdx) { -        ChainUsersVec[ChainIdx].NearUsers.erase(I); +        ChainUsersVec[ChainIdx].NearUsers.erase(&*I);        }        // Search for operands that can be chained.        SmallPtrSet<Instruction*, 4> UniqueOperands; @@ -2776,7 +2779,7 @@ void LSRInstance::CollectChains() {        while (IVOpIter != IVOpEnd) {          Instruction *IVOpInst = cast<Instruction>(*IVOpIter);          if (UniqueOperands.insert(IVOpInst).second) -          ChainInstruction(I, IVOpInst, ChainUsersVec); +          ChainInstruction(&*I, IVOpInst, ChainUsersVec);          IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);        }      } // Continue walking down the instructions. @@ -2828,20 +2831,20 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,    if (!IncConst || !isAddressUse(UserInst, Operand))      return false; -  if (IncConst->getValue()->getValue().getMinSignedBits() > 64) +  if (IncConst->getAPInt().getMinSignedBits() > 64)      return false; +  MemAccessTy AccessTy = getAccessType(UserInst);    int64_t IncOffset = IncConst->getValue()->getSExtValue(); -  if (!isAlwaysFoldable(TTI, LSRUse::Address, -                        getAccessType(UserInst), /*BaseGV=*/ nullptr, -                        IncOffset, /*HaseBaseReg=*/ false)) +  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, +                        IncOffset, /*HaseBaseReg=*/false))      return false;    return true;  } -/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to -/// materialize the IV user's operand from the previous IV user's operand. +/// Generate an add or subtract for each IVInc in a chain to materialize the IV +/// user's operand from the previous IV user's operand.  void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,                                    SmallVectorImpl<WeakVH> &DeadInsts) {    // Find the new IVOperand for the head of the chain. 
It may have been replaced @@ -2961,7 +2964,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {      LF.PostIncLoops = U.getPostIncLoops();      LSRUse::KindType Kind = LSRUse::Basic; -    Type *AccessTy = nullptr; +    MemAccessTy AccessTy;      if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {        Kind = LSRUse::Address;        AccessTy = getAccessType(LF.UserInst); @@ -3027,9 +3030,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {    DEBUG(print_fixups(dbgs()));  } -/// InsertInitialFormula - Insert a formula for the given expression into -/// the given use, separating out loop-variant portions from loop-invariant -/// and loop-computable portions. +/// Insert a formula for the given expression into the given use, separating out +/// loop-variant portions from loop-invariant and loop-computable portions.  void  LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {    // Mark uses whose expressions cannot be expanded. @@ -3037,13 +3039,13 @@ LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {      LU.RigidFormula = true;    Formula F; -  F.InitialMatch(S, L, SE); +  F.initialMatch(S, L, SE);    bool Inserted = InsertFormula(LU, LUIdx, F);    assert(Inserted && "Initial formula already exists!"); (void)Inserted;  } -/// InsertSupplementalFormula - Insert a simple single-register formula for -/// the given expression into the given use. +/// Insert a simple single-register formula for the given expression into the +/// given use.  void  LSRInstance::InsertSupplementalFormula(const SCEV *S,                                         LSRUse &LU, size_t LUIdx) { @@ -3054,17 +3056,16 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S,    assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;  } -/// CountRegisters - Note which registers are used by the given formula, -/// updating RegUses. +/// Note which registers are used by the given formula, updating RegUses.  void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {    if (F.ScaledReg) -    RegUses.CountRegister(F.ScaledReg, LUIdx); +    RegUses.countRegister(F.ScaledReg, LUIdx);    for (const SCEV *BaseReg : F.BaseRegs) -    RegUses.CountRegister(BaseReg, LUIdx); +    RegUses.countRegister(BaseReg, LUIdx);  } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise.  bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {    // Do not insert formula that we will not be able to expand.    assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && @@ -3076,9 +3077,9 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {    return true;  } -/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of -/// loop-invariant values which we're tracking. These other uses will pin these -/// values in registers, making them less profitable for elimination. +/// Check for other uses of loop-invariant values which we're tracking. These +/// other uses will pin these values in registers, making them less profitable +/// for elimination.  /// TODO: This currently misses non-constant addrec step registers.  /// TODO: Should this give more weight to users inside the loop?  
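For readers less familiar with LSR, the IV chains collected by ChainInstruction and emitted by GenerateIVChain above are sequences of addresses in which each member can be produced from the previous one by adding a small constant, instead of expanding every address from the induction variable independently. An illustrative loop of that shape (not taken from the patch):

// Each address is the previous one plus 4 bytes, so once p + i*4 exists the
// other two accesses need only an add apiece - the increment GenerateIVChain
// materializes from the previous IV user's operand.
void scaleTriples(float *p, float k, int n) {
  for (int i = 0; i + 2 < n; i += 3) {
    p[i]     *= k;
    p[i + 1] *= k;
    p[i + 2] *= k;
  }
}

Whether those adds cost anything is what canFoldIVIncExpr above checks: an increment small enough to fold into the target's addressing mode needs no separate instruction.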
void @@ -3124,6 +3125,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {              PHINode::getIncomingValueNumForOperand(U.getOperandNo()));          if (!DT.dominates(L->getHeader(), UseBB))            continue; +        // Don't bother if the instruction is in a BB which ends in an EHPad. +        if (UseBB->getTerminator()->isEHPad()) +          continue;          // Ignore uses which are part of other SCEV expressions, to avoid          // analyzing them multiple times.          if (SE.isSCEVable(UserInst->getType())) { @@ -3148,7 +3152,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {          LSRFixup &LF = getNewFixup();          LF.UserInst = const_cast<Instruction *>(UserInst);          LF.OperandValToReplace = U; -        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); +        std::pair<size_t, int64_t> P = getUse( +            S, LSRUse::Basic, MemAccessTy());          LF.LUIdx = P.first;          LF.Offset = P.second;          LSRUse &LU = Uses[LF.LUIdx]; @@ -3165,8 +3170,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {    }  } -/// CollectSubexprs - Split S into subexpressions which can be pulled out into -/// separate registers. If C is non-null, multiply each subexpression by C. +/// Split S into subexpressions which can be pulled out into separate +/// registers. If C is non-null, multiply each subexpression by C.  ///  /// Return remainder expression after factoring the subexpressions captured by  /// Ops. If Ops is complete, return NULL. @@ -3300,7 +3305,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,        F.BaseRegs.push_back(*J);      // We may have changed the number of register in base regs, adjust the      // formula accordingly. -    F.Canonicalize(); +    F.canonicalize();      if (InsertFormula(LU, LUIdx, F))        // If that formula hadn't been seen before, recurse to find more like @@ -3309,8 +3314,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,    }  } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. +/// Split out subexpressions from adds and the bases of addrecs.  void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,                                           Formula Base, unsigned Depth) {    assert(Base.isCanonical() && "Input must be in the canonical form"); @@ -3326,8 +3330,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,                                 /* Idx */ -1, /* IsScaledReg */ true);  } -/// GenerateCombinations - Generate a formula consisting of all of the -/// loop-dominating registers added into a single register. +///  Generate a formula consisting of all of the loop-dominating registers added +/// into a single register.  void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,                                         Formula Base) {    // This method is only interesting on a plurality of registers. @@ -3336,7 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,    // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before    // processing the formula. -  Base.Unscale(); +  Base.unscale();    Formula F = Base;    F.BaseRegs.clear();    SmallVector<const SCEV *, 4> Ops; @@ -3354,7 +3358,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,      // rather than proceed with zero in a register.      
if (!Sum->isZero()) {        F.BaseRegs.push_back(Sum); -      F.Canonicalize(); +      F.canonicalize();        (void)InsertFormula(LU, LUIdx, F);      }    } @@ -3379,7 +3383,7 @@ void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,    (void)InsertFormula(LU, LUIdx, F);  } -/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +/// Generate reuse formulae using symbolic offsets.  void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,                                            Formula Base) {    // We can't add a symbolic offset if the address already contains one. @@ -3410,8 +3414,8 @@ void LSRInstance::GenerateConstantOffsetsImpl(            F.Scale = 0;            F.ScaledReg = nullptr;          } else -          F.DeleteBaseReg(F.BaseRegs[Idx]); -        F.Canonicalize(); +          F.deleteBaseReg(F.BaseRegs[Idx]); +        F.canonicalize();        } else if (IsScaledReg)          F.ScaledReg = NewG;        else @@ -3452,8 +3456,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,                                  /* IsScaledReg */ true);  } -/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up -/// the comparison. For example, x == y -> x*c == y*c. +/// For ICmpZero, check to see if we can scale up the comparison. For example, x +/// == y -> x*c == y*c.  void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,                                           Formula Base) {    if (LU.Kind != LSRUse::ICmpZero) return; @@ -3538,8 +3542,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,    }  } -/// GenerateScales - Generate stride factor reuse formulae by making use of -/// scaled-offset address modes, for example. +/// Generate stride factor reuse formulae by making use of scaled-offset address +/// modes, for example.  void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {    // Determine the integer type for the base formula.    Type *IntTy = Base.getType(); @@ -3547,10 +3551,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {    // If this Formula already has a scaled register, we can't add another one.    // Try to unscale the formula to generate a better scale. -  if (Base.Scale != 0 && !Base.Unscale()) +  if (Base.Scale != 0 && !Base.unscale())      return; -  assert(Base.Scale == 0 && "Unscale did not did its job!"); +  assert(Base.Scale == 0 && "unscale did not did its job!");    // Check each interesting stride.    for (int64_t Factor : Factors) { @@ -3587,7 +3591,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {            // TODO: This could be optimized to avoid all the copying.            Formula F = Base;            F.ScaledReg = Quotient; -          F.DeleteBaseReg(F.BaseRegs[i]); +          F.deleteBaseReg(F.BaseRegs[i]);            // The canonical representation of 1*reg is reg, which is already in            // Base. In that case, do not try to insert the formula, it will be            // rejected anyway. @@ -3599,7 +3603,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {    }  } -/// GenerateTruncates - Generate reuse formulae from different IV types. +/// Generate reuse formulae from different IV types.  void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {    // Don't bother truncating symbolic values.    
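The GenerateICmpZeroScales comment above summarizes the rewrite as x == y -> x*c == y*c; a concrete loop (illustrative only, not from the patch) makes the payoff clearer:

// If the loop already needs i*4 to address p[i], the exit test can be scaled
// to match: "i == n" becomes "i*4 == n*4", so no separate unscaled copy of i
// has to be kept live just to feed the compare.
long sumInts(const int *p, long n) {
  long s = 0;
  for (long i = 0; i != n; ++i)
    s += p[i];
  return s;
}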
if (Base.BaseGV) return; @@ -3629,9 +3633,9 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {  namespace { -/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to -/// defer modifications so that the search phase doesn't have to worry about -/// the data structures moving underneath it. +/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer +/// modifications so that the search phase doesn't have to worry about the data +/// structures moving underneath it.  struct WorkItem {    size_t LUIdx;    int64_t Imm; @@ -3651,14 +3655,13 @@ void WorkItem::print(raw_ostream &OS) const {       << " , add offset " << Imm;  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void WorkItem::dump() const {    print(errs()); errs() << '\n';  } -#endif -/// GenerateCrossUseConstantOffsets - Look for registers which are a constant -/// distance apart and try to form reuse opportunities between them. +/// Look for registers which are a constant distance apart and try to form reuse +/// opportunities between them.  void LSRInstance::GenerateCrossUseConstantOffsets() {    // Group the registers by their value without any added constant offset.    typedef std::map<int64_t, const SCEV *> ImmMapTy; @@ -3751,7 +3754,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {        // very similar but slightly different. Investigate if they        // could be merged. That way, we would not have to unscale the        // Formula. -      F.Unscale(); +      F.unscale();        // Use the immediate in the scaled register.        if (F.ScaledReg == OrigReg) {          int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3770,14 +3773,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {          // value to the immediate would produce a value closer to zero than the          // immediate itself, then the formula isn't worthwhile.          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) -          if (C->getValue()->isNegative() != -                (NewF.BaseOffset < 0) && -              (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) -                .ule(std::abs(NewF.BaseOffset))) +          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && +              (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) +                  .ule(std::abs(NewF.BaseOffset)))              continue;          // OK, looks good. -        NewF.Canonicalize(); +        NewF.canonicalize();          (void)InsertFormula(LU, LUIdx, NewF);        } else {          // Use the immediate in a base register. @@ -3801,15 +3803,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {            // zero than the immediate itself, then the formula isn't worthwhile.            
for (const SCEV *NewReg : NewF.BaseRegs)              if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) -              if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( -                   std::abs(NewF.BaseOffset)) && -                  (C->getValue()->getValue() + -                   NewF.BaseOffset).countTrailingZeros() >= -                   countTrailingZeros<uint64_t>(NewF.BaseOffset)) +              if ((C->getAPInt() + NewF.BaseOffset) +                      .abs() +                      .slt(std::abs(NewF.BaseOffset)) && +                  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= +                      countTrailingZeros<uint64_t>(NewF.BaseOffset))                  goto skip_formula;            // Ok, looks good. -          NewF.Canonicalize(); +          NewF.canonicalize();            (void)InsertFormula(LU, LUIdx, NewF);            break;          skip_formula:; @@ -3819,7 +3821,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {    }  } -/// GenerateAllReuseFormulae - Generate formulae for each use. +/// Generate formulae for each use.  void  LSRInstance::GenerateAllReuseFormulae() {    // This is split into multiple loops so that hasRegsUsedByUsesOtherThan @@ -3959,10 +3961,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {  // This is a rough guess that seems to work fairly well.  static const size_t ComplexityLimit = UINT16_MAX; -/// EstimateSearchSpaceComplexity - Estimate the worst-case number of -/// solutions the solver might have to consider. It almost never considers -/// this many solutions because it prune the search space, but the pruning -/// isn't always sufficient. +/// Estimate the worst-case number of solutions the solver might have to +/// consider. It almost never considers this many solutions because it prune the +/// search space, but the pruning isn't always sufficient.  size_t LSRInstance::EstimateSearchSpaceComplexity() const {    size_t Power = 1;    for (const LSRUse &LU : Uses) { @@ -3978,10 +3979,9 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {    return Power;  } -/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset -/// of the registers of another formula, it won't help reduce register -/// pressure (though it may not necessarily hurt register pressure); remove -/// it to simplify the system. +/// When one formula uses a superset of the registers of another formula, it +/// won't help reduce register pressure (though it may not necessarily hurt +/// register pressure); remove it to simplify the system.  void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {    if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {      DEBUG(dbgs() << "The search space is too complex.\n"); @@ -4042,9 +4042,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {    }  } -/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers -/// for expressions like A, A+1, A+2, etc., allocate a single register for -/// them. +/// When there are many registers for expressions like A, A+1, A+2, etc., +/// allocate a single register for them.  
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {    if (EstimateSearchSpaceComplexity() < ComplexityLimit)      return; @@ -4121,8 +4120,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {    DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));  } -/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call -/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that  /// we've done more filtering, as it may be able to find more formulae to  /// eliminate.  void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ @@ -4139,9 +4137,9 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){    }  } -/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely -/// to be profitable, and then in any use which has any reference to that -/// register, delete all formulae which do not reference that register. +/// Pick a register which seems likely to be profitable, and then in any use +/// which has any reference to that register, delete all formulae which do not +/// reference that register.  void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {    // With all other options exhausted, loop until the system is simple    // enough to handle. @@ -4202,10 +4200,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {    }  } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. +/// If there are an extraordinary number of formulae to choose from, use some +/// rough heuristics to prune down the number of formulae. This keeps the main +/// solver from taking an extraordinary amount of time in some worst-case +/// scenarios.  void LSRInstance::NarrowSearchSpaceUsingHeuristics() {    NarrowSearchSpaceByDetectingSupersets();    NarrowSearchSpaceByCollapsingUnrolledCode(); @@ -4213,7 +4211,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {    NarrowSearchSpaceByPickingWinnerRegs();  } -/// SolveRecurse - This is the recursive solver. +/// This is the recursive solver.  void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,                                 Cost &SolutionCost,                                 SmallVectorImpl<const Formula *> &Workspace, @@ -4291,8 +4289,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,    }  } -/// Solve - Choose one formula from each use. Return the results in the given -/// Solution vector. +/// Choose one formula from each use. Return the results in the given Solution +/// vector.  void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {    SmallVector<const Formula *, 8> Workspace;    Cost SolutionCost; @@ -4326,10 +4324,9 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {    assert(Solution.size() == Uses.size() && "Malformed solution!");  } -/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up -/// the dominator tree far as we can go while still being dominated by the -/// input positions. This helps canonicalize the insert position, which -/// encourages sharing. +/// Helper for AdjustInsertPositionForExpand. 
Climb up the dominator tree far as +/// we can go while still being dominated by the input positions. This helps +/// canonicalize the insert position, which encourages sharing.  BasicBlock::iterator  LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,                                   const SmallVectorImpl<Instruction *> &Inputs) @@ -4365,21 +4362,21 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,        // instead of at the end, so that it can be used for other expansions.        if (IDom == Inst->getParent() &&            (!BetterPos || !DT.dominates(Inst, BetterPos))) -        BetterPos = std::next(BasicBlock::iterator(Inst)); +        BetterPos = &*std::next(BasicBlock::iterator(Inst));      }      if (!AllDominate)        break;      if (BetterPos) -      IP = BetterPos; +      IP = BetterPos->getIterator();      else -      IP = Tentative; +      IP = Tentative->getIterator();    }    return IP;  } -/// AdjustInsertPositionForExpand - Determine an input position which will be -/// dominated by the operands and which will dominate the result. +/// Determine an input position which will be dominated by the operands and +/// which will dominate the result.  BasicBlock::iterator  LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,                                             const LSRFixup &LF, @@ -4417,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,      }    } -  assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) +  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()           && !isa<DbgInfoIntrinsic>(LowestIP) &&           "Insertion point must be a normal instruction"); @@ -4429,7 +4426,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,    while (isa<PHINode>(IP)) ++IP;    // Ignore landingpad instructions. -  while (isa<LandingPadInst>(IP)) ++IP; +  while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP;    // Ignore debug intrinsics.    while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4437,13 +4434,14 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,    // Set IP below instructions recently inserted by SCEVExpander. This keeps the    // IP consistent across expansions and allows the previously inserted    // instructions to be reused by subsequent expansion. -  while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; +  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) +    ++IP;    return IP;  } -/// Expand - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"). +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding").  Value *LSRInstance::Expand(const LSRFixup &LF,                             const Formula &F,                             BasicBlock::iterator IP, @@ -4487,7 +4485,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,                                   LF.UserInst, LF.OperandValToReplace,                                   Loops, SE, DT); -    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); +    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP)));    }    // Expand the ScaledReg portion. @@ -4505,14 +4503,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,        // Expand ScaleReg as if it was part of the base regs.        
if (F.Scale == 1)          Ops.push_back( -            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); +            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)));        else {          // An interesting way of "folding" with an icmp is to use a negated          // scale, which we'll implement by inserting it into the other operand          // of the icmp.          assert(F.Scale == -1 &&                 "The only scale supported by ICmpZero uses is -1!"); -        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); +        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP);        }      } else {        // Otherwise just expand the scaled register and an explicit scale, @@ -4522,11 +4520,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF,        // Unless the addressing mode will not be folded.        if (!Ops.empty() && LU.Kind == LSRUse::Address &&            isAMCompletelyFolded(TTI, LU, F)) { -        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); +        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);          Ops.clear();          Ops.push_back(SE.getUnknown(FullV));        } -      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); +      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP));        if (F.Scale != 1)          ScaledS =              SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4538,7 +4536,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    if (F.BaseGV) {      // Flush the operand list to suppress SCEVExpander hoisting.      if (!Ops.empty()) { -      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); +      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);        Ops.clear();        Ops.push_back(SE.getUnknown(FullV));      } @@ -4548,7 +4546,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    // Flush the operand list to suppress SCEVExpander hoisting of both folded and    // unfolded offsets. LSR assumes they both live next to their uses.    if (!Ops.empty()) { -    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); +    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);      Ops.clear();      Ops.push_back(SE.getUnknown(FullV));    } @@ -4584,7 +4582,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    const SCEV *FullS = Ops.empty() ?                        SE.getConstant(IntTy, 0) :                        SE.getAddExpr(Ops); -  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); +  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP);    // We're done expanding now, so reset the rewriter.    Rewriter.clearPostInc(); @@ -4626,15 +4624,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,    return FullV;  } -/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use -/// of their operands effectively happens in their predecessor blocks, so the -/// expression may need to be expanded in multiple places. +/// Helper for Rewrite. PHI nodes are special because the use of their operands +/// effectively happens in their predecessor blocks, so the expression may need +/// to be expanded in multiple places.  
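A recurring mechanical change throughout these hunks - TermBr->getIterator(), &*IP, &*std::next(...) - reflects instruction-list iterators no longer converting implicitly to and from plain Instruction pointers, so each direction must be spelled out. A standalone std::list analogy (the ilist details themselves are not shown in this patch):

#include <cassert>
#include <iterator>
#include <list>

int main() {
  std::list<int> Insts = {10, 20, 30};

  std::list<int>::iterator It = Insts.begin();
  int *Ptr = &*It;     // iterator -> pointer: explicit dereference, as in &*IP
  assert(*Ptr == 10);

  // pointer -> iterator: std::list offers no generic way back, so code keeps an
  // iterator around; LLVM's instruction lists expose getIterator() on the node
  // itself for this direction, which is what TermBr->getIterator() relies on.
  It = std::next(It);
  assert(*It == 20);
}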
void LSRInstance::RewriteForPHI(PHINode *PN,                                  const LSRFixup &LF,                                  const Formula &F,                                  SCEVExpander &Rewriter, -                                SmallVectorImpl<WeakVH> &DeadInsts, -                                Pass *P) const { +                                SmallVectorImpl<WeakVH> &DeadInsts) const {    DenseMap<BasicBlock *, Value *> Inserted;    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)      if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4658,8 +4655,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN,                                            .setDontDeleteUselessPHIs());            } else {              SmallVector<BasicBlock*, 2> NewBBs; -            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, -                                        /*AliasAnalysis*/ nullptr, &DT, &LI); +            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);              NewBB = NewBBs[0];            }            // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4685,7 +4681,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN,        if (!Pair.second)          PN->setIncomingValue(i, Pair.first->second);        else { -        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); +        Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(), +                              Rewriter, DeadInsts);          // If this is reuse-by-noop-cast, insert the noop cast.          Type *OpTy = LF.OperandValToReplace->getType(); @@ -4702,20 +4699,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN,      }  } -/// Rewrite - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"), and update the UserInst to reference -/// the newly expanded value. +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"), and update the UserInst to reference the newly +/// expanded value.  void LSRInstance::Rewrite(const LSRFixup &LF,                            const Formula &F,                            SCEVExpander &Rewriter, -                          SmallVectorImpl<WeakVH> &DeadInsts, -                          Pass *P) const { +                          SmallVectorImpl<WeakVH> &DeadInsts) const {    // First, find an insertion point that dominates UserInst. For PHI nodes,    // find the nearest block which dominates all the relevant uses.    if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { -    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); +    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);    } else { -    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); +    Value *FullV = +        Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);      // If this is reuse-by-noop-cast, insert the noop cast.      Type *OpTy = LF.OperandValToReplace->getType(); @@ -4740,11 +4737,10 @@ void LSRInstance::Rewrite(const LSRFixup &LF,    DeadInsts.emplace_back(LF.OperandValToReplace);  } -/// ImplementSolution - Rewrite all the fixup locations with new values, -/// following the chosen solution. -void -LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, -                               Pass *P) { +/// Rewrite all the fixup locations with new values, following the chosen +/// solution. 
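The WorkItem::dump() hunk earlier and the LSRInstance::dump() hunk further down drop the #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) guard in favour of the LLVM_DUMP_METHOD annotation. The macro's real definition lives in llvm/Support/Compiler.h and is not shown here; the sketch below only illustrates the general idea of annotating a dump method so it stays emitted and debugger-callable, rather than compiling it out:

// Shape-only sketch assuming an attribute-based macro; the real one may differ.
#if defined(__GNUC__) || defined(__clang__)
#define DUMP_METHOD_SKETCH __attribute__((noinline, used))
#else
#define DUMP_METHOD_SKETCH
#endif

#include <cstdio>

struct WorkItemSketch {
  int LUIdx = 0;
  // Kept out of line and marked used so optimized builds don't strip it,
  // leaving it invokable from a debugger even if nothing calls it directly.
  DUMP_METHOD_SKETCH void dump() const { std::printf("LUIdx=%d\n", LUIdx); }
};

int main() { WorkItemSketch().dump(); }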
+void LSRInstance::ImplementSolution( +    const SmallVectorImpl<const Formula *> &Solution) {    // Keep track of instructions we may have made dead, so that    // we can remove them after we are done working.    SmallVector<WeakVH, 16> DeadInsts; @@ -4766,7 +4762,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,    // Expand the new value definitions and update the users.    for (const LSRFixup &Fixup : Fixups) { -    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); +    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);      Changed = true;    } @@ -4782,13 +4778,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,    Changed |= DeleteTriviallyDeadInstructions(DeadInsts);  } -LSRInstance::LSRInstance(Loop *L, Pass *P) -    : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), -      DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), -      LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), -      TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( -          *L->getHeader()->getParent())), -      L(L), Changed(false), IVIncInsertPos(nullptr) { +LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, +                         DominatorTree &DT, LoopInfo &LI, +                         const TargetTransformInfo &TTI) +    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), +      IVIncInsertPos(nullptr) {    // If LoopSimplify form is not available, stay out of trouble.    if (!L->isLoopSimplifyForm())      return; @@ -4879,7 +4873,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P)  #endif    // Now that we've decided what we want, make it so. -  ImplementSolution(Solution, P); +  ImplementSolution(Solution);  }  void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4931,11 +4925,10 @@ void LSRInstance::print(raw_ostream &OS) const {    print_uses(OS);  } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD  void LSRInstance::dump() const {    print(errs()); errs() << '\n';  } -#endif  namespace { @@ -4956,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",                  "Loop Strength Reduction", false, false)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(IVUsers)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -4982,8 +4975,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {    AU.addRequiredID(LoopSimplifyID);    AU.addRequired<DominatorTreeWrapperPass>();    AU.addPreserved<DominatorTreeWrapperPass>(); -  AU.addRequired<ScalarEvolution>(); -  AU.addPreserved<ScalarEvolution>(); +  AU.addRequired<ScalarEvolutionWrapperPass>(); +  AU.addPreserved<ScalarEvolutionWrapperPass>();    // Requiring LoopSimplify a second time here prevents IVUsers from running    // twice, since LoopSimplify was invalidated by running ScalarEvolution.    
AU.addRequiredID(LoopSimplifyID); @@ -4996,17 +4989,24 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {    if (skipOptnoneFunction(L))      return false; +  auto &IU = getAnalysis<IVUsers>(); +  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( +      *L->getHeader()->getParent());    bool Changed = false;    // Run the main LSR transformation. -  Changed |= LSRInstance(L, this).getChanged(); +  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();    // Remove any extra phis created by processing inner loops.    Changed |= DeleteDeadPHIs(L->getHeader());    if (EnablePhiElim && L->isLoopSimplifyForm()) {      SmallVector<WeakVH, 16> DeadInsts;      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); -    SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); +    SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL, +                          "lsr");  #ifndef NDEBUG      Rewriter.setDebugType(DEBUG_TYPE);  #endif diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index d78db6c369b3..56ae5c010411 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -14,6 +14,7 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CodeMetrics.h"  #include "llvm/Analysis/InstructionSimplify.h" @@ -130,27 +131,29 @@ namespace {      bool UserAllowPartial;      bool UserRuntime; -    bool runOnLoop(Loop *L, LPPassManager &LPM) override; +    bool runOnLoop(Loop *L, LPPassManager &) override;      /// This transformation requires natural loop information & requires that      /// loop preheaders be inserted into the CFG...      ///      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<AssumptionCacheTracker>(); +      AU.addRequired<DominatorTreeWrapperPass>();        AU.addRequired<LoopInfoWrapperPass>();        AU.addPreserved<LoopInfoWrapperPass>();        AU.addRequiredID(LoopSimplifyID);        AU.addPreservedID(LoopSimplifyID);        AU.addRequiredID(LCSSAID);        AU.addPreservedID(LCSSAID); -      AU.addRequired<ScalarEvolution>(); -      AU.addPreserved<ScalarEvolution>(); +      AU.addRequired<ScalarEvolutionWrapperPass>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>();        AU.addRequired<TargetTransformInfoWrapperPass>();        // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.        // If loop unroll does not preserve dom info then LCSSA pass on next        // loop will receive invalid dom info.        // For now, recreate dom info, if loop is unrolled.        AU.addPreserved<DominatorTreeWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }      // Fill in the UnrollingPreferences parameter with values from the @@ -186,7 +189,7 @@ namespace {      // total unrolled size.  Parameters Threshold and PartialThreshold      // are set to the maximum unrolled size for fully and partially      // unrolled loops respectively. 
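The LSRInstance constructor and LoopStrengthReduce::runOnLoop hunks above stop passing a Pass* into the transform (and calling getAnalysis<> from inside it); instead the pass gathers every analysis up front and injects it through the constructor. A minimal sketch of that pattern with placeholder types, on the assumption that the point is to decouple the core logic from legacy-pass plumbing:

struct ScalarEvolutionStub {};
struct DominatorTreeStub {};

// The core transform sees only the analyses it needs, not the pass that found them.
class LSRCoreSketch {
  ScalarEvolutionStub &SE;
  DominatorTreeStub &DT;
  bool Changed = false;

public:
  LSRCoreSketch(ScalarEvolutionStub &SE, DominatorTreeStub &DT) : SE(SE), DT(DT) {
    (void)this->SE;  // a real implementation would use the analyses here
    (void)this->DT;
  }
  bool getChanged() const { return Changed; }
};

// Any driver - a legacy pass here, potentially another pass manager later -
// does the gathering and hands the results in.
bool runSketch(ScalarEvolutionStub &SE, DominatorTreeStub &DT) {
  return LSRCoreSketch(SE, DT).getChanged();
}

int main() {
  ScalarEvolutionStub SE;
  DominatorTreeStub DT;
  return runSketch(SE, DT) ? 1 : 0;  // the stub never reports a change
}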
-    void selectThresholds(const Loop *L, bool HasPragma, +    void selectThresholds(const Loop *L, bool UsePragmaThreshold,                            const TargetTransformInfo::UnrollingPreferences &UP,                            unsigned &Threshold, unsigned &PartialThreshold,                            unsigned &PercentDynamicCostSavedThreshold, @@ -207,12 +210,13 @@ namespace {                                         : UP.DynamicCostSavingsDiscount;        if (!UserThreshold && +          // FIXME: Use Function::optForSize().            L->getHeader()->getParent()->hasFnAttribute(                Attribute::OptimizeForSize)) {          Threshold = UP.OptSizeThreshold;          PartialThreshold = UP.PartialOptSizeThreshold;        } -      if (HasPragma) { +      if (UsePragmaThreshold) {          // If the loop has an unrolling pragma, we want to be more          // aggressive with unrolling limits.  Set thresholds to at          // least the PragmaTheshold value which is larger than the @@ -235,10 +239,11 @@ char LoopUnroll::ID = 0;  INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)  Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, @@ -278,8 +283,8 @@ class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {  public:    UnrolledInstAnalyzer(unsigned Iteration,                         DenseMap<Value *, Constant *> &SimplifiedValues, -                       const Loop *L, ScalarEvolution &SE) -      : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { +                       ScalarEvolution &SE) +      : SimplifiedValues(SimplifiedValues), SE(SE) {        IterationNumber = SE.getConstant(APInt(64, Iteration));    } @@ -295,13 +300,6 @@ private:    /// results saved.    DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; -  /// \brief Number of currently simulated iteration. -  /// -  /// If an expression is ConstAddress+Constant, then the Constant is -  /// Start + Iteration*Step, where Start and Step could be obtained from -  /// SCEVGEPCache. -  unsigned Iteration; -    /// \brief SCEV expression corresponding to number of currently simulated    /// iteration.    const SCEV *IterationNumber; @@ -316,7 +314,6 @@ private:    /// post-unrolling.    DenseMap<Value *, Constant *> &SimplifiedValues; -  const Loop *L;    ScalarEvolution &SE;    /// \brief Try to simplify instruction \param I using its SCEV expression. @@ -368,11 +365,9 @@ private:      return simplifyInstWithSCEV(&I);    } -  /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. -    /// Try to simplify binary operator I.    /// -  /// TODO: Probaly it's worth to hoist the code for estimating the +  /// TODO: Probably it's worth to hoist the code for estimating the    /// simplifications effects to a separate class, since we have a very similar    /// code in InlineCost already.    
bool visitBinaryOperator(BinaryOperator &I) { @@ -412,7 +407,7 @@ private:      auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);      // We're only interested in loads that can be completely folded to a      // constant. -    if (!GV || !GV->hasInitializer()) +    if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant())        return false;      ConstantDataSequential *CDS = @@ -420,6 +415,12 @@ private:      if (!CDS)        return false; +    // We might have a vector load from an array. FIXME: for now we just bail +    // out in this case, but we should be able to resolve and simplify such +    // loads. +    if(!CDS->isElementTypeCompatible(I.getType())) +      return false; +      int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;      assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 &&             "Unexpectedly large index value."); @@ -436,6 +437,59 @@ private:      return true;    } + +  bool visitCastInst(CastInst &I) { +    // Propagate constants through casts. +    Constant *COp = dyn_cast<Constant>(I.getOperand(0)); +    if (!COp) +      COp = SimplifiedValues.lookup(I.getOperand(0)); +    if (COp) +      if (Constant *C = +              ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { +        SimplifiedValues[&I] = C; +        return true; +      } + +    return Base::visitCastInst(I); +  } + +  bool visitCmpInst(CmpInst &I) { +    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + +    // First try to handle simplified comparisons. +    if (!isa<Constant>(LHS)) +      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) +        LHS = SimpleLHS; +    if (!isa<Constant>(RHS)) +      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) +        RHS = SimpleRHS; + +    if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { +      auto SimplifiedLHS = SimplifiedAddresses.find(LHS); +      if (SimplifiedLHS != SimplifiedAddresses.end()) { +        auto SimplifiedRHS = SimplifiedAddresses.find(RHS); +        if (SimplifiedRHS != SimplifiedAddresses.end()) { +          SimplifiedAddress &LHSAddr = SimplifiedLHS->second; +          SimplifiedAddress &RHSAddr = SimplifiedRHS->second; +          if (LHSAddr.Base == RHSAddr.Base) { +            LHS = LHSAddr.Offset; +            RHS = RHSAddr.Offset; +          } +        } +      } +    } + +    if (Constant *CLHS = dyn_cast<Constant>(LHS)) { +      if (Constant *CRHS = dyn_cast<Constant>(RHS)) { +        if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { +          SimplifiedValues[&I] = C; +          return true; +        } +      } +    } + +    return Base::visitCmpInst(I); +  }  };  } // namespace @@ -443,11 +497,11 @@ private:  namespace {  struct EstimatedUnrollCost {    /// \brief The estimated cost after unrolling. -  unsigned UnrolledCost; +  int UnrolledCost;    /// \brief The estimated dynamic cost of executing the instructions in the    /// rolled form. -  unsigned RolledDynamicCost; +  int RolledDynamicCost;  };  } @@ -464,10 +518,10 @@ struct EstimatedUnrollCost {  /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If  /// the analysis failed (no benefits expected from the unrolling, or the loop is  /// too big to analyze), the returned value is None. 
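The visitLoad, visitCastInst, and visitCmpInst handlers above, combined with the per-iteration PHI seeding added below, are aimed at loops whose bodies largely collapse once the iteration number is known. A loop of that kind (purely illustrative; the names and values are made up):

// With a known trip count and loads from a constant initializer, each simulated
// iteration folds the kWeights[i] load and the i < 4 exit test to constants, so
// UnrolledCost stays well below RolledDynamicCost - the gap that
// canUnrollCompletely weighs against the thresholds.
static const int kWeights[4] = {3, 1, 4, 1};

int weightedSum(const int *x) {
  int s = 0;
  for (int i = 0; i < 4; ++i)
    s += kWeights[i] * x[i];
  return s;
}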
-Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, -                      const TargetTransformInfo &TTI, -                      unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, +                      ScalarEvolution &SE, const TargetTransformInfo &TTI, +                      int MaxUnrolledLoopSize) {    // We want to be able to scale offsets by the trip count and add more offsets    // to them without checking for overflows, and we already don't want to    // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -481,24 +535,61 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,    SmallSetVector<BasicBlock *, 16> BBWorklist;    DenseMap<Value *, Constant *> SimplifiedValues; +  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;    // The estimated cost of the unrolled form of the loop. We try to estimate    // this by simplifying as much as we can while computing the estimate. -  unsigned UnrolledCost = 0; +  int UnrolledCost = 0;    // We also track the estimated dynamic (that is, actually executed) cost in    // the rolled form. This helps identify cases when the savings from unrolling    // aren't just exposing dead control flows, but actual reduced dynamic    // instructions due to the simplifications which we expect to occur after    // unrolling. -  unsigned RolledDynamicCost = 0; +  int RolledDynamicCost = 0; + +  // Ensure that we don't violate the loop structure invariants relied on by +  // this analysis. +  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); +  assert(L->isLCSSAForm(DT) && +         "Must have loops in LCSSA form to track live-out values."); + +  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");    // Simulate execution of each iteration of the loop counting instructions,    // which would be simplified.    // Since the same load will take different values on different iterations,    // we literally have to go through all loop's iterations.    for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { +    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + +    // Prepare for the iteration by collecting any simplified entry or backedge +    // inputs. +    for (Instruction &I : *L->getHeader()) { +      auto *PHI = dyn_cast<PHINode>(&I); +      if (!PHI) +        break; + +      // The loop header PHI nodes must have exactly two input: one from the +      // loop preheader and one from the loop latch. +      assert( +          PHI->getNumIncomingValues() == 2 && +          "Must have an incoming value only for the preheader and the latch."); + +      Value *V = PHI->getIncomingValueForBlock( +          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); +      Constant *C = dyn_cast<Constant>(V); +      if (Iteration != 0 && !C) +        C = SimplifiedValues.lookup(V); +      if (C) +        SimplifiedInputValues.push_back({PHI, C}); +    } + +    // Now clear and re-populate the map for the next iteration.      
SimplifiedValues.clear(); -    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); +    while (!SimplifiedInputValues.empty()) +      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + +    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);      BBWorklist.clear();      BBWorklist.insert(L->getHeader()); @@ -510,21 +601,67 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,        // it.  We don't change the actual IR, just count optimization        // opportunities.        for (Instruction &I : *BB) { -        unsigned InstCost = TTI.getUserCost(&I); +        int InstCost = TTI.getUserCost(&I);          // Visit the instruction to analyze its loop cost after unrolling,          // and if the visitor returns false, include this instruction in the          // unrolled cost.          if (!Analyzer.visit(I))            UnrolledCost += InstCost; +        else { +          DEBUG(dbgs() << "  " << I +                       << " would be simplified if loop is unrolled.\n"); +          (void)0; +        }          // Also track this instructions expected cost when executing the rolled          // loop form.          RolledDynamicCost += InstCost;          // If unrolled body turns out to be too big, bail out. -        if (UnrolledCost > MaxUnrolledLoopSize) +        if (UnrolledCost > MaxUnrolledLoopSize) { +          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n" +                       << "  UnrolledCost: " << UnrolledCost +                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize +                       << "\n");            return None; +        } +      } + +      TerminatorInst *TI = BB->getTerminator(); + +      // Add in the live successors by first checking whether we have terminator +      // that may be simplified based on the values simplified by this call. +      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { +        if (BI->isConditional()) { +          if (Constant *SimpleCond = +                  SimplifiedValues.lookup(BI->getCondition())) { +            BasicBlock *Succ = nullptr; +            // Just take the first successor if condition is undef +            if (isa<UndefValue>(SimpleCond)) +              Succ = BI->getSuccessor(0); +            else +              Succ = BI->getSuccessor( +                  cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); +            if (L->contains(Succ)) +              BBWorklist.insert(Succ); +            continue; +          } +        } +      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { +        if (Constant *SimpleCond = +                SimplifiedValues.lookup(SI->getCondition())) { +          BasicBlock *Succ = nullptr; +          // Just take the first successor if condition is undef +          if (isa<UndefValue>(SimpleCond)) +            Succ = SI->getSuccessor(0); +          else +            Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) +                       .getCaseSuccessor(); +          if (L->contains(Succ)) +            BBWorklist.insert(Succ); +          continue; +        }        }        // Add BB's successors to the worklist. @@ -535,9 +672,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE,      // If we found no optimization opportunities on the first iteration, we      // won't find them on later ones too. -    if (UnrolledCost == RolledDynamicCost) +    if (UnrolledCost == RolledDynamicCost) { +      DEBUG(dbgs() << "  No opportunities found.. 
exiting.\n" +                   << "  UnrolledCost: " << UnrolledCost << "\n");        return None; +    }    } +  DEBUG(dbgs() << "Analysis finished:\n" +               << "UnrolledCost: " << UnrolledCost << ", " +               << "RolledDynamicCost: " << RolledDynamicCost << "\n");    return {{UnrolledCost, RolledDynamicCost}};  } @@ -583,6 +726,12 @@ static bool HasUnrollFullPragma(const Loop *L) {    return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");  } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { +  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} +  // Returns true if the loop has an unroll(disable) pragma.  static bool HasUnrollDisablePragma(const Loop *L) {    return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); @@ -708,7 +857,7 @@ unsigned LoopUnroll::selectUnrollCount(    unsigned Count = UserCount ? CurrentCount : 0;    // If there is no user-specified count, unroll pragmas have the next -  // highest precendence. +  // highest precedence.    if (Count == 0) {      if (PragmaCount) {        Count = PragmaCount; @@ -737,17 +886,19 @@ unsigned LoopUnroll::selectUnrollCount(    return Count;  } -bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &) {    if (skipOptnoneFunction(L))      return false;    Function &F = *L->getHeader()->getParent(); +  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); +  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    const TargetTransformInfo &TTI =        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); +  bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);    BasicBlock *Header = L->getHeader();    DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -757,8 +908,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {      return false;    }    bool PragmaFullUnroll = HasUnrollFullPragma(L); +  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);    unsigned PragmaCount = UnrollCountPragmaValue(L); -  bool HasPragma = PragmaFullUnroll || PragmaCount > 0; +  bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0;    TargetTransformInfo::UnrollingPreferences UP;    getUnrollingPreferences(L, TTI, UP); @@ -806,7 +958,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {    unsigned Threshold, PartialThreshold;    unsigned PercentDynamicCostSavedThreshold;    unsigned DynamicCostSavingsDiscount; -  selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, +  // Only use the high pragma threshold when we have a target unroll factor such +  // as with "#pragma unroll N" or a pragma indicating full unrolling and the +  // trip count is known. Otherwise we rely on the standard threshold to +  // heuristically select a reasonable unroll count. 
+  bool UsePragmaThreshold = +      PragmaCount > 0 || +      ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0); + +  selectThresholds(L, UsePragmaThreshold, UP, Threshold, PartialThreshold,                     PercentDynamicCostSavedThreshold,                     DynamicCostSavingsDiscount); @@ -824,8 +984,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {        // The loop isn't that small, but we still can fully unroll it if that        // helps to remove a significant number of instructions.        // To check that, run additional analysis on the loop. -      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( -              L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) +      if (Optional<EstimatedUnrollCost> Cost = +              analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI, +                                    Threshold + DynamicCostSavingsDiscount))          if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold,                                  DynamicCostSavingsDiscount, Cost->UnrolledCost,                                  Cost->RolledDynamicCost)) { @@ -840,14 +1001,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {    // Reduce count based on the type of unrolling and the threshold values.    unsigned OriginalCount = Count; -  bool AllowRuntime = -      (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); +  bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || +                      (UserRuntime ? CurrentRuntime : UP.Runtime);    // Don't unroll a runtime trip count loop with unroll full pragma.    if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {      AllowRuntime = false;    }    if (Unrolling == Partial) { -    bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; +    bool AllowPartial = PragmaEnableUnroll || +                        (UserAllowPartial ? 
CurrentAllowPartial : UP.Partial);      if (!AllowPartial && !CountSetExplicitly) {        DEBUG(dbgs() << "  will not try to unroll partially because "                     << "-unroll-allow-partial not given\n"); @@ -887,23 +1049,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {      DebugLoc LoopLoc = L->getStartLoc();      Function *F = Header->getParent();      LLVMContext &Ctx = F->getContext(); -    if (PragmaFullUnroll && PragmaCount == 0) { -      if (TripCount && Count != TripCount) { -        emitOptimizationRemarkMissed( -            Ctx, DEBUG_TYPE, *F, LoopLoc, -            "Unable to fully unroll loop as directed by unroll(full) pragma " -            "because unrolled size is too large."); -      } else if (!TripCount) { -        emitOptimizationRemarkMissed( -            Ctx, DEBUG_TYPE, *F, LoopLoc, -            "Unable to fully unroll loop as directed by unroll(full) pragma " -            "because loop has a runtime trip count."); -      } -    } else if (PragmaCount > 0 && Count != OriginalCount) { +    if ((PragmaCount > 0) && Count != OriginalCount) {        emitOptimizationRemarkMissed(            Ctx, DEBUG_TYPE, *F, LoopLoc,            "Unable to unroll loop the number of times directed by "            "unroll_count pragma because unrolled size is too large."); +    } else if (PragmaFullUnroll && !TripCount) { +      emitOptimizationRemarkMissed( +          Ctx, DEBUG_TYPE, *F, LoopLoc, +          "Unable to fully unroll loop as directed by unroll(full) pragma " +          "because loop has a runtime trip count."); +    } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { +      emitOptimizationRemarkMissed( +          Ctx, DEBUG_TYPE, *F, LoopLoc, +          "Unable to unroll loop as directed by unroll(enable) pragma because " +          "unrolled size is too large."); +    } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && +               Count != TripCount) { +      emitOptimizationRemarkMissed( +          Ctx, DEBUG_TYPE, *F, LoopLoc, +          "Unable to fully unroll loop as directed by unroll pragma because " +          "unrolled size is too large.");      }    } @@ -915,7 +1081,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {    // Unroll the loop.    
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, -                  TripMultiple, LI, this, &LPM, &AC)) +                  TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA))      return false;    return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index cbc563bd8998..95d7f8a3beda 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CodeMetrics.h"  #include "llvm/Analysis/InstructionSimplify.h" @@ -37,6 +38,10 @@  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Support/BranchProbability.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Dominators.h" @@ -70,6 +75,19 @@ static cl::opt<unsigned>  Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),            cl::init(100), cl::Hidden); +static cl::opt<bool> +LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", +    cl::init(false), cl::Hidden, +    cl::desc("Enable the use of the block frequency analysis to access PGO " +             "heuristics to minimize code growth in cold regions.")); + +static cl::opt<unsigned> +ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, +    cl::desc("Coldness threshold in percentage. The loop header frequency " +             "(relative to the entry frequency) is compared with this " +             "threshold to determine if non-trivial unswitching should be " +             "enabled.")); +  namespace {    class LUAnalysisCache { @@ -148,12 +166,19 @@ namespace {      LPPassManager *LPM;      AssumptionCache *AC; -    // LoopProcessWorklist - Used to check if second loop needs processing -    // after RewriteLoopBodyWithConditionConstant rewrites first loop. +    // Used to check if second loop needs processing after +    // RewriteLoopBodyWithConditionConstant rewrites first loop.      std::vector<Loop*> LoopProcessWorklist;      LUAnalysisCache BranchesInfo; +    bool EnabledPGO; + +    // BFI and ColdEntryFreq are only used when PGO and +    // LoopUnswitchWithBlockFrequency are enabled. +    BlockFrequencyInfo BFI; +    BlockFrequency ColdEntryFreq; +      bool OptimizeForSize;      bool redoLoop; @@ -192,9 +217,11 @@ namespace {        AU.addPreserved<LoopInfoWrapperPass>();        AU.addRequiredID(LCSSAID);        AU.addPreservedID(LCSSAID); +      AU.addRequired<DominatorTreeWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addPreserved<ScalarEvolution>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>();        AU.addRequired<TargetTransformInfoWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }    private: @@ -210,7 +237,10 @@ namespace {      /// Split all of the edges from inside the loop to their exit blocks.      /// Update the appropriate Phi nodes as we do so. 
-    void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks); +    void SplitExitEdges(Loop *L, +                        const SmallVectorImpl<BasicBlock *> &ExitBlocks); + +    bool TryTrivialLoopUnswitch(bool &Changed);      bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,                                TerminatorInst *TI = nullptr); @@ -229,9 +259,6 @@ namespace {                                          TerminatorInst *TI);      void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); -    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, -                                    BasicBlock **LoopExit = nullptr); -    };  } @@ -367,9 +394,8 @@ Pass *llvm::createLoopUnswitchPass(bool Os) {    return new LoopUnswitch(Os);  } -/// FindLIVLoopCondition - Cond is a condition that occurs in L.  If it is -/// invariant in the loop, or has an invariant piece, return the invariant. -/// Otherwise, return null. +/// Cond is a condition that occurs in L. If it is invariant in the loop, or has +/// an invariant piece, return the invariant. Otherwise, return null.  static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {    // We started analyze new instruction, increment scanned instructions counter. @@ -411,11 +437,23 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {        *L->getHeader()->getParent());    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();    LPM = &LPM_Ref; -  DominatorTreeWrapperPass *DTWP = -      getAnalysisIfAvailable<DominatorTreeWrapperPass>(); -  DT = DTWP ? &DTWP->getDomTree() : nullptr; +  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    currentLoop = L;    Function *F = currentLoop->getHeader()->getParent(); + +  EnabledPGO = F->getEntryCount().hasValue(); + +  if (LoopUnswitchWithBlockFrequency && EnabledPGO) { +    BranchProbabilityInfo BPI(*F, *LI); +    BFI.calculate(*L->getHeader()->getParent(), BPI, *LI); + +    // Use BranchProbability to compute a minimum frequency based on +    // function entry baseline frequency. Loops with headers below this +    // frequency are considered as cold. +    const BranchProbability ColdProb(ColdnessThreshold, 100); +    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb; +  } +    bool Changed = false;    do {      assert(currentLoop->isLCSSAForm(*DT)); @@ -423,16 +461,13 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {      Changed |= processCurrentLoop();    } while(redoLoop); -  if (Changed) { -    // FIXME: Reconstruct dom info, because it is not preserved properly. -    if (DT) -      DT->recalculate(*F); -  } +  // FIXME: Reconstruct dom info, because it is not preserved properly. +  if (Changed) +    DT->recalculate(*F);    return Changed;  } -/// processCurrentLoop - Do actual work and unswitch loop if possible -/// and profitable. +/// Do actual work and unswitch loop if possible and profitable.  bool LoopUnswitch::processCurrentLoop() {    bool Changed = false; @@ -452,14 +487,48 @@ bool LoopUnswitch::processCurrentLoop() {    LLVMContext &Context = loopHeader->getContext(); -  // Probably we reach the quota of branches for this loop. If so -  // stop unswitching. +  // Analyze loop cost, and stop unswitching if loop content can not be duplicated.    
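As a rough worked example of the ColdEntryFreq threshold set up above (numbers purely illustrative): with the default loop-unswitch-coldness-threshold of 1, ColdProb is 1/100, so ColdEntryFreq becomes 1% of the function's entry frequency; if BFI reports an entry frequency of 10000, any loop whose header frequency is below 100 is treated as cold and non-trivial unswitching is skipped for it in the check further below.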
if (!BranchesInfo.countLoop(            currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(                             *currentLoop->getHeader()->getParent()),            AC))      return false; +  // Try trivial unswitch first before loop over other basic blocks in the loop. +  if (TryTrivialLoopUnswitch(Changed)) { +    return true; +  } + +  // Do not unswitch loops containing convergent operations, as we might be +  // making them control dependent on the unswitch value when they were not +  // before. +  // FIXME: This could be refined to only bail if the convergent operation is +  // not already control-dependent on the unswitch value. +  for (const auto BB : currentLoop->blocks()) { +    for (auto &I : *BB) { +      auto CS = CallSite(&I); +      if (!CS) continue; +      if (CS.hasFnAttr(Attribute::Convergent)) +        return false; +    } +  } + +  // Do not do non-trivial unswitch while optimizing for size. +  // FIXME: Use Function::optForSize(). +  if (OptimizeForSize || +      loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) +    return false; + +  if (LoopUnswitchWithBlockFrequency && EnabledPGO) { +    // Compute the weighted frequency of the hottest block in the +    // loop (loopHeader in this case since inner loops should be +    // processed before outer loop). If it is less than ColdFrequency, +    // we should not unswitch. +    BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); +    if (LoopEntryFreq < ColdEntryFreq) +      return false; +  } +    // Loop over all of the basic blocks in the loop.  If we find an interior    // block that is branching on a loop-invariant condition, we can unswitch this    // loop. @@ -528,8 +597,8 @@ bool LoopUnswitch::processCurrentLoop() {    return Changed;  } -/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the -/// loop with no side effects (including infinite loops). +/// Check to see if all paths from BB exit the loop with no side effects +/// (including infinite loops).  ///  /// If true, we return true and set ExitBB to the block we  /// exit through. @@ -566,9 +635,9 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,    return true;  } -/// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the -/// process.  If so, return the block that is exited to, otherwise return null. +/// Return true if the specified block unconditionally leads to an exit from +/// the specified loop, and has no side-effects in the process. If so, return +/// the block that is exited to, otherwise return null.  static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {    std::set<BasicBlock*> Visited;    Visited.insert(L->getHeader());  // Branches to header make infinite loops. @@ -578,105 +647,11 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {    return nullptr;  } -/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is -/// trivial: that is, that the condition controls whether or not the loop does -/// anything at all.  If this is a trivial condition, unswitching produces no -/// code duplications (equivalently, it produces a simpler loop and a new empty -/// loop, which gets deleted). -/// -/// If this is a trivial condition, return true, otherwise return false.  
When -/// returning true, this sets Cond and Val to the condition that controls the -/// trivial condition: when Cond dynamically equals Val, the loop is known to -/// exit.  Finally, this sets LoopExit to the BB that the loop exits to when -/// Cond == Val. -/// -bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, -                                       BasicBlock **LoopExit) { -  BasicBlock *Header = currentLoop->getHeader(); -  TerminatorInst *HeaderTerm = Header->getTerminator(); -  LLVMContext &Context = Header->getContext(); - -  BasicBlock *LoopExitBB = nullptr; -  if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { -    // If the header block doesn't end with a conditional branch on Cond, we -    // can't handle it. -    if (!BI->isConditional() || BI->getCondition() != Cond) -      return false; - -    // Check to see if a successor of the branch is guaranteed to -    // exit through a unique exit block without having any -    // side-effects.  If so, determine the value of Cond that causes it to do -    // this. -    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, -                                             BI->getSuccessor(0)))) { -      if (Val) *Val = ConstantInt::getTrue(Context); -    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, -                                                    BI->getSuccessor(1)))) { -      if (Val) *Val = ConstantInt::getFalse(Context); -    } -  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { -    // If this isn't a switch on Cond, we can't handle it. -    if (SI->getCondition() != Cond) return false; - -    // Check to see if a successor of the switch is guaranteed to go to the -    // latch block or exit through a one exit block without having any -    // side-effects.  If so, determine the value of Cond that causes it to do -    // this. -    // Note that we can't trivially unswitch on the default case or -    // on already unswitched cases. -    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); -         i != e; ++i) { -      BasicBlock *LoopExitCandidate; -      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, -                                               i.getCaseSuccessor()))) { -        // Okay, we found a trivial case, remember the value that is trivial. -        ConstantInt *CaseVal = i.getCaseValue(); - -        // Check that it was not unswitched before, since already unswitched -        // trivial vals are looks trivial too. -        if (BranchesInfo.isUnswitched(SI, CaseVal)) -          continue; -        LoopExitBB = LoopExitCandidate; -        if (Val) *Val = CaseVal; -        break; -      } -    } -  } - -  // If we didn't find a single unique LoopExit block, or if the loop exit block -  // contains phi nodes, this isn't trivial. -  if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) -    return false;   // Can't handle this. - -  if (LoopExit) *LoopExit = LoopExitBB; - -  // We already know that nothing uses any scalar values defined inside of this -  // loop.  As such, we just have to check to see if this loop will execute any -  // side-effecting instructions (e.g. stores, calls, volatile loads) in the -  // part of the loop that the code *would* execute.  We already checked the -  // tail, check the header now. 
-  for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) -    if (I->mayHaveSideEffects()) -      return false; -  return true; -} - -/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when -/// LoopCond == Val to simplify the loop.  If we decide that this is profitable, +/// We have found that we can unswitch currentLoop when LoopCond == Val to +/// simplify the loop.  If we decide that this is profitable,  /// unswitch the loop, reprocess the pieces, then return true.  bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,                                          TerminatorInst *TI) { -  Function *F = loopHeader->getParent(); -  Constant *CondVal = nullptr; -  BasicBlock *ExitBlock = nullptr; - -  if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { -    // If the condition is trivial, always unswitch. There is no code growth -    // for this case. -    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock, TI); -    return true; -  } -    // Check to see if it would be profitable to unswitch current loop.    if (!BranchesInfo.CostAllowsUnswitching()) {      DEBUG(dbgs() << "NOT unswitching loop %" @@ -687,32 +662,27 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,      return false;    } -  // Do not do non-trivial unswitch while optimizing for size. -  if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) -    return false; -    UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);    return true;  } -/// CloneLoop - Recursively clone the specified loop and all of its children, +/// Recursively clone the specified loop and all of its children,  /// mapping the blocks with the specified map.  static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,                         LoopInfo *LI, LPPassManager *LPM) { -  Loop *New = new Loop(); -  LPM->insertLoop(New, PL); +  Loop &New = LPM->addLoop(PL);    // Add all of the blocks in L to the new loop.    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();         I != E; ++I)      if (LI->getLoopFor(*I) == L) -      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); +      New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);    // Add all of the subloops to the new loop.    for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) -    CloneLoop(*I, New, VM, LI, LPM); +    CloneLoop(*I, &New, VM, LI, LPM); -  return New; +  return &New;  }  static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, @@ -744,15 +714,15 @@ static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst,          }        }        // fallthrough. +    case LLVMContext::MD_make_implicit:      case LLVMContext::MD_dbg:        DstInst->setMetadata(MD.first, MD.second);      }    }  } -/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values -/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest.  Insert the -/// code immediately before InsertPt. +/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, +/// otherwise branch to FalseDest. Insert the code immediately before InsertPt.  
void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,                                                    BasicBlock *TrueDest,                                                    BasicBlock *FalseDest, @@ -782,11 +752,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,    SplitCriticalEdge(BI, 1, Options);  } -/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable -/// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no -/// side-effects), unswitch it.  This doesn't involve any code duplication, just -/// moving the conditional branch outside of the loop and updating loop info. +/// Given a loop that has a trivial unswitchable condition in it (a cond branch +/// from its header block to its latch block, where the path through the loop +/// that doesn't execute its body has no side-effects), unswitch it. This +/// doesn't involve any code duplication, just moving the conditional branch +/// outside of the loop and updating loop info.  void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,                                              BasicBlock *ExitBlock,                                              TerminatorInst *TI) { @@ -810,7 +780,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,    // without actually branching to it (the exit block should be dominated by the    // loop header, not the preheader).    assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); -  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); +  BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI);    // Okay, now we have a position to branch from and a position to branch to,    // insert the new conditional branch. @@ -829,8 +799,155 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,    ++NumTrivial;  } -/// SplitExitEdges - Split all of the edges from inside the loop to their exit -/// blocks.  Update the appropriate Phi nodes as we do so. +/// Check if the first non-constant condition starting from the loop header is +/// a trivial unswitch condition: that is, a condition controls whether or not +/// the loop does anything at all. If it is a trivial condition, unswitching +/// produces no code duplications (equivalently, it produces a simpler loop and +/// a new empty loop, which gets deleted). Therefore always unswitch trivial +/// condition. +bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { +  BasicBlock *CurrentBB = currentLoop->getHeader(); +  TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); +  LLVMContext &Context = CurrentBB->getContext(); + +  // If loop header has only one reachable successor (currently via an +  // unconditional branch or constant foldable conditional branch, but +  // should also consider adding constant foldable switch instruction in +  // future), we should keep looking for trivial condition candidates in +  // the successor as well. An alternative is to constant fold conditions +  // and merge successors into loop header (then we only need to check header's +  // terminator). The reason for not doing this in LoopUnswitch pass is that +  // it could potentially break LoopPassManager's invariants. Folding dead +  // branches could either eliminate the current loop or make other loops +  // unreachable. 
LCSSA form might also not be preserved after deleting +  // branches. The following code keeps traversing loop header's successors +  // until it finds the trivial condition candidate (condition that is not a +  // constant). Since unswitching generates branches with constant conditions, +  // this scenario could be very common in practice. +  SmallSet<BasicBlock*, 8> Visited; + +  while (true) { +    // If we exit the loop or reach a previously visited block, then +    // we cannot reach any trivial condition candidates (unfoldable +    // branch instructions or switch instructions) and no unswitch +    // can happen. Exit and return false. +    if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second) +      return false; + +    // Check if this loop will execute any side-effecting instructions (e.g. +    // stores, calls, volatile loads) in the part of the loop that the code +    // *would* execute. Check the header first. +    for (Instruction &I : *CurrentBB) +      if (I.mayHaveSideEffects()) +        return false; + +    // FIXME: add check for constant foldable switch instructions. +    if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { +      if (BI->isUnconditional()) { +        CurrentBB = BI->getSuccessor(0); +      } else if (BI->getCondition() == ConstantInt::getTrue(Context)) { +        CurrentBB = BI->getSuccessor(0); +      } else if (BI->getCondition() == ConstantInt::getFalse(Context)) { +        CurrentBB = BI->getSuccessor(1); +      } else { +        // Found a trivial condition candidate: non-foldable conditional branch. +        break; +      } +    } else { +      break; +    } + +    CurrentTerm = CurrentBB->getTerminator(); +  } + +  // CondVal is the value of the trivial condition for which the loop exits. +  // LoopExitBB is the BasicBlock that the loop exits to in that case. +  Constant *CondVal = nullptr; +  BasicBlock *LoopExitBB = nullptr; + +  if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { +    // If this isn't branching on an invariant condition, we can't unswitch it. +    if (!BI->isConditional()) +      return false; + +    Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), +                                           currentLoop, Changed); + +    // Unswitch only if the trivial condition itself is an LIV (not +    // partial LIV which could occur in and/or) +    if (!LoopCond || LoopCond != BI->getCondition()) +      return false; + +    // Check to see if a successor of the branch is guaranteed to +    // exit through a unique exit block without having any +    // side-effects.  If so, determine the value of Cond that causes +    // it to do this. +    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, +                                             BI->getSuccessor(0)))) { +      CondVal = ConstantInt::getTrue(Context); +    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, +                                                    BI->getSuccessor(1)))) { +      CondVal = ConstantInt::getFalse(Context); +    } + +    // If we didn't find a single unique LoopExit block, or if the loop exit +    // block contains phi nodes, this isn't trivial. +    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) +      return false;   // Can't handle this. 
+ +    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, +                             CurrentTerm); +    ++NumBranches; +    return true; +  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { +    // If this isn't switching on an invariant condition, we can't unswitch it. +    Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), +                                           currentLoop, Changed); + +    // Unswitch only if the trivial condition itself is an LIV (not +    // partial LIV which could occur in and/or) +    if (!LoopCond || LoopCond != SI->getCondition()) +      return false; + +    // Check to see if a successor of the switch is guaranteed to go to the +    // latch block or exit through a single exit block without having any +    // side-effects.  If so, determine the value of Cond that causes it to do +    // this. +    // Note that we can't trivially unswitch on the default case or +    // on already unswitched cases. +    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); +         i != e; ++i) { +      BasicBlock *LoopExitCandidate; +      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, +                                               i.getCaseSuccessor()))) { +        // Okay, we found a trivial case, remember the value that is trivial. +        ConstantInt *CaseVal = i.getCaseValue(); + +        // Check that it was not unswitched before, since already unswitched +        // trivial vals look trivial too. +        if (BranchesInfo.isUnswitched(SI, CaseVal)) +          continue; +        LoopExitBB = LoopExitCandidate; +        CondVal = CaseVal; +        break; +      } +    } + +    // If we didn't find a single unique LoopExit block, or if the loop exit +    // block contains phi nodes, this isn't trivial. +    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) +      return false;   // Can't handle this. + +    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, +                             nullptr); +    ++NumSwitches; +    return true; +  } +  return false; +} + +/// Split all of the edges from inside the loop to their exit blocks. +/// Update the appropriate Phi nodes as we do so.  void LoopUnswitch::SplitExitEdges(Loop *L,                                 const SmallVectorImpl<BasicBlock *> &ExitBlocks){ @@ -841,15 +958,14 @@ void LoopUnswitch::SplitExitEdges(Loop *L,      // Although SplitBlockPredecessors doesn't preserve loop-simplify in      // general, if we call it on all predecessors of all exits then it does. -    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", -                           /*AliasAnalysis*/ nullptr, DT, LI, +    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI,                             /*PreserveLCSSA*/ true);    }  } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val.  Split it into loop versions and test the -/// condition outside of either loop.  Return the loops created as Out1/Out2. +/// We determined that the loop is profitable to unswitch when LIC equals Val. +/// Split it into loop versions and test the condition outside of either loop. +/// Return the loops created as Out1/Out2.  
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,                                                 Loop *L, TerminatorInst *TI) {    Function *F = loopHeader->getParent(); @@ -858,8 +974,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,          << " blocks] in Function " << F->getName()          << " when '" << *Val << "' == " << *LIC << "\n"); -  if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) -    SE->forgetLoop(L); +  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) +    SEWP->getSE().forgetLoop(L);    LoopBlocks.clear();    NewBlocks.clear(); @@ -901,8 +1017,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,    // Splice the newly inserted blocks into the function right before the    // original preheader. -  F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), -                                NewBlocks[0], F->end()); +  F->getBasicBlockList().splice(NewPreheader->getIterator(), +                                F->getBasicBlockList(), +                                NewBlocks[0]->getIterator(), F->end());    // FIXME: We could register any cloned assumptions instead of clearing the    // whole function's cache. @@ -944,7 +1061,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,      if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {        PHINode *PN = PHINode::Create(LPad->getType(), 0, "", -                                    ExitSucc->getFirstInsertionPt()); +                                    &*ExitSucc->getFirstInsertionPt());        for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);             I != E; ++I) { @@ -960,7 +1077,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)      for (BasicBlock::iterator I = NewBlocks[i]->begin(),             E = NewBlocks[i]->end(); I != E; ++I) -      RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); +      RemapInstruction(&*I, VMap, +                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);    // Rewrite the original preheader to select between versions of the loop.    BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -994,8 +1112,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,      RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);  } -/// RemoveFromWorklist - Remove all instances of I from the worklist vector -/// specified. +/// Remove all instances of I from the worklist vector specified.  static void RemoveFromWorklist(Instruction *I,                                 std::vector<Instruction*> &Worklist) { @@ -1003,7 +1120,7 @@ static void RemoveFromWorklist(Instruction *I,                   Worklist.end());  } -/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// When we find that I really equals V, remove I from the  /// program, replacing all uses with V and update the worklist.  static void ReplaceUsesOfWith(Instruction *I, Value *V,                                std::vector<Instruction*> &Worklist, @@ -1025,9 +1142,9 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,    ++NumSimplify;  } -// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has -// the value specified by Val in the specified loop, or we know it does NOT have -// that value.  
Rewrite any uses of LIC or of properties correlated to it. +/// We know either that the value LIC has the value specified by Val in the +/// specified loop, or we know it does NOT have that value. +/// Rewrite any uses of LIC or of properties correlated to it.  void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,                                                          Constant *Val,                                                          bool IsEqual) { @@ -1138,18 +1255,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,      // domtree here -- instead we force it to do a full recomputation      // after the pass is complete -- but we do need to inform it of      // new blocks. -    if (DT) -      DT->addNewBlock(Abort, NewSISucc); +    DT->addNewBlock(Abort, NewSISucc);    }    SimplifyCode(Worklist, L);  } -/// SimplifyCode - Okay, now that we have simplified some instructions in the -/// loop, walk over it and constant prop, dce, and fold control flow where -/// possible.  Note that this is effectively a very simple loop-structure-aware -/// optimizer.  During processing of this loop, L could very well be deleted, so -/// it must not be used. +/// Now that we have simplified some instructions in the loop, walk over it and +/// constant prop, dce, and fold control flow where possible. Note that this is +/// effectively a very simple loop-structure-aware optimizer. During processing +/// of this loop, L could very well be deleted, so it must not be used.  ///  /// FIXME: When the loop optimizer is more mature, separate this out to a new  /// pass. @@ -1207,8 +1322,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {          Succ->replaceAllUsesWith(Pred);          // Move all of the successor contents from Succ to Pred. 
-        Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), -                                   Succ->end()); +        Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), +                                   Succ->begin(), Succ->end());          LPM->deleteSimpleAnalysisValue(BI, L);          BI->eraseFromParent();          RemoveFromWorklist(BI, Worklist); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 3314e1ed41ab..41511bcb7b04 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -22,7 +22,7 @@ using namespace llvm;  #define DEBUG_TYPE "loweratomic"  static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { -  IRBuilder<> Builder(CXI->getParent(), CXI); +  IRBuilder<> Builder(CXI);    Value *Ptr = CXI->getPointerOperand();    Value *Cmp = CXI->getCompareOperand();    Value *Val = CXI->getNewValOperand(); @@ -41,7 +41,7 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {  }  static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { -  IRBuilder<> Builder(RMWI->getParent(), RMWI); +  IRBuilder<> Builder(RMWI);    Value *Ptr = RMWI->getPointerOperand();    Value *Val = RMWI->getValOperand(); @@ -120,7 +120,7 @@ namespace {          return false;        bool Changed = false;        for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { -        Instruction *Inst = DI++; +        Instruction *Inst = &*DI++;          if (FenceInst *FI = dyn_cast<FenceInst>(Inst))            Changed |= LowerFenceInst(FI);          else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0c47cbd5bfda..2ace902a7a1b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -139,7 +139,7 @@ static bool lowerExpectIntrinsic(Function &F) {          ExpectIntrinsicsHandled++;      } -    // remove llvm.expect intrinsics. +    // Remove llvm.expect intrinsics.      for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {        CallInst *CI = dyn_cast<CallInst>(BI++);        if (!CI) diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 85012afc80ac..0333bf2284e1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -17,6 +17,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/MemoryDependenceAnalysis.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/ValueTracking.h" @@ -30,7 +31,7 @@  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/Local.h" -#include <list> +#include <algorithm>  using namespace llvm;  #define DEBUG_TYPE "memcpyopt" @@ -71,9 +72,9 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,    return Offset;  } -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset.  For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40].  In this case offset would be -8. 
+/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and +/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 +/// might be &A[40]. In this case offset would be -8.  static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,                              const DataLayout &DL) {    Ptr1 = Ptr1->stripPointerCasts(); @@ -125,7 +126,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,  } -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// Represents a range of memset'd bytes with the ByteVal value.  /// This allows us to analyze stores like:  ///   store 0 -> P+1  ///   store 0 -> P+0 @@ -164,8 +165,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {    // If any of the stores are a memset, then it is always good to extend the    // memset. -  for (unsigned i = 0, e = TheStores.size(); i != e; ++i) -    if (!isa<StoreInst>(TheStores[i])) +  for (Instruction *SI : TheStores) +    if (!isa<StoreInst>(SI))        return true;    // Assume that the code generator is capable of merging pairs of stores @@ -189,7 +190,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {    unsigned NumPointerStores = Bytes / MaxIntSize;    // Assume the remaining bytes if any are done a byte at a time. -  unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; +  unsigned NumByteStores = Bytes % MaxIntSize;    // If we will reduce the # stores (according to this heuristic), do the    // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -200,15 +201,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {  namespace {  class MemsetRanges { -  /// Ranges - A sorted list of the memset ranges.  We use std::list here -  /// because each element is relatively large and expensive to copy. -  std::list<MemsetRange> Ranges; -  typedef std::list<MemsetRange>::iterator range_iterator; +  /// A sorted list of the memset ranges. +  SmallVector<MemsetRange, 8> Ranges; +  typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;    const DataLayout &DL;  public:    MemsetRanges(const DataLayout &DL) : DL(DL) {} -  typedef std::list<MemsetRange>::const_iterator const_iterator; +  typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;    const_iterator begin() const { return Ranges.begin(); }    const_iterator end() const { return Ranges.end(); }    bool empty() const { return Ranges.empty(); } @@ -240,26 +240,20 @@ public:  } // end anon namespace -/// addRange - Add a new store to the MemsetRanges data structure.  This adds a +/// Add a new store to the MemsetRanges data structure.  This adds a  /// new range for the specified store at the specified offset, merging into  /// existing ranges as appropriate. -/// -/// Do a linear search of the ranges to see if this can be joined and/or to -/// find the insertion point in the list.  We keep the ranges sorted for -/// simplicity here.  This is a linear search of a linked list, which is ugly, -/// however the number of ranges is limited, so this won't get crazy slow.  
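A minimal, self-contained sketch (hypothetical Range type and helper name, not the pass's own code) of the sorted-vector binary search that the rewritten addRange below uses in place of the old linear list walk:

#include <algorithm>
#include <cstdint>
#include <vector>

struct Range { int64_t Start, End; };

// Ranges is kept sorted; return the first range whose End is not before the
// new Start, i.e. the candidate to merge with or the insertion point.
std::vector<Range>::iterator findMergePoint(std::vector<Range> &Ranges,
                                            int64_t Start) {
  return std::lower_bound(Ranges.begin(), Ranges.end(), Start,
                          [](const Range &R, int64_t S) { return R.End < S; });
}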
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,                              unsigned Alignment, Instruction *Inst) {    int64_t End = Start+Size; -  range_iterator I = Ranges.begin(), E = Ranges.end(); -  while (I != E && Start > I->End) -    ++I; +  range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, +    [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });    // We now know that I == E, in which case we didn't find anything to merge    // with, or that Start <= I->End.  If End < I->Start or I == E, then we need    // to insert a new range.  Handle this now. -  if (I == E || End < I->Start) { +  if (I == Ranges.end() || End < I->Start) {      MemsetRange &R = *Ranges.insert(I, MemsetRange());      R.Start        = Start;      R.End          = End; @@ -295,7 +289,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,    if (End > I->End) {      I->End = End;      range_iterator NextI = I; -    while (++NextI != E && End >= NextI->Start) { +    while (++NextI != Ranges.end() && End >= NextI->Start) {        // Merge the range in.        I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());        if (NextI->End > I->End) @@ -331,9 +325,9 @@ namespace {        AU.addRequired<AssumptionCacheTracker>();        AU.addRequired<DominatorTreeWrapperPass>();        AU.addRequired<MemoryDependenceAnalysis>(); -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addRequired<TargetLibraryInfoWrapperPass>(); -      AU.addPreserved<AliasAnalysis>(); +      AU.addPreserved<GlobalsAAWrapperPass>();        AU.addPreserved<MemoryDependenceAnalysis>();      } @@ -357,7 +351,7 @@ namespace {    char MemCpyOpt::ID = 0;  } -// createMemCpyOptPass - The public interface to this file... +/// The public interface to this file...  FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }  INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", @@ -366,14 +360,15 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",                      false, false) -/// tryMergingIntoMemset - When scanning forward over instructions, we look for -/// some other patterns to fold away.  In particular, this looks for stores to -/// neighboring locations of memory.  If it sees enough consecutive ones, it -/// attempts to merge them together into a memcpy/memset. +/// When scanning forward over instructions, we look for some other patterns to +/// fold away. In particular, this looks for stores to neighboring locations of +/// memory. If it sees enough consecutive ones, it attempts to merge them +/// together into a memcpy/memset.  Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,                                               Value *StartPtr, Value *ByteVal) {    const DataLayout &DL = StartInst->getModule()->getDataLayout(); @@ -384,7 +379,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,    // are stored.    
MemsetRanges Ranges(DL); -  BasicBlock::iterator BI = StartInst; +  BasicBlock::iterator BI(StartInst);    for (++BI; !isa<TerminatorInst>(BI); ++BI) {      if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {        // If the instruction is readnone, ignore it, otherwise bail out.  We @@ -439,14 +434,12 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,    // If we create any memsets, we put it right before the first instruction that    // isn't part of the memset block.  This ensure that the memset is dominated    // by any addressing instruction needed by the start of the block. -  IRBuilder<> Builder(BI); +  IRBuilder<> Builder(&*BI);    // Now that we have full information about ranges, loop over the ranges and    // emit memset's for anything big enough to be worthwhile.    Instruction *AMemSet = nullptr; -  for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); -       I != E; ++I) { -    const MemsetRange &Range = *I; +  for (const MemsetRange &Range : Ranges) {      if (Range.TheStores.size() == 1) continue; @@ -470,19 +463,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,        Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);      DEBUG(dbgs() << "Replace stores:\n"; -          for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) -            dbgs() << *Range.TheStores[i] << '\n'; +          for (Instruction *SI : Range.TheStores) +            dbgs() << *SI << '\n';            dbgs() << "With: " << *AMemSet << '\n');      if (!Range.TheStores.empty())        AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());      // Zap all the stores. -    for (SmallVectorImpl<Instruction *>::const_iterator -         SI = Range.TheStores.begin(), -         SE = Range.TheStores.end(); SI != SE; ++SI) { -      MD->removeInstruction(*SI); -      (*SI)->eraseFromParent(); +    for (Instruction *SI : Range.TheStores) { +      MD->removeInstruction(SI); +      SI->eraseFromParent();      }      ++NumMemSetInfer;    } @@ -493,6 +484,16 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,  bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {    if (!SI->isSimple()) return false; + +  // Avoid merging nontemporal stores since the resulting +  // memcpy/memset would not be able to preserve the nontemporal hint. +  // In theory we could teach how to propagate the !nontemporal metadata to +  // memset calls. However, that change would force the backend to +  // conservatively expand !nontemporal memset calls back to sequences of +  // store instructions (effectively undoing the merging). +  if (SI->getMetadata(LLVMContext::MD_nontemporal)) +    return false; +    const DataLayout &DL = SI->getModule()->getDataLayout();    // Detect cases where we're performing call slot forwarding, but @@ -509,11 +510,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {        if (C) {          // Check that nothing touches the dest of the "copy" between          // the call and the store. 
-        AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); +        AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();          MemoryLocation StoreLoc = MemoryLocation::get(SI); -        for (BasicBlock::iterator I = --BasicBlock::iterator(SI), -                                  E = C; I != E; --I) { -          if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { +        for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); +             I != E; --I) { +          if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {              C = nullptr;              break;            } @@ -554,7 +555,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {    if (Value *ByteVal = isBytewiseValue(SI->getOperand(0)))      if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),                                                ByteVal)) { -      BBI = I;  // Don't invalidate iterator. +      BBI = I->getIterator(); // Don't invalidate iterator.        return true;      } @@ -567,14 +568,14 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {    if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())      if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),                                                MSI->getValue())) { -      BBI = I;  // Don't invalidate iterator. +      BBI = I->getIterator(); // Don't invalidate iterator.        return true;      }    return false;  } -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// Takes a memcpy and a call that it depends on,  /// and checks for the possibility of a call slot optimization by having  /// the call write its result directly into the destination of the memcpy.  bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, @@ -710,12 +711,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,    // unexpected manner, for example via a global, which we deduce from    // the use analysis, we also need to know that it does not sneakily    // access dest.  We rely on AA to figure this out for us. -  AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); -  AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); +  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); +  ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);    // If necessary, perform additional analysis. -  if (MR != AliasAnalysis::NoModRef) +  if (MR != MRI_NoModRef)      MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); -  if (MR != AliasAnalysis::NoModRef) +  if (MR != MRI_NoModRef)      return false;    // All the checks have passed, so do the transformation. @@ -749,11 +750,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,    // Update AA metadata    // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be    // handled here, but combineMetadata doesn't support them yet -  unsigned KnownIDs[] = { -    LLVMContext::MD_tbaa, -    LLVMContext::MD_alias_scope, -    LLVMContext::MD_noalias, -  }; +  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, +                         LLVMContext::MD_noalias, +                         LLVMContext::MD_invariant_group};    combineMetadata(C, cpy, KnownIDs);    // Remove the memcpy. @@ -763,10 +762,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,    return true;  } -/// processMemCpyMemCpyDependence - We've found that the (upward scanning) -/// memory dependence of memcpy 'M' is the memcpy 'MDep'.  
Try to simplify M to -/// copy from MDep's input if we can. -/// +/// We've found that the (upward scanning) memory dependence of memcpy 'M' is +/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.  bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {    // We can only transforms memcpy's where the dest of one is the source of the    // other. @@ -788,7 +785,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {    if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())      return false; -  AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); +  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();    // Verify that the copied-from memory doesn't change in between the two    // transfers.  For example, in: @@ -802,8 +799,9 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {    //    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy. -  MemDepResult SourceDep = MD->getPointerDependencyFrom( -      MemoryLocation::getForSource(MDep), false, M, M->getParent()); +  MemDepResult SourceDep = +      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, +                                   M->getIterator(), M->getParent());    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; @@ -860,8 +858,9 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,      return false;    // Check that there are no other dependencies on the memset destination. -  MemDepResult DstDepInfo = MD->getPointerDependencyFrom( -      MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); +  MemDepResult DstDepInfo = +      MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false, +                                   MemCpy->getIterator(), MemCpy->getParent());    if (DstDepInfo.getInst() != MemSet)      return false; @@ -936,7 +935,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,    return true;  } -/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A +/// Perform simplification of memcpy's.  If we have memcpy A  /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite  /// B to be a memcpy from X to Z (or potentially a memmove, depending on  /// circumstances). This allows later passes to remove the first memcpy @@ -998,8 +997,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {    }    MemoryLocation SrcLoc = MemoryLocation::getForSource(M); -  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, -                                                         M, M->getParent()); +  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( +      SrcLoc, true, M->getIterator(), M->getParent());    if (SrcDepInfo.isClobber()) {      if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) @@ -1037,10 +1036,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {    return false;  } -/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst -/// are guaranteed not to alias. +/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed +/// not to alias.  
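Purely as an illustration of the case this targets, a hypothetical source-level example (names made up) where the source and destination are distinct locals, so alias analysis can prove the operands never overlap and the memmove can safely become a memcpy:

#include <cstring>

void copyRow() {
  char src[64];
  char dst[64];
  std::memset(src, 0, sizeof(src));
  // Candidate for the rewrite: 'dst' and 'src' provably do not alias.
  std::memmove(dst, src, sizeof(src));
  // After the transformation this behaves as: std::memcpy(dst, src, sizeof(src));
}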
bool MemCpyOpt::processMemMove(MemMoveInst *M) { -  AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); +  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();    if (!TLI->has(LibFunc::memmove))      return false; @@ -1053,12 +1052,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {    DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");    // If not, then we know we can transform this. -  Module *Mod = M->getParent()->getParent()->getParent();    Type *ArgTys[3] = { M->getRawDest()->getType(),                        M->getRawSource()->getType(),                        M->getLength()->getType() }; -  M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, -                                                 ArgTys)); +  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), +                                                 Intrinsic::memcpy, ArgTys));    // MemDep may have over conservative information about this instruction, just    // conservatively flush it from the cache. @@ -1068,7 +1066,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {    return true;  } -/// processByValArgument - This is called on every byval argument in call sites. +/// This is called on every byval argument in call sites.  bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {    const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();    // Find out what feeds this byval argument. @@ -1076,8 +1074,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {    Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();    uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);    MemDepResult DepInfo = MD->getPointerDependencyFrom( -      MemoryLocation(ByValArg, ByValSize), true, CS.getInstruction(), -      CS.getInstruction()->getParent()); +      MemoryLocation(ByValArg, ByValSize), true, +      CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());    if (!DepInfo.isClobber())      return false; @@ -1119,9 +1117,9 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {    //    // NOTE: This is conservative, it will stop on any read from the source loc,    // not just the defining memcpy. -  MemDepResult SourceDep = -      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, -                                   CS.getInstruction(), MDep->getParent()); +  MemDepResult SourceDep = MD->getPointerDependencyFrom( +      MemoryLocation::getForSource(MDep), false, +      CS.getInstruction()->getIterator(), MDep->getParent());    if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)      return false; @@ -1140,7 +1138,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {    return true;  } -/// iterateOnFunction - Executes one iteration of MemCpyOpt. +/// Executes one iteration of MemCpyOpt.  bool MemCpyOpt::iterateOnFunction(Function &F) {    bool MadeChange = false; @@ -1148,7 +1146,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {    for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {        // Avoid invalidating the iterator. -      Instruction *I = BI++; +      Instruction *I = &*BI++;        bool RepeatInstruction = false; @@ -1177,9 +1175,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {    return MadeChange;  } -// MemCpyOpt::runOnFunction - This is the main transformation entry point for a -// function. 
-// +/// This is the main transformation entry point for a function.  bool MemCpyOpt::runOnFunction(Function &F) {    if (skipOptnoneFunction(F))      return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 643f3740eedd..c812d618c16a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -78,6 +78,7 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/Loads.h"  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -91,6 +92,7 @@  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/SSAUpdater.h"  #include <vector> +  using namespace llvm;  #define DEBUG_TYPE "mldst-motion" @@ -106,7 +108,7 @@ class MergedLoadStoreMotion : public FunctionPass {  public:    static char ID; // Pass identification, replacement for typeid -  explicit MergedLoadStoreMotion(void) +  MergedLoadStoreMotion()        : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) {      initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());    } @@ -116,10 +118,11 @@ public:  private:    // This transformation requires dominator postdominator info    void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.setPreservesCFG();      AU.addRequired<TargetLibraryInfoWrapperPass>(); -    AU.addRequired<AliasAnalysis>(); +    AU.addRequired<AAResultsWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();      AU.addPreserved<MemoryDependenceAnalysis>(); -    AU.addPreserved<AliasAnalysis>();    }    // Helper routines @@ -156,7 +159,7 @@ private:  };  char MergedLoadStoreMotion::ID = 0; -} +} // anonymous namespace  ///  /// \brief createMergedLoadStoreMotionPass - The public interface to this file. @@ -169,7 +172,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",                        "MergedLoadStoreMotion", false, false)  INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",                      "MergedLoadStoreMotion", false, false) @@ -236,12 +240,11 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {  /// being loaded or protect against the load from happening  /// it is considered a hoist barrier.  
/// -  bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start,                                                         const Instruction& End,                                                        LoadInst* LI) {    MemoryLocation Loc = MemoryLocation::get(LI); -  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); +  return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);  }  /// @@ -256,7 +259,7 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,    for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;         ++BBI) { -    Instruction *Inst = BBI; +    Instruction *Inst = &*BBI;      // Only merge and hoist loads when their result in used only in BB      if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) @@ -293,7 +296,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,    // Intersect optional metadata.    HoistCand->intersectOptionalDataWith(ElseInst); -  HoistCand->dropUnknownMetadata(); +  HoistCand->dropUnknownNonDebugMetadata();    // Prepend point for instruction insert    Instruction *HoistPt = BB->getTerminator(); @@ -363,8 +366,7 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {    int NLoads = 0;    for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();         BBI != BBE;) { - -    Instruction *I = BBI; +    Instruction *I = &*BBI;      ++BBI;      // Only move non-simple (atomic, volatile) loads. @@ -394,11 +396,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {  /// value being stored or protect against the store from  /// happening it is considered a sink barrier.  /// -  bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,                                                        const Instruction &End,                                                        MemoryLocation Loc) { -  return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); +  return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);  }  /// @@ -438,23 +439,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,  PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,                                                StoreInst *S1) {    // Create a phi if the values mismatch. -  PHINode *NewPN = 0; +  PHINode *NewPN = nullptr;    Value *Opd1 = S0->getValueOperand();    Value *Opd2 = S1->getValueOperand();    if (Opd1 != Opd2) {      NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", -                            BB->begin()); +                            &BB->front());      NewPN->addIncoming(Opd1, S0->getParent());      NewPN->addIncoming(Opd2, S1->getParent()); -    if (NewPN->getType()->getScalarType()->isPointerTy()) { -      // AA needs to be informed when a PHI-use of the pointer value is added -      for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { -        unsigned J = PHINode::getOperandNumForIncomingValue(I); -        AA->addEscapingUse(NewPN->getOperandUse(J)); -      } -      if (MD) -        MD->invalidateCachedPointerInfo(NewPN); -    } +    if (MD && NewPN->getType()->getScalarType()->isPointerTy()) +      MD->invalidateCachedPointerInfo(NewPN);    }    return NewPN;  } @@ -479,12 +473,12 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,      BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();      // Intersect optional metadata.      
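As a rough source-level picture of what the hoisting and sinking above try to accomplish (an illustrative sketch only; the pass works on LLVM IR and these names are hypothetical). Any hoist or sink barrier in either arm, such as an aliasing store or a call, would block the motion per isLoadHoistBarrierInRange / isStoreSinkBarrierInRange:

// Diamond-shaped control flow: both arms load the same location and both
// arms store to the same location.
int diamond_before(int *p, int *q, bool c) {
  int r;
  if (c) {
    r = *p;      // load in the 'then' arm
    *q = r + 1;  // store in the 'then' arm
  } else {
    r = *p;      // identical load in the 'else' arm
    *q = r - 1;  // store to the same address in the 'else' arm
  }
  return r;
}

// Conceptually after merged load/store motion: the common load is hoisted
// above the branch and the store is sunk below it, with a phi (here a
// ternary) selecting the stored value.
int diamond_after(int *p, int *q, bool c) {
  int r = *p;                 // hoisted load
  int v = c ? r + 1 : r - 1;  // phi of the two stored values
  *q = v;                     // sunk store
  return r;
}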
S0->intersectOptionalDataWith(S1); -    S0->dropUnknownMetadata(); +    S0->dropUnknownNonDebugMetadata();      // Create the new store to be inserted at the join point.      StoreInst *SNew = (StoreInst *)(S0->clone());      Instruction *ANew = A0->clone(); -    SNew->insertBefore(InsertPt); +    SNew->insertBefore(&*InsertPt);      ANew->insertBefore(SNew);      assert(S0->getParent() == A0->getParent()); @@ -566,12 +560,13 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {    }    return MergedStores;  } +  ///  /// \brief Run the transformation for each function  ///  bool MergedLoadStoreMotion::runOnFunction(Function &F) {    MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    bool Changed = false;    DEBUG(dbgs() << "Instruction Merger\n"); @@ -579,7 +574,7 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) {    // Merge unconditional branches, allowing PRE to catch more    // optimization opportunities.    for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { -    BasicBlock *BB = FI++; +    BasicBlock *BB = &*FI++;      // Hoist equivalent loads and sink stores      // outside diamonds when possible diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index f42f8306fccc..c8f885e7eec5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -71,8 +71,8 @@  //  // Limitations and TODO items:  // -// 1) We only considers n-ary adds for now. This should be extended and -// generalized. +// 1) We only considers n-ary adds and muls for now. This should be extended +// and generalized.  //  //===----------------------------------------------------------------------===// @@ -110,11 +110,11 @@ public:    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addPreserved<DominatorTreeWrapperPass>(); -    AU.addPreserved<ScalarEvolution>(); +    AU.addPreserved<ScalarEvolutionWrapperPass>();      AU.addPreserved<TargetLibraryInfoWrapperPass>();      AU.addRequired<AssumptionCacheTracker>();      AU.addRequired<DominatorTreeWrapperPass>(); -    AU.addRequired<ScalarEvolution>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<TargetLibraryInfoWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>();      AU.setPreservesCFG(); @@ -145,12 +145,23 @@ private:                                                unsigned I, Value *LHS,                                                Value *RHS, Type *IndexedType); -  // Reassociate Add for better CSE. -  Instruction *tryReassociateAdd(BinaryOperator *I); -  // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. -  Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); -  // Rewrites I to LHS + RHS if LHS is computed already. -  Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); +  // Reassociate binary operators for better CSE. +  Instruction *tryReassociateBinaryOp(BinaryOperator *I); + +  // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly +  // passed. +  Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS, +                                      BinaryOperator *I); +  // Rewrites I to (LHS op RHS) if LHS is computed already. 
+  Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS, +                                       BinaryOperator *I); + +  // Tries to match Op1 and Op2 by using V. +  bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2); + +  // Gets SCEV for (LHS op RHS). +  const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS, +                            const SCEV *RHS);    // Returns the closest dominator of \c Dominatee that computes    // \c CandidateExpr. Returns null if not found. @@ -161,11 +172,6 @@ private:    // GEP's pointer size, i.e., whether Index needs to be sign-extended in order    // to be an index of GEP.    bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); -  // Returns whether V is known to be non-negative at context \c Ctxt. -  bool isKnownNonNegative(Value *V, Instruction *Ctxt); -  // Returns whether AO may sign overflow at context \c Ctxt. It computes a -  // conservative result -- it answers true when not sure. -  bool maySignOverflow(AddOperator *AO, Instruction *Ctxt);    AssumptionCache *AC;    const DataLayout *DL; @@ -182,7 +188,7 @@ private:    //     foo(a + b);    //   if (p2)    //     bar(a + b); -  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; +  DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs;  };  } // anonymous namespace @@ -191,7 +197,7 @@ INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation",                        false, false)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", @@ -207,7 +213,7 @@ bool NaryReassociate::runOnFunction(Function &F) {    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  SE = &getAnalysis<ScalarEvolution>(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -224,6 +230,7 @@ static bool isPotentiallyNaryReassociable(Instruction *I) {    switch (I->getOpcode()) {    case Instruction::Add:    case Instruction::GetElementPtr: +  case Instruction::Mul:      return true;    default:      return false; @@ -239,19 +246,21 @@ bool NaryReassociate::doOneIteration(Function &F) {         Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {      BasicBlock *BB = Node->getBlock();      for (auto I = BB->begin(); I != BB->end(); ++I) { -      if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { -        const SCEV *OldSCEV = SE->getSCEV(I); -        if (Instruction *NewI = tryReassociate(I)) { +      if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) { +        const SCEV *OldSCEV = SE->getSCEV(&*I); +        if (Instruction *NewI = tryReassociate(&*I)) {            Changed = true; -          SE->forgetValue(I); +          SE->forgetValue(&*I);            I->replaceAllUsesWith(NewI); -          RecursivelyDeleteTriviallyDeadInstructions(I, TLI); -          I = NewI; +          // If SeenExprs constains I's WeakVH, that entry will be replaced with +          // nullptr. 
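The switch from raw Instruction* to WeakVH in SeenExprs matters because an entry can be deleted by RecursivelyDeleteTriviallyDeadInstructions while it still sits in the map, and later lookups must notice that. As a loose standard-C++ analogy only (WeakVH is an LLVM value handle, not a std::weak_ptr; this sketch just shows the "entry goes null when the object dies" behavior the comments above rely on):

#include <memory>
#include <vector>
#include <iostream>

int main() {
  auto inst = std::make_shared<int>(42);  // stands in for an Instruction
  std::vector<std::weak_ptr<int>> seen;   // stands in for one SeenExprs bucket
  seen.push_back(inst);

  inst.reset();                           // "instruction" deleted elsewhere

  // A consumer (like findClosestMatchingDominator below) must tolerate
  // entries that have since been invalidated.
  for (auto &h : seen) {
    if (auto live = h.lock())
      std::cout << "candidate still alive: " << *live << "\n";
    else
      std::cout << "candidate was deleted; skip it\n";
  }
}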
+          RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); +          I = NewI->getIterator();          }          // Add the rewritten instruction to SeenExprs; the original instruction          // is deleted. -        const SCEV *NewSCEV = SE->getSCEV(I); -        SeenExprs[NewSCEV].push_back(I); +        const SCEV *NewSCEV = SE->getSCEV(&*I); +        SeenExprs[NewSCEV].push_back(WeakVH(&*I));          // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)          // is equivalent to I. However, ScalarEvolution::getSCEV may          // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -271,7 +280,7 @@ bool NaryReassociate::doOneIteration(Function &F) {          //          // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.          if (NewSCEV != OldSCEV) -          SeenExprs[OldSCEV].push_back(I); +          SeenExprs[OldSCEV].push_back(WeakVH(&*I));        }      }    } @@ -281,7 +290,8 @@ bool NaryReassociate::doOneIteration(Function &F) {  Instruction *NaryReassociate::tryReassociate(Instruction *I) {    switch (I->getOpcode()) {    case Instruction::Add: -    return tryReassociateAdd(cast<BinaryOperator>(I)); +  case Instruction::Mul: +    return tryReassociateBinaryOp(cast<BinaryOperator>(I));    case Instruction::GetElementPtr:      return tryReassociateGEP(cast<GetElementPtrInst>(I));    default: @@ -352,27 +362,6 @@ bool NaryReassociate::requiresSignExtension(Value *Index,    return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;  } -bool NaryReassociate::isKnownNonNegative(Value *V, Instruction *Ctxt) { -  bool NonNegative, Negative; -  // TODO: ComputeSignBits is expensive. Consider caching the results. -  ComputeSignBit(V, NonNegative, Negative, *DL, 0, AC, Ctxt, DT); -  return NonNegative; -} - -bool NaryReassociate::maySignOverflow(AddOperator *AO, Instruction *Ctxt) { -  if (AO->hasNoSignedWrap()) -    return false; - -  Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); -  // If LHS or RHS has the same sign as the sum, AO doesn't sign overflow. -  // TODO: handle the negative case as well. -  if (isKnownNonNegative(AO, Ctxt) && -      (isKnownNonNegative(LHS, Ctxt) || isKnownNonNegative(RHS, Ctxt))) -    return false; - -  return true; -} -  GetElementPtrInst *  NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,                                            Type *IndexedType) { @@ -381,7 +370,7 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,      IndexToSplit = SExt->getOperand(0);    } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {      // zext can be treated as sext if the source is non-negative. -    if (isKnownNonNegative(ZExt->getOperand(0), GEP)) +    if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))        IndexToSplit = ZExt->getOperand(0);    } @@ -389,8 +378,11 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,      // If the I-th index needs sext and the underlying add is not equipped with      // nsw, we cannot split the add because      //   sext(LHS + RHS) != sext(LHS) + sext(RHS). -    if (requiresSignExtension(IndexToSplit, GEP) && maySignOverflow(AO, GEP)) +    if (requiresSignExtension(IndexToSplit, GEP) && +        computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) != +            OverflowResult::NeverOverflows)        return nullptr; +      Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);      // IndexToSplit = LHS + RHS.      
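A tiny numeric check of why the overflow guard above is needed before splitting an index that will be sign-extended: sext(LHS + RHS) generally differs from sext(LHS) + sext(RHS) when the narrow add wraps. Illustrative only, using int8_t in place of a narrow GEP index:

#include <cstdint>
#include <cassert>

int main() {
  std::int8_t lhs = 100, rhs = 50;                  // lhs + rhs overflows int8_t
  std::int8_t narrow_sum =
      static_cast<std::int8_t>(lhs + rhs);          // wraps to -106
  std::int32_t sext_of_sum = narrow_sum;            // sext(lhs + rhs) == -106
  std::int32_t sum_of_sext =
      std::int32_t(lhs) + std::int32_t(rhs);        // 150
  assert(sext_of_sum != sum_of_sext);               // splitting would change the offset
  return 0;
}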
if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) @@ -415,7 +407,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(      IndexExprs.push_back(SE->getSCEV(*Index));    // Replace the I-th index with LHS.    IndexExprs[I] = SE->getSCEV(LHS); -  if (isKnownNonNegative(LHS, GEP) && +  if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&        DL->getTypeSizeInBits(LHS->getType()) <            DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {      // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to @@ -429,19 +421,20 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(        GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),        IndexExprs, GEP->isInBounds()); -  auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); +  Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);    if (Candidate == nullptr)      return nullptr; -  PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType()); -  // Pretty rare but theoretically possible when a numeric value happens to -  // share CandidateExpr. -  if (TypeOfCandidate == nullptr) -    return nullptr; +  IRBuilder<> Builder(GEP); +  // Candidate does not necessarily have the same pointer type as GEP. Use +  // bitcast or pointer cast to make sure they have the same type, so that the +  // later RAUW doesn't complain. +  Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType()); +  assert(Candidate->getType() == GEP->getType());    // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)    uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); -  Type *ElementType = TypeOfCandidate->getElementType(); +  Type *ElementType = GEP->getType()->getElementType();    uint64_t ElementSize = DL->getTypeAllocSize(ElementType);    // Another less rare case: because I is not necessarily the last index of the    // GEP, the size of the type at the I-th index (IndexedSize) is not @@ -461,8 +454,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(      return nullptr;    // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); -  IRBuilder<> Builder(GEP); -  Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); +  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());    if (RHS->getType() != IntPtrTy)      RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);    if (IndexedSize != ElementSize) { @@ -476,54 +468,89 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(    return NewGEP;  } -Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) {    Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); -  if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) +  if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))      return NewI; -  if (auto *NewI = tryReassociateAdd(RHS, LHS, I)) +  if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))      return NewI;    return nullptr;  } -Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS, -                                                Instruction *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS, +                                                     BinaryOperator *I) {    Value *A = nullptr, *B = nullptr; -  // To be conservative, we reassociate I only when it is the only user of A+B. 
-  if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) { -    // I = (A + B) + RHS -    //   = (A + RHS) + B or (B + RHS) + A +  // To be conservative, we reassociate I only when it is the only user of (A op +  // B). +  if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) { +    // I = (A op B) op RHS +    //   = (A op RHS) op B or (B op RHS) op A      const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);      const SCEV *RHSExpr = SE->getSCEV(RHS);      if (BExpr != RHSExpr) { -      if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I)) +      if (auto *NewI = +              tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))          return NewI;      }      if (AExpr != RHSExpr) { -      if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I)) +      if (auto *NewI = +              tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))          return NewI;      }    }    return nullptr;  } -Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr, -                                                 Value *RHS, Instruction *I) { -  auto Pos = SeenExprs.find(LHSExpr); -  // Bail out if LHSExpr is not previously seen. -  if (Pos == SeenExprs.end()) -    return nullptr; - +Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr, +                                                      Value *RHS, +                                                      BinaryOperator *I) {    // Look for the closest dominator LHS of I that computes LHSExpr, and replace -  // I with LHS + RHS. +  // I with LHS op RHS.    auto *LHS = findClosestMatchingDominator(LHSExpr, I);    if (LHS == nullptr)      return nullptr; -  Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); +  Instruction *NewI = nullptr; +  switch (I->getOpcode()) { +  case Instruction::Add: +    NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); +    break; +  case Instruction::Mul: +    NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); +    break; +  default: +    llvm_unreachable("Unexpected instruction."); +  }    NewI->takeName(I);    return NewI;  } +bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, +                                     Value *&Op2) { +  switch (I->getOpcode()) { +  case Instruction::Add: +    return match(V, m_Add(m_Value(Op1), m_Value(Op2))); +  case Instruction::Mul: +    return match(V, m_Mul(m_Value(Op1), m_Value(Op2))); +  default: +    llvm_unreachable("Unexpected instruction."); +  } +  return false; +} + +const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS, +                                           const SCEV *RHS) { +  switch (I->getOpcode()) { +  case Instruction::Add: +    return SE->getAddExpr(LHS, RHS); +  case Instruction::Mul: +    return SE->getMulExpr(LHS, RHS); +  default: +    llvm_unreachable("Unexpected instruction."); +  } +  return nullptr; +} +  Instruction *  NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,                                                Instruction *Dominatee) { @@ -537,9 +564,13 @@ NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,    // future instruction either. Therefore, we pop it out of the stack. This    // optimization makes the algorithm O(n).    
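The effect of the rewrite above in source-level terms (hypothetical names; the add case existed before, and this patch extends the same treatment to mul): if a dominating instruction already computes (A op RHS), then (A op B) op RHS can be re-expressed so that value is reused.

// If (a + c) is already available, rewriting (a + b) + c as (a + c) + b lets
// common-subexpression elimination reuse it; the same idea now applies to mul.
int before_reassoc(int a, int b, int c) {
  int t1 = a + c;        // computed earlier, dominates the next line
  int t2 = (a + b) + c;  // textually shares no subexpression with t1
  return t1 + t2;
}

int after_reassoc(int a, int b, int c) {
  int t1 = a + c;
  int t2 = t1 + b;       // (a + c) + b == (a + b) + c, and now reuses t1
  return t1 + t2;
}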
while (!Candidates.empty()) { -    Instruction *Candidate = Candidates.back(); -    if (DT->dominates(Candidate, Dominatee)) -      return Candidate; +    // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed +    // during rewriting. +    if (Value *Candidate = Candidates.back()) { +      Instruction *CandidateInstruction = cast<Instruction>(Candidate); +      if (DT->dominates(CandidateInstruction, Dominatee)) +        return CandidateInstruction; +    }      Candidates.pop_back();    }    return nullptr; diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 31d7df39c781..9f26f78892c6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -154,7 +154,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,    Phi->addIncoming(Call, &CurrBB);    Phi->addIncoming(LibCall, LibCallBB); -  BB = JoinBB; +  BB = JoinBB->getIterator();    return true;  } diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 366301ad731a..28c610c2486a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -27,7 +27,7 @@  // well defined state for inspection by the collector.  In the current  // implementation, this is done via the insertion of poll sites at method entry  // and the backedge of most loops.  We try to avoid inserting more polls than -// are neccessary to ensure a finite period between poll sites.  This is not +// are necessary to ensure a finite period between poll sites.  This is not  // because the poll itself is expensive in the generated code; it's not.  Polls  // do tend to impact the optimizer itself in negative ways; we'd like to avoid  // perturbing the optimization of the method as much as we can. @@ -91,13 +91,15 @@ STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");  using namespace llvm; -// Ignore oppurtunities to avoid placing safepoints on backedges, useful for +// Ignore opportunities to avoid placing safepoints on backedges, useful for  // validation  static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,                                    cl::init(false)); -/// If true, do not place backedge safepoints in counted loops. -static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); +/// How narrow does the trip count of a loop have to be to have to be considered +/// "counted"?  Counted loops do not get safepoints at backedges. +static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width", +                                         cl::Hidden, cl::init(32));  // If true, split the backedge of a loop when placing the safepoint, otherwise  // split the latch block itself.  Both are useful to support for @@ -121,7 +123,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {    std::vector<TerminatorInst *> PollLocations;    /// True unless we're running spp-no-calls in which case we need to disable -  /// the call dependend placement opts. +  /// the call-dependent placement opts.    
bool CallSafepointsEnabled;    ScalarEvolution *SE = nullptr; @@ -142,7 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {    }    bool runOnFunction(Function &F) override { -    SE = &getAnalysis<ScalarEvolution>(); +    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();      for (auto I = LI->begin(), E = LI->end(); I != E; I++) { @@ -153,7 +155,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<DominatorTreeWrapperPass>(); -    AU.addRequired<ScalarEvolution>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>();      // We no longer modify the IR at all in this pass.  Thus all      // analysis are preserved. @@ -190,10 +192,8 @@ static void  InsertSafepointPoll(Instruction *InsertBefore,                      std::vector<CallSite> &ParsePointsNeeded /*rval*/); -static bool isGCLeafFunction(const CallSite &CS); -  static bool needsStatepoint(const CallSite &CS) { -  if (isGCLeafFunction(CS)) +  if (callsGCLeafFunction(CS))      return false;    if (CS.isCall()) {      CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -206,7 +206,7 @@ static bool needsStatepoint(const CallSite &CS) {    return true;  } -static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); +static Value *ReplaceWithStatepoint(const CallSite &CS);  /// Returns true if this loop is known to contain a call safepoint which  /// must unconditionally execute on any iteration of the loop which returns @@ -220,7 +220,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,    // For the moment, we look only for the 'cuts' that consist of a single call    // instruction in a block which is dominated by the Header and dominates the    // loop latch (Pred) block.  Somewhat surprisingly, walking the entire chain -  // of such dominating blocks gets substaintially more occurences than just +  // of such dominating blocks gets substantially more occurrences than just    // checking the Pred and Header blocks themselves.  This may be due to the    // density of loop exit conditions caused by range and null checks.    // TODO: structure this as an analysis pass, cache the result for subloops, @@ -255,18 +255,12 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,  /// conservatism in the analysis.  static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,                                      BasicBlock *Pred) { -  // Only used when SkipCounted is off -  const unsigned upperTripBound = 8192; -    // A conservative bound on the loop as a whole.    
const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); -  if (MaxTrips != SE->getCouldNotCompute()) { -    if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) -      return true; -    if (SkipCounted && -        SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) -      return true; -  } +  if (MaxTrips != SE->getCouldNotCompute() && +      SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( +          CountedLoopTripWidth)) +    return true;    // If this is a conditional branch to the header with the alternate path    // being outside the loop, we can ask questions about the execution frequency @@ -275,13 +269,10 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,      // This returns an exact expression only.  TODO: We really only need an      // upper bound here, but SE doesn't expose that.      const SCEV *MaxExec = SE->getExitCount(L, Pred); -    if (MaxExec != SE->getCouldNotCompute()) { -      if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) -        return true; -      if (SkipCounted && -          SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) +    if (MaxExec != SE->getCouldNotCompute() && +        SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN( +            CountedLoopTripWidth))          return true; -    }    }    return /* not finite */ false; @@ -432,14 +423,14 @@ static Instruction *findLocationForEntrySafepoint(Function &F,      assert(hasNextInstruction(I) &&             "first check if there is a next instruction!");      if (I->isTerminator()) { -      return I->getParent()->getUniqueSuccessor()->begin(); +      return &I->getParent()->getUniqueSuccessor()->front();      } else { -      return std::next(BasicBlock::iterator(I)); +      return &*++I->getIterator();      }    };    Instruction *cursor = nullptr; -  for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); +  for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor);         cursor = nextInstruction(cursor)) {      // We need to ensure a safepoint poll occurs before any 'real' call.  
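In source-level terms, the distinction mustBeFiniteCountedLoop draws looks roughly like the sketch below (illustrative only; the real test is on ScalarEvolution's max backedge-taken count against the spp-counted-loop-trip-width option, default 32 bits):

// A loop whose trip count provably fits in a narrow integer: the backedge
// safepoint poll can be omitted, since the mutator cannot stay here
// unboundedly long before reaching the next poll.
long bounded_sum(const int *a, unsigned n) {   // n gives a 32-bit bound
  long s = 0;
  for (unsigned i = 0; i < n; ++i)             // max backedge-taken count < 2^32
    s += a[i];
  return s;
}

// A loop with no computable bound: a poll is still needed on the backedge so
// the collector can eventually bring this thread to a safepoint.
void spin_until(volatile bool *flag) {
  while (!*flag) {
    /* a safepoint poll would be inserted here */
  }
}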
The @@ -466,7 +457,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F,  static void findCallSafepoints(Function &F,                                 std::vector<CallSite> &Found /*rval*/) {    assert(Found.empty() && "must be empty!"); -  for (Instruction &I : inst_range(F)) { +  for (Instruction &I : instructions(F)) {      Instruction *inst = &I;      if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {        CallSite CS(inst); @@ -713,7 +704,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {                                    Invoke->getParent());      } -    Value *GCResult = ReplaceWithStatepoint(CS, nullptr); +    Value *GCResult = ReplaceWithStatepoint(CS);      Results.push_back(GCResult);    }    assert(Results.size() == ParsePointNeeded.size()); @@ -747,7 +738,7 @@ FunctionPass *llvm::createPlaceSafepointsPass() {  INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,                        "place-backedge-safepoints-impl",                        "Place Backedge Safepoints", false, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, @@ -759,31 +750,6 @@ INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",  INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",                      false, false) -static bool isGCLeafFunction(const CallSite &CS) { -  Instruction *inst = CS.getInstruction(); -  if (isa<IntrinsicInst>(inst)) { -    // Most LLVM intrinsics are things which can never take a safepoint. -    // As a result, we don't need to have the stack parsable at the -    // callsite.  This is a highly useful optimization since intrinsic -    // calls are fairly prevelent, particularly in debug builds. -    return true; -  } - -  // If this function is marked explicitly as a leaf call, we don't need to -  // place a safepoint of it.  In fact, for correctness we *can't* in many -  // cases.  Note: Indirect calls return Null for the called function, -  // these obviously aren't runtime functions with attributes -  // TODO: Support attributes on the call site as well. -  const Function *F = CS.getCalledFunction(); -  bool isLeaf = -      F && -      F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); -  if (isLeaf) { -    return true; -  } -  return false; -} -  static void  InsertSafepointPoll(Instruction *InsertBefore,                      std::vector<CallSite> &ParsePointsNeeded /*rval*/) { @@ -796,6 +762,7 @@ InsertSafepointPoll(Instruction *InsertBefore,    // path call - where we need to insert a safepoint (parsepoint).    auto *F = M->getFunction(GCSafepointPollName); +  assert(F && "gc.safepoint_poll function is missing");    assert(F->getType()->getElementType() ==           FunctionType::get(Type::getVoidTy(M->getContext()), false) &&           "gc.safepoint_poll declared with wrong type"); @@ -864,10 +831,8 @@ InsertSafepointPoll(Instruction *InsertBefore,  /// Replaces the given call site (Call or Invoke) with a gc.statepoint  /// intrinsic with an empty deoptimization arguments list.  This does  /// NOT do explicit relocation for GC support. 
-static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ -                                    Pass *P) { -  assert(CS.getInstruction()->getParent()->getParent()->getParent() && -         "must be set"); +static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { +  assert(CS.getInstruction()->getModule() && "must be set");    // TODO: technically, a pass is not allowed to get functions from within a    // function pass since it might trigger a new function addition.  Refactor @@ -917,15 +882,10 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */        CS.getInstruction()->getContext(), AttributeSet::FunctionIndex,        AttrsToRemove); -  Value *StatepointTarget = NumPatchBytes == 0 -                                ? CS.getCalledValue() -                                : ConstantPointerNull::get(cast<PointerType>( -                                      CS.getCalledValue()->getType())); -    if (CS.isCall()) {      CallInst *ToReplace = cast<CallInst>(CS.getInstruction());      CallInst *Call = Builder.CreateGCStatepointCall( -        ID, NumPatchBytes, StatepointTarget, +        ID, NumPatchBytes, CS.getCalledValue(),          makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None,          "safepoint_token");      Call->setTailCall(ToReplace->isTailCall()); @@ -938,7 +898,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */      Token = Call; -    // Put the following gc_result and gc_relocate calls immediately after the +    // Put the following gc_result and gc_relocate calls immediately after      // the old call (which we're about to delete).      assert(ToReplace->getNextNode() && "not a terminator, must have next");      Builder.SetInsertPoint(ToReplace->getNextNode()); @@ -951,7 +911,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */      // original block.      Builder.SetInsertPoint(ToReplace->getParent());      InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( -        ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(), +        ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(),          ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()),          None, None, "safepoint_token"); @@ -967,7 +927,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */      // We'll insert the gc.result into the normal block      BasicBlock *NormalDest = ToReplace->getNormalDest();      // Can not insert gc.result in case of phi nodes preset. -    // Should have removed this cases prior to runnning this function +    // Should have removed this cases prior to running this function      assert(!isa<PHINode>(NormalDest->begin()));      Instruction *IP = &*(NormalDest->getFirstInsertionPt());      Builder.SetInsertPoint(IP); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index d1acf785d07e..fb970c747ce1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -26,6 +26,8 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DerivedTypes.h" @@ -62,7 +64,7 @@ namespace {  /// Print out the expression identified in the Ops list.  
///  static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { -  Module *M = I->getParent()->getParent()->getParent(); +  Module *M = I->getModule();    dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "         << *Ops[0].Op->getType() << '\t';    for (unsigned i = 0, e = Ops.size(); i != e; ++i) { @@ -82,20 +84,6 @@ namespace {      Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} -    /// \brief Sort factors by their Base. -    struct BaseSorter { -      bool operator()(const Factor &LHS, const Factor &RHS) { -        return LHS.Base < RHS.Base; -      } -    }; - -    /// \brief Compare factors for equal bases. -    struct BaseEqual { -      bool operator()(const Factor &LHS, const Factor &RHS) { -        return LHS.Base == RHS.Base; -      } -    }; -      /// \brief Sort factors in descending order by their power.      struct PowerDescendingSorter {        bool operator()(const Factor &LHS, const Factor &RHS) { @@ -172,6 +160,7 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.setPreservesCFG(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }    private:      void BuildRankMap(Function &F); @@ -255,27 +244,6 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,    return nullptr;  } -static bool isUnmovableInstruction(Instruction *I) { -  switch (I->getOpcode()) { -  case Instruction::PHI: -  case Instruction::LandingPad: -  case Instruction::Alloca: -  case Instruction::Load: -  case Instruction::Invoke: -  case Instruction::UDiv: -  case Instruction::SDiv: -  case Instruction::FDiv: -  case Instruction::URem: -  case Instruction::SRem: -  case Instruction::FRem: -    return true; -  case Instruction::Call: -    return !isa<DbgInfoIntrinsic>(I); -  default: -    return false; -  } -} -  void Reassociate::BuildRankMap(Function &F) {    unsigned i = 2; @@ -295,7 +263,7 @@ void Reassociate::BuildRankMap(Function &F) {      // we cannot move.  This ensures that the ranks for these instructions are      // all different in the block.      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) -      if (isUnmovableInstruction(I)) +      if (mayBeMemoryDependent(*I))          ValueRankMap[&*I] = ++BBRank;    }  } @@ -913,7 +881,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,  /// that computes the negative version of the value specified.  The negative  /// version of the value is returned, and BI is left pointing at the instruction  /// that should be processed next by the reassociation pass. -static Value *NegateValue(Value *V, Instruction *BI) { +/// Also add intermediate instructions to the redo list that are modified while +/// pushing the negates through adds.  These will be revisited to see if +/// additional opportunities have been exposed. +static Value *NegateValue(Value *V, Instruction *BI, +                          SetVector<AssertingVH<Instruction>> &ToRedo) {    if (Constant *C = dyn_cast<Constant>(V)) {      if (C->getType()->isFPOrFPVectorTy()) {        return ConstantExpr::getFNeg(C); @@ -934,8 +906,8 @@ static Value *NegateValue(Value *V, Instruction *BI) {    if (BinaryOperator *I =            isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {      // Push the negates through the add. 
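The algebraic rewrite applied when the negate is pushed through an add, shown on plain integers (a sketch only; the pass additionally records the rewritten adds in ToRedo so they are revisited for further reassociation):

// -(A + B) is rewritten as (-A) + (-B). For wrapping two's-complement
// arithmetic this is an exact identity, which is also why the nsw/nuw flags
// are dropped on the transformed add above.
unsigned negate_of_add(unsigned a, unsigned b) {
  return 0u - (a + b);
}

unsigned add_of_negates(unsigned a, unsigned b) {
  return (0u - a) + (0u - b);   // same value for all a, b
}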
-    I->setOperand(0, NegateValue(I->getOperand(0), BI)); -    I->setOperand(1, NegateValue(I->getOperand(1), BI)); +    I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); +    I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));      if (I->getOpcode() == Instruction::Add) {        I->setHasNoUnsignedWrap(false);        I->setHasNoSignedWrap(false); @@ -948,6 +920,10 @@ static Value *NegateValue(Value *V, Instruction *BI) {      //      I->moveBefore(BI);      I->setName(I->getName()+".neg"); + +    // Add the intermediate negates to the redo list as processing them later +    // could expose more reassociating opportunities. +    ToRedo.insert(I);      return I;    } @@ -972,26 +948,28 @@ static Value *NegateValue(Value *V, Instruction *BI) {        if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {          InsertPt = II->getNormalDest()->begin();        } else { -        InsertPt = InstInput; -        ++InsertPt; +        InsertPt = ++InstInput->getIterator();        }        while (isa<PHINode>(InsertPt)) ++InsertPt;      } else {        InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();      } -    TheNeg->moveBefore(InsertPt); +    TheNeg->moveBefore(&*InsertPt);      if (TheNeg->getOpcode() == Instruction::Sub) {        TheNeg->setHasNoUnsignedWrap(false);        TheNeg->setHasNoSignedWrap(false);      } else {        TheNeg->andIRFlags(BI);      } +    ToRedo.insert(TheNeg);      return TheNeg;    }    // Insert a 'neg' instruction that subtracts the value from zero to get the    // negation. -  return CreateNeg(V, V->getName() + ".neg", BI, BI); +  BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); +  ToRedo.insert(NewNeg); +  return NewNeg;  }  /// Return true if we should break up this subtract of X-Y into (X + -Y). @@ -1025,14 +1003,15 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {  /// If we have (X-Y), and if either X is an add, or if this is only used by an  /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {    // Convert a subtract into an add and a neg instruction. This allows sub    // instructions to be commuted with other add instructions.    //    // Calculate the negative value of Operand 1 of the sub instruction,    // and set it as the RHS of the add instruction we just made.    // -  Value *NegVal = NegateValue(Sub->getOperand(1), Sub); +  Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);    BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);    Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.    Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. @@ -1166,7 +1145,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {      return nullptr;    } -  BasicBlock::iterator InsertPt = BO; ++InsertPt; +  BasicBlock::iterator InsertPt = ++BO->getIterator();    // If this was just a single multiply, remove the multiply and return the only    // remaining operand. 
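And the subtract case feeding into it: BreakUpSubtract turns X - Y into X + (-Y) so the operand can join an add tree. A hypothetical source-level example (not from the patch) of the kind of simplification this exposes:

// (a + b) - b looks like a subtract of two unrelated values. Rewritten as
// (a + b) + (-b), the whole expression becomes one add tree that the
// reassociation machinery can flatten to just 'a'.
int before_breakup(int a, int b) {
  int sum = a + b;
  return sum - b;          // the subtract hides the opportunity
}

int after_breakup(int a, int b) {
  int sum = a + b;
  int negb = -b;           // the inserted 0 - b negate
  return sum + negb;       // add tree: a + b + (-b)  ==>  a
}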
@@ -1179,7 +1158,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {    }    if (NeedsNegate) -    V = CreateNeg(V, "neg", InsertPt, BO); +    V = CreateNeg(V, "neg", &*InsertPt, BO);    return V;  } @@ -1250,7 +1229,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode,    return nullptr;  } -/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// Helper function of CombineXorOpnd(). It creates a bitwise-and  /// instruction with the given two operands, and return the resulting  /// instruction. There are two special cases: 1) if the constant operand is 0,  /// it will return NULL. 2) if the constant is ~0, the symbolic operand will @@ -2083,7 +2062,7 @@ void Reassociate::OptimizeInst(Instruction *I) {      return;    // Don't optimize floating point instructions that don't have unsafe algebra. -  if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) +  if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())      return;    // Do not reassociate boolean (i1) expressions.  We want to preserve the @@ -2099,7 +2078,7 @@ void Reassociate::OptimizeInst(Instruction *I) {    // see if we can convert it to X+-Y.    if (I->getOpcode() == Instruction::Sub) {      if (ShouldBreakUpSubtract(I)) { -      Instruction *NI = BreakUpSubtract(I); +      Instruction *NI = BreakUpSubtract(I, RedoInsts);        RedoInsts.insert(I);        MadeChange = true;        I = NI; @@ -2110,6 +2089,12 @@ void Reassociate::OptimizeInst(Instruction *I) {            (!I->hasOneUse() ||             !isReassociableOp(I->user_back(), Instruction::Mul))) {          Instruction *NI = LowerNegateToMultiply(I); +        // If the negate was simplified, revisit the users to see if we can +        // reassociate further. +        for (User *U : NI->users()) { +          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) +            RedoInsts.insert(Tmp); +        }          RedoInsts.insert(I);          MadeChange = true;          I = NI; @@ -2117,7 +2102,7 @@ void Reassociate::OptimizeInst(Instruction *I) {      }    } else if (I->getOpcode() == Instruction::FSub) {      if (ShouldBreakUpSubtract(I)) { -      Instruction *NI = BreakUpSubtract(I); +      Instruction *NI = BreakUpSubtract(I, RedoInsts);        RedoInsts.insert(I);        MadeChange = true;        I = NI; @@ -2127,7 +2112,13 @@ void Reassociate::OptimizeInst(Instruction *I) {        if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&            (!I->hasOneUse() ||             !isReassociableOp(I->user_back(), Instruction::FMul))) { +        // If the negate was simplified, revisit the users to see if we can +        // reassociate further.          Instruction *NI = LowerNegateToMultiply(I); +        for (User *U : NI->users()) { +          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) +            RedoInsts.insert(Tmp); +        }          RedoInsts.insert(I);          MadeChange = true;          I = NI; @@ -2142,8 +2133,14 @@ void Reassociate::OptimizeInst(Instruction *I) {    // If this is an interior node of a reassociable tree, ignore it until we    // get to the root of the tree, to avoid N^2 analysis.    unsigned Opcode = BO->getOpcode(); -  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) +  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { +    // During the initial run we will get to the root of the tree. +    // But if we get here while we are redoing instructions, there is no +    // guarantee that the root will be visited. 
So Redo later +    if (BO->user_back() != BO) +      RedoInsts.insert(BO->user_back());      return; +  }    // If this is an add tree that is used by a sub instruction, ignore it    // until we process the subtract. @@ -2250,10 +2247,10 @@ bool Reassociate::runOnFunction(Function &F) {    for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {      // Optimize every instruction in the basic block.      for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) -      if (isInstructionTriviallyDead(II)) { -        EraseInst(II++); +      if (isInstructionTriviallyDead(&*II)) { +        EraseInst(&*II++);        } else { -        OptimizeInst(II); +        OptimizeInst(&*II);          assert(II->getParent() == BI && "Moved to a different block!");          ++II;        } diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 1b46727c17bb..915f89780c08 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -82,10 +82,9 @@ bool RegToMem::runOnFunction(Function &F) {    BasicBlock::iterator I = BBEntry->begin();    while (isa<AllocaInst>(I)) ++I; -  CastInst *AllocaInsertionPoint = -    new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), -                    Type::getInt32Ty(F.getContext()), -                    "reg2mem alloca point", I); +  CastInst *AllocaInsertionPoint = new BitCastInst( +      Constant::getNullValue(Type::getInt32Ty(F.getContext())), +      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);    // Find the escaped instructions. But don't create stack slots for    // allocas in entry block. @@ -95,7 +94,7 @@ bool RegToMem::runOnFunction(Function &F) {      for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();           iib != iie; ++iib) {        if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && -          valueEscapes(iib)) { +          valueEscapes(&*iib)) {          WorkList.push_front(&*iib);        }      } diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ae2ae3af0c7a..db127c3f7b4e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,12 +14,14 @@  #include "llvm/Pass.h"  #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/ADT/SetOperations.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/DenseSet.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/MapVector.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/Dominators.h" @@ -46,10 +48,6 @@  using namespace llvm; -// Print tracing output -static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, -                              cl::init(false)); -  // Print the liveset found at the insert location  static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,                                    cl::init(false)); @@ -74,6 +72,12 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",                                                    cl::location(ClobberNonLive),                                                    cl::Hidden); +static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, +                   
                  cl::init(false)); +static cl::opt<bool> +    AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", +                                   cl::Hidden, cl::init(true)); +  namespace {  struct RewriteStatepointsForGC : public ModulePass {    static char ID; // Pass identification, replacement for typeid @@ -88,10 +92,10 @@ struct RewriteStatepointsForGC : public ModulePass {        Changed |= runOnFunction(F);      if (Changed) { -      // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn +      // stripNonValidAttributes asserts that shouldRewriteStatepointsIn        // returns true for at least one function in the module.  Since at least        // one function changed, we know that the precondition is satisfied. -      stripDereferenceabilityInfo(M); +      stripNonValidAttributes(M);      }      return Changed; @@ -108,15 +112,16 @@ struct RewriteStatepointsForGC : public ModulePass {    /// dereferenceability that are no longer valid/correct after    /// RewriteStatepointsForGC has run.  This is because semantically, after    /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire -  /// heap.  stripDereferenceabilityInfo (conservatively) restores correctness +  /// heap.  stripNonValidAttributes (conservatively) restores correctness    /// by erasing all attributes in the module that externally imply    /// dereferenceability. -  /// -  void stripDereferenceabilityInfo(Module &M); +  /// Similar reasoning also applies to the noalias attributes. gc.statepoint +  /// can touch the entire heap including noalias objects. +  void stripNonValidAttributes(Module &M); -  // Helpers for stripDereferenceabilityInfo -  void stripDereferenceabilityInfoFromBody(Function &F); -  void stripDereferenceabilityInfoFromPrototype(Function &F); +  // Helpers for stripNonValidAttributes +  void stripNonValidAttributesFromBody(Function &F); +  void stripNonValidAttributesFromPrototype(Function &F);  };  } // namespace @@ -160,15 +165,16 @@ struct GCPtrLivenessData {  // base relation will remain.  Internally, we add a mixture of the two  // types, then update all the second type to the first type  typedef DenseMap<Value *, Value *> DefiningValueMapTy; -typedef DenseSet<llvm::Value *> StatepointLiveSetTy; -typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy; +typedef DenseSet<Value *> StatepointLiveSetTy; +typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>> +  RematerializedValueMapTy;  struct PartiallyConstructedSafepointRecord { -  /// The set of values known to be live accross this safepoint -  StatepointLiveSetTy liveset; +  /// The set of values known to be live across this safepoint +  StatepointLiveSetTy LiveSet;    /// Mapping from live pointers to a base-defining-value -  DenseMap<llvm::Value *, llvm::Value *> PointerToBase; +  DenseMap<Value *, Value *> PointerToBase;    /// The *new* gc.statepoint instruction itself.  This produces the token    /// that normal path gc.relocates and the gc.result are tied to. @@ -179,12 +185,26 @@ struct PartiallyConstructedSafepointRecord {    Instruction *UnwindToken;    /// Record live values we are rematerialized instead of relocating. -  /// They are not included into 'liveset' field. +  /// They are not included into 'LiveSet' field.    /// Maps rematerialized copy to it's original value.    
RematerializedValueMapTy RematerializedValues;  };  } +static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { +  assert(UseDeoptBundles && "Should not be called otherwise!"); + +  Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + +  if (!DeoptBundle.hasValue()) { +    assert(AllowStatepointWithNoDeoptInfo && +           "Found non-leaf call without deopt info!"); +    return None; +  } + +  return DeoptBundle.getValue().Inputs; +} +  /// Compute the live-in set for every basic block in the function  static void computeLiveInValues(DominatorTree &DT, Function &F,                                  GCPtrLivenessData &Data); @@ -195,10 +215,10 @@ static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,                                StatepointLiveSetTy &out);  // TODO: Once we can get to the GCStrategy, this becomes -// Optional<bool> isGCManagedPointer(const Value *V) const override { +// Optional<bool> isGCManagedPointer(const Type *Ty) const override { -static bool isGCPointerType(const Type *T) { -  if (const PointerType *PT = dyn_cast<PointerType>(T)) +static bool isGCPointerType(Type *T) { +  if (auto *PT = dyn_cast<PointerType>(T))      // For the sake of this example GC, we arbitrarily pick addrspace(1) as our      // GC managed heap.  We know that a pointer into this heap needs to be      // updated and that no other pointer does. @@ -233,9 +253,8 @@ static bool containsGCPtrType(Type *Ty) {    if (ArrayType *AT = dyn_cast<ArrayType>(Ty))      return containsGCPtrType(AT->getElementType());    if (StructType *ST = dyn_cast<StructType>(Ty)) -    return std::any_of( -        ST->subtypes().begin(), ST->subtypes().end(), -        [](Type *SubType) { return containsGCPtrType(SubType); }); +    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), +                       containsGCPtrType);    return false;  } @@ -247,7 +266,7 @@ static bool isUnhandledGCPointerType(Type *Ty) {  }  #endif -static bool order_by_name(llvm::Value *a, llvm::Value *b) { +static bool order_by_name(Value *a, Value *b) {    if (a->hasName() && b->hasName()) {      return -1 == a->getName().compare(b->getName());    } else if (a->hasName() && !b->hasName()) { @@ -260,6 +279,13 @@ static bool order_by_name(llvm::Value *a, llvm::Value *b) {    }  } +// Return the name of the value suffixed with the provided value, or if the +// value didn't have a name, the default value specified. +static std::string suffixed_name_or(Value *V, StringRef Suffix, +                                    StringRef DefaultName) { +  return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); +} +  // Conservatively identifies any definitions which might be live at the  // given instruction. The  analysis is performed immediately before the  // given instruction. 
Values defined by that instruction are not considered @@ -269,30 +295,56 @@ static void analyzeParsePointLiveness(      const CallSite &CS, PartiallyConstructedSafepointRecord &result) {    Instruction *inst = CS.getInstruction(); -  StatepointLiveSetTy liveset; -  findLiveSetAtInst(inst, OriginalLivenessData, liveset); +  StatepointLiveSetTy LiveSet; +  findLiveSetAtInst(inst, OriginalLivenessData, LiveSet);    if (PrintLiveSet) {      // Note: This output is used by several of the test cases -    // The order of elemtns in a set is not stable, put them in a vec and sort +    // The order of elements in a set is not stable, put them in a vec and sort      // by name -    SmallVector<Value *, 64> temp; -    temp.insert(temp.end(), liveset.begin(), liveset.end()); -    std::sort(temp.begin(), temp.end(), order_by_name); +    SmallVector<Value *, 64> Temp; +    Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); +    std::sort(Temp.begin(), Temp.end(), order_by_name);      errs() << "Live Variables:\n"; -    for (Value *V : temp) { -      errs() << " " << V->getName(); // no newline -      V->dump(); -    } +    for (Value *V : Temp) +      dbgs() << " " << V->getName() << " " << *V << "\n";    }    if (PrintLiveSetSize) {      errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; -    errs() << "Number live values: " << liveset.size() << "\n"; +    errs() << "Number live values: " << LiveSet.size() << "\n"; +  } +  result.LiveSet = LiveSet; +} + +static bool isKnownBaseResult(Value *V); +namespace { +/// A single base defining value - An immediate base defining value for an +/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. +/// For instructions which have multiple pointer [vector] inputs or that +/// transition between vector and scalar types, there is no immediate base +/// defining value.  The 'base defining value' for 'Def' is the transitive +/// closure of this relation stopping at the first instruction which has no +/// immediate base defining value.  The b.d.v. might itself be a base pointer, +/// but it can also be an arbitrary derived pointer.  +struct BaseDefiningValueResult { +  /// Contains the value which is the base defining value. +  Value * const BDV; +  /// True if the base defining value is also known to be an actual base +  /// pointer. +  const bool IsKnownBase; +  BaseDefiningValueResult(Value *BDV, bool IsKnownBase) +    : BDV(BDV), IsKnownBase(IsKnownBase) { +#ifndef NDEBUG +    // Check consistency between new and old means of checking whether a BDV is +    // a base. +    bool MustBeBase = isKnownBaseResult(BDV); +    assert(!MustBeBase || MustBeBase == IsKnownBase); +#endif    } -  result.liveset = liveset; +};  } -static Value *findBaseDefiningValue(Value *I); +static BaseDefiningValueResult findBaseDefiningValue(Value *I);  /// Return a base defining value for the 'Index' element of the given vector  /// instruction 'I'.  If Index is null, returns a BDV for the entire vector @@ -303,8 +355,8 @@ static Value *findBaseDefiningValue(Value *I);  /// vector returned is a BDV (and possibly a base) of the entire vector 'I'.  /// If the later, the return pointer is a BDV (or possibly a base) for the  /// particular element in 'I'.   
-static std::pair<Value *, bool> -findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { +static BaseDefiningValueResult +findBaseDefiningValueOfVector(Value *I) {    assert(I->getType()->isVectorTy() &&           cast<VectorType>(I->getType())->getElementType()->isPointerTy() &&           "Illegal to ask for the base pointer of a non-pointer type"); @@ -314,7 +366,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) {    if (isa<Argument>(I))      // An incoming argument to the function is a base pointer -    return std::make_pair(I, true); +    return BaseDefiningValueResult(I, true);    // We shouldn't see the address of a global as a vector value?    assert(!isa<GlobalVariable>(I) && @@ -325,7 +377,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) {    if (isa<UndefValue>(I))      // utterly meaningless, but useful for dealing with partially optimized      // code. -    return std::make_pair(I, true); +    return BaseDefiningValueResult(I, true);    // Due to inheritance, this must be _after_ the global variable and undef    // checks @@ -333,31 +385,17 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) {      assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&             "order of checks wrong!");      assert(Con->isNullValue() && "null is the only case which makes sense"); -    return std::make_pair(Con, true); +    return BaseDefiningValueResult(Con, true);    }    if (isa<LoadInst>(I)) -    return std::make_pair(I, true); -   -  // For an insert element, we might be able to look through it if we know -  // something about the indexes. -  if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) { -    if (Index) { -      Value *InsertIndex = IEI->getOperand(2); -      // This index is inserting the value, look for its BDV -      if (InsertIndex == Index) -        return std::make_pair(findBaseDefiningValue(IEI->getOperand(1)), false); -      // Both constant, and can't be equal per above. This insert is definitely -      // not relevant, look back at the rest of the vector and keep trying. -      if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex)) -        return findBaseDefiningValueOfVector(IEI->getOperand(0), Index); -    } -     +    return BaseDefiningValueResult(I, true); + +  if (isa<InsertElementInst>(I))      // We don't know whether this vector contains entirely base pointers or      // not.  To be conservatively correct, we treat it as a BDV and will      // duplicate code as needed to construct a parallel vector of bases. -    return std::make_pair(IEI, false); -  } +    return BaseDefiningValueResult(I, false);    if (isa<ShuffleVectorInst>(I))      // We don't know whether this vector contains entirely base pointers or @@ -365,105 +403,62 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) {      // duplicate code as needed to construct a parallel vector of bases.      // TODO: There a number of local optimizations which could be applied here      // for particular sufflevector patterns. -    return std::make_pair(I, false); +    return BaseDefiningValueResult(I, false);    // A PHI or Select is a base defining value.  The outer findBasePointer    // algorithm is responsible for constructing a base value for this BDV.    
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&           "unknown vector instruction - no base found for vector element"); -  return std::make_pair(I, false); +  return BaseDefiningValueResult(I, false);  } -static bool isKnownBaseResult(Value *V); -  /// Helper function for findBasePointer - Will return a value which either a) -/// defines the base pointer for the input or b) blocks the simple search -/// (i.e. a PHI or Select of two derived pointers) -static Value *findBaseDefiningValue(Value *I) { +/// defines the base pointer for the input, b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers), or c) involves a change +/// from pointer to vector type or back. +static BaseDefiningValueResult findBaseDefiningValue(Value *I) {    if (I->getType()->isVectorTy()) -    return findBaseDefiningValueOfVector(I).first; +    return findBaseDefiningValueOfVector(I);    assert(I->getType()->isPointerTy() &&           "Illegal to ask for the base pointer of a non-pointer type"); -  // This case is a bit of a hack - it only handles extracts from vectors which -  // trivially contain only base pointers or cases where we can directly match -  // the index of the original extract element to an insertion into the vector. -  // See note inside the function for how to improve this. -  if (auto *EEI = dyn_cast<ExtractElementInst>(I)) { -    Value *VectorOperand = EEI->getVectorOperand(); -    Value *Index = EEI->getIndexOperand(); -    std::pair<Value *, bool> pair = -      findBaseDefiningValueOfVector(VectorOperand, Index); -    Value *VectorBase = pair.first; -    if (VectorBase->getType()->isPointerTy()) -      // We found a BDV for this specific element with the vector.  This is an -      // optimization, but in practice it covers most of the useful cases -      // created via scalarization. -      return VectorBase; -    else { -      assert(VectorBase->getType()->isVectorTy()); -      if (pair.second) -        // If the entire vector returned is known to be entirely base pointers, -        // then the extractelement is valid base for this value. -        return EEI; -      else { -        // Otherwise, we have an instruction which potentially produces a -        // derived pointer and we need findBasePointers to clone code for us -        // such that we can create an instruction which produces the -        // accompanying base pointer. -        // Note: This code is currently rather incomplete.  We don't currently -        // support the general form of shufflevector of insertelement. -        // Conceptually, these are just 'base defining values' of the same -        // variety as phi or select instructions.  We need to update the -        // findBasePointers algorithm to insert new 'base-only' versions of the -        // original instructions. This is relative straight forward to do, but -        // the case which would motivate the work hasn't shown up in real -        // workloads yet.   
-        assert((isa<PHINode>(VectorBase) || isa<SelectInst>(VectorBase)) && -               "need to extend findBasePointers for generic vector" -               "instruction cases"); -        return VectorBase; -      } -    } -  } -    if (isa<Argument>(I))      // An incoming argument to the function is a base pointer      // We should have never reached here if this argument isn't an gc value -    return I; +    return BaseDefiningValueResult(I, true);    if (isa<GlobalVariable>(I))      // base case -    return I; +    return BaseDefiningValueResult(I, true);    // inlining could possibly introduce phi node that contains    // undef if callee has multiple returns    if (isa<UndefValue>(I))      // utterly meaningless, but useful for dealing with      // partially optimized code. -    return I; +    return BaseDefiningValueResult(I, true);    // Due to inheritance, this must be _after_ the global variable and undef    // checks -  if (Constant *Con = dyn_cast<Constant>(I)) { +  if (isa<Constant>(I)) {      assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&             "order of checks wrong!"); -    // Note: Finding a constant base for something marked for relocation -    // doesn't really make sense.  The most likely case is either a) some -    // screwed up the address space usage or b) your validating against -    // compiled C++ code w/o the proper separation.  The only real exception -    // is a null pointer.  You could have generic code written to index of -    // off a potentially null value and have proven it null.  We also use -    // null pointers in dead paths of relocation phis (which we might later -    // want to find a base pointer for). -    assert(isa<ConstantPointerNull>(Con) && -           "null is the only case which makes sense"); -    return Con; +    // Note: Even for frontends which don't have constant references, we can +    // see constants appearing after optimizations.  A simple example is +    // specialization of an address computation on null feeding into a merge +    // point where the actual use of the now-constant input is protected by +    // another null check.  (e.g. test4 in constants.ll) +    return BaseDefiningValueResult(I, true);    }    if (CastInst *CI = dyn_cast<CastInst>(I)) {      Value *Def = CI->stripPointerCasts(); +    // If stripping pointer casts changes the address space there is an +    // addrspacecast in between. +    assert(cast<PointerType>(Def->getType())->getAddressSpace() == +               cast<PointerType>(CI->getType())->getAddressSpace() && +           "unsupported addrspacecast");      // If we find a cast instruction here, it means we've found a cast which is      // not simply a pointer cast (i.e. an inttoptr).  We don't know how to      // handle int->ptr conversion. 
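// A minimal, hypothetical sketch (not part of this patch; names are
// illustrative only) of the scalar walk that findBaseDefiningValue above
// performs: instructions that merely offset or re-type a pointer are looked
// through, while everything else is either a base in its own right
// (argument, load, call, constant) or blocks the search (phi/select) and is
// returned unchanged for the caller to resolve.
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Value *findBaseDefiningValueSketch(Value *V) {
  if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
    // A GEP only derives a pointer; its base is the base of its pointer
    // operand.
    return findBaseDefiningValueSketch(GEP->getPointerOperand());
  if (auto *CI = dyn_cast<CastInst>(V)) {
    Value *Def = CI->stripPointerCasts();
    // Only plain pointer-to-pointer casts are looked through; anything else
    // (e.g. an inttoptr) is left alone here.
    if (Def != CI)
      return findBaseDefiningValueSketch(Def);
  }
  // Arguments, loads, calls, constants: bases.  Phis, selects and the vector
  // cases: base *defining* values which the main algorithm must resolve.
  return V;
}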
@@ -472,7 +467,9 @@ static Value *findBaseDefiningValue(Value *I) {    }    if (isa<LoadInst>(I)) -    return I; // The value loaded is an gc base itself +    // The value loaded is an gc base itself +    return BaseDefiningValueResult(I, true); +      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))      // The base of this GEP is the base @@ -480,14 +477,11 @@ static Value *findBaseDefiningValue(Value *I) {    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {      switch (II->getIntrinsicID()) { -    case Intrinsic::experimental_gc_result_ptr:      default:        // fall through to general call handling        break;      case Intrinsic::experimental_gc_statepoint: -    case Intrinsic::experimental_gc_result_float: -    case Intrinsic::experimental_gc_result_int: -      llvm_unreachable("these don't produce pointers"); +      llvm_unreachable("statepoints don't produce pointers");      case Intrinsic::experimental_gc_relocate: {        // Rerunning safepoint insertion after safepoints are already        // inserted is not supported.  It could probably be made to work, @@ -506,17 +500,17 @@ static Value *findBaseDefiningValue(Value *I) {    // pointers.  This should probably be generalized via attributes to support    // both source language and internal functions.    if (isa<CallInst>(I) || isa<InvokeInst>(I)) -    return I; +    return BaseDefiningValueResult(I, true);    // I have absolutely no idea how to implement this part yet.  It's not -  // neccessarily hard, I just haven't really looked at it yet. +  // necessarily hard, I just haven't really looked at it yet.    assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");    if (isa<AtomicCmpXchgInst>(I))      // A CAS is effectively a atomic store and load combined under a      // predicate.  From the perspective of base pointers, we just treat it      // like a load. -    return I; +    return BaseDefiningValueResult(I, true);    assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "                                     "binary ops which don't apply to pointers"); @@ -525,34 +519,41 @@ static Value *findBaseDefiningValue(Value *I) {    // stack, but in either case, this is simply a field load.  As a result,    // this is a defining definition of the base just like a load is.    if (isa<ExtractValueInst>(I)) -    return I; +    return BaseDefiningValueResult(I, true);    // We should never see an insert vector since that would require we be    // tracing back a struct value not a pointer value.    assert(!isa<InsertValueInst>(I) &&           "Base pointer for a struct is meaningless"); +  // An extractelement produces a base result exactly when it's input does. +  // We may need to insert a parallel instruction to extract the appropriate +  // element out of the base vector corresponding to the input. Given this, +  // it's analogous to the phi and select case even though it's not a merge. +  if (isa<ExtractElementInst>(I)) +    // Note: There a lot of obvious peephole cases here.  This are deliberately +    // handled after the main base pointer inference algorithm to make writing +    // test cases to exercise that code easier. +    return BaseDefiningValueResult(I, false); +    // The last two cases here don't return a base pointer.  Instead, they -  // return a value which dynamically selects from amoung several base +  // return a value which dynamically selects from among several base    // derived pointers (each with it's own base potentially).  
It's the job of    // the caller to resolve these.    assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&           "missing instruction case in findBaseDefiningValing"); -  return I; +  return BaseDefiningValueResult(I, false);  }  /// Returns the base defining value for this value.  static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {    Value *&Cached = Cache[I];    if (!Cached) { -    Cached = findBaseDefiningValue(I); +    Cached = findBaseDefiningValue(I).BDV; +    DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " +                 << Cached->getName() << "\n");    }    assert(Cache[I] != nullptr); - -  if (TraceLSP) { -    dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName() -           << "\n"; -  }    return Cached;  } @@ -572,7 +573,9 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {  /// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,  /// is it known to be a base pointer?  Or do we need to continue searching.  static bool isKnownBaseResult(Value *V) { -  if (!isa<PHINode>(V) && !isa<SelectInst>(V)) { +  if (!isa<PHINode>(V) && !isa<SelectInst>(V) && +      !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) && +      !isa<ShuffleVectorInst>(V)) {      // no recursion possible      return true;    } @@ -587,17 +590,19 @@ static bool isKnownBaseResult(Value *V) {    return false;  } -// TODO: find a better name for this  namespace { -class PhiState { +/// Models the state of a single base defining value in the findBasePointer +/// algorithm for determining where a new instruction is needed to propagate +/// the base of this BDV. +class BDVState {  public:    enum Status { Unknown, Base, Conflict }; -  PhiState(Status s, Value *b = nullptr) : status(s), base(b) { +  BDVState(Status s, Value *b = nullptr) : status(s), base(b) {      assert(status != Base || b);    } -  PhiState(Value *b) : status(Base), base(b) {} -  PhiState() : status(Unknown), base(nullptr) {} +  explicit BDVState(Value *b) : status(Base), base(b) {} +  BDVState() : status(Unknown), base(nullptr) {}    Status getStatus() const { return status; }    Value *getBase() const { return base; } @@ -606,72 +611,80 @@ public:    bool isUnknown() const { return getStatus() == Unknown; }    bool isConflict() const { return getStatus() == Conflict; } -  bool operator==(const PhiState &other) const { +  bool operator==(const BDVState &other) const {      return base == other.base && status == other.status;    } -  bool operator!=(const PhiState &other) const { return !(*this == other); } +  bool operator!=(const BDVState &other) const { return !(*this == other); } -  void dump() { -    errs() << status << " (" << base << " - " -           << (base ? base->getName() : "nullptr") << "): "; +  LLVM_DUMP_METHOD +  void dump() const { print(dbgs()); dbgs() << '\n'; } +   +  void print(raw_ostream &OS) const { +    switch (status) { +    case Unknown: +      OS << "U"; +      break; +    case Base: +      OS << "B"; +      break; +    case Conflict: +      OS << "C"; +      break; +    }; +    OS << " (" << base << " - " +       << (base ? 
base->getName() : "nullptr") << "): ";    }  private:    Status status; -  Value *base; // non null only if status == base +  AssertingVH<Value> base; // non null only if status == base  }; +} -typedef DenseMap<Value *, PhiState> ConflictStateMapTy; -// Values of type PhiState form a lattice, and this is a helper +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { +  State.print(OS); +  return OS; +} +#endif + +namespace { +// Values of type BDVState form a lattice, and this is a helper  // class that implementes the meet operation.  The meat of the meet -// operation is implemented in MeetPhiStates::pureMeet -class MeetPhiStates { +// operation is implemented in MeetBDVStates::pureMeet +class MeetBDVStates {  public: -  // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. -  explicit MeetPhiStates(const ConflictStateMapTy &phiStates) -      : phiStates(phiStates) {} - -  // Destructively meet the current result with the base V.  V can -  // either be a merge instruction (SelectInst / PHINode), in which -  // case its status is looked up in the phiStates map; or a regular -  // SSA value, in which case it is assumed to be a base. -  void meetWith(Value *V) { -    PhiState otherState = getStateForBDV(V); -    assert((MeetPhiStates::pureMeet(otherState, currentResult) == -            MeetPhiStates::pureMeet(currentResult, otherState)) && -           "math is wrong: meet does not commute!"); -    currentResult = MeetPhiStates::pureMeet(otherState, currentResult); +  /// Initializes the currentResult to the TOP state so that if can be met with +  /// any other state to produce that state. +  MeetBDVStates() {} + +  // Destructively meet the current result with the given BDVState +  void meetWith(BDVState otherState) { +    currentResult = meet(otherState, currentResult);    } -  PhiState getResult() const { return currentResult; } +  BDVState getResult() const { return currentResult; }  private: -  const ConflictStateMapTy &phiStates; -  PhiState currentResult; - -  /// Return a phi state for a base defining value.  We'll generate a new -  /// base state for known bases and expect to find a cached state otherwise -  PhiState getStateForBDV(Value *baseValue) { -    if (isKnownBaseResult(baseValue)) { -      return PhiState(baseValue); -    } else { -      return lookupFromMap(baseValue); -    } -  } +  BDVState currentResult; -  PhiState lookupFromMap(Value *V) { -    auto I = phiStates.find(V); -    assert(I != phiStates.end() && "lookup failed!"); -    return I->second; +  /// Perform a meet operation on two elements of the BDVState lattice. 
+  static BDVState meet(BDVState LHS, BDVState RHS) { +    assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && +           "math is wrong: meet does not commute!"); +    BDVState Result = pureMeet(LHS, RHS); +    DEBUG(dbgs() << "meet of " << LHS << " with " << RHS +                 << " produced " << Result << "\n"); +    return Result;    } -  static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { +  static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) {      switch (stateA.getStatus()) { -    case PhiState::Unknown: +    case BDVState::Unknown:        return stateB; -    case PhiState::Base: +    case BDVState::Base:        assert(stateA.getBase() && "can't be null");        if (stateB.isUnknown())          return stateA; @@ -681,18 +694,20 @@ private:            assert(stateA == stateB && "equality broken!");            return stateA;          } -        return PhiState(PhiState::Conflict); +        return BDVState(BDVState::Conflict);        }        assert(stateB.isConflict() && "only three states!"); -      return PhiState(PhiState::Conflict); +      return BDVState(BDVState::Conflict); -    case PhiState::Conflict: +    case BDVState::Conflict:        return stateA;      }      llvm_unreachable("only three states!");    }  };  } + +  /// For a given value or instruction, figure out what base ptr it's derived  /// from.  For gc objects, this is simply itself.  On success, returns a value  /// which is the base pointer.  (This is reliable and can be used for @@ -723,171 +738,252 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {    //    // Note: A simpler form of this would be to add the conflict form of all    // PHIs without running the optimistic algorithm.  This would be -  // analougous to pessimistic data flow and would likely lead to an +  // analogous to pessimistic data flow and would likely lead to an    // overall worse solution. -  ConflictStateMapTy states; -  states[def] = PhiState(); -  // Recursively fill in all phis & selects reachable from the initial one -  // for which we don't already know a definite base value for -  // TODO: This should be rewritten with a worklist -  bool done = false; -  while (!done) { -    done = true; -    // Since we're adding elements to 'states' as we run, we can't keep -    // iterators into the set. 
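// An LLVM-free sketch (simplified, assumed names; not part of the patch) of
// the three-element lattice that MeetBDVStates folds over above: Unknown is
// the identity element, Conflict absorbs everything, and two Base states
// agree only when they name the same base value.
enum class SketchStatus { Unknown, Base, Conflict };
struct SketchState {
  SketchStatus Status;
  const void *Base; // stands in for the Value* base
};
static SketchState meetSketch(SketchState A, SketchState B) {
  if (A.Status == SketchStatus::Unknown) return B;
  if (B.Status == SketchStatus::Unknown) return A;
  if (A.Status == SketchStatus::Conflict || B.Status == SketchStatus::Conflict)
    return {SketchStatus::Conflict, nullptr};
  // Both Base: the same base stays Base, differing bases require a new
  // instruction, i.e. Conflict.
  return A.Base == B.Base ? A : SketchState{SketchStatus::Conflict, nullptr};
}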
-    SmallVector<Value *, 16> Keys; -    Keys.reserve(states.size()); -    for (auto Pair : states) { -      Value *V = Pair.first; -      Keys.push_back(V); -    } -    for (Value *v : Keys) { -      assert(!isKnownBaseResult(v) && "why did it get added?"); -      if (PHINode *phi = dyn_cast<PHINode>(v)) { -        assert(phi->getNumIncomingValues() > 0 && -               "zero input phis are illegal"); -        for (Value *InVal : phi->incoming_values()) { -          Value *local = findBaseOrBDV(InVal, cache); -          if (!isKnownBaseResult(local) && states.find(local) == states.end()) { -            states[local] = PhiState(); -            done = false; -          } -        } -      } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { -        Value *local = findBaseOrBDV(sel->getTrueValue(), cache); -        if (!isKnownBaseResult(local) && states.find(local) == states.end()) { -          states[local] = PhiState(); -          done = false; -        } -        local = findBaseOrBDV(sel->getFalseValue(), cache); -        if (!isKnownBaseResult(local) && states.find(local) == states.end()) { -          states[local] = PhiState(); -          done = false; -        } +#ifndef NDEBUG +  auto isExpectedBDVType = [](Value *BDV) { +    return isa<PHINode>(BDV) || isa<SelectInst>(BDV) || +           isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV); +  }; +#endif + +  // Once populated, will contain a mapping from each potentially non-base BDV +  // to a lattice value (described above) which corresponds to that BDV. +  // We use the order of insertion (DFS over the def/use graph) to provide a +  // stable deterministic ordering for visiting DenseMaps (which are unordered) +  // below.  This is important for deterministic compilation. +  MapVector<Value *, BDVState> States; + +  // Recursively fill in all base defining values reachable from the initial +  // one for which we don't already know a definite base value for +  /* scope */ { +    SmallVector<Value*, 16> Worklist; +    Worklist.push_back(def); +    States.insert(std::make_pair(def, BDVState())); +    while (!Worklist.empty()) { +      Value *Current = Worklist.pop_back_val(); +      assert(!isKnownBaseResult(Current) && "why did it get added?"); + +      auto visitIncomingValue = [&](Value *InVal) { +        Value *Base = findBaseOrBDV(InVal, cache); +        if (isKnownBaseResult(Base)) +          // Known bases won't need new instructions introduced and can be +          // ignored safely +          return; +        assert(isExpectedBDVType(Base) && "the only non-base values " +               "we see should be base defining values"); +        if (States.insert(std::make_pair(Base, BDVState())).second) +          Worklist.push_back(Base); +      }; +      if (PHINode *Phi = dyn_cast<PHINode>(Current)) { +        for (Value *InVal : Phi->incoming_values()) +          visitIncomingValue(InVal); +      } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) { +        visitIncomingValue(Sel->getTrueValue()); +        visitIncomingValue(Sel->getFalseValue()); +      } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) { +        visitIncomingValue(EE->getVectorOperand()); +      } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) { +        visitIncomingValue(IE->getOperand(0)); // vector operand +        visitIncomingValue(IE->getOperand(1)); // scalar operand +      } else { +        // There is one known class of instructions we know we don't handle. 
+        assert(isa<ShuffleVectorInst>(Current)); +        llvm_unreachable("unimplemented instruction case");        }      }    } -  if (TraceLSP) { -    errs() << "States after initialization:\n"; -    for (auto Pair : states) { -      Instruction *v = cast<Instruction>(Pair.first); -      PhiState state = Pair.second; -      state.dump(); -      v->dump(); -    } +#ifndef NDEBUG +  DEBUG(dbgs() << "States after initialization:\n"); +  for (auto Pair : States) { +    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");    } +#endif -  // TODO: come back and revisit the state transitions around inputs which -  // have reached conflict state.  The current version seems too conservative. +  // Return a phi state for a base defining value.  We'll generate a new +  // base state for known bases and expect to find a cached state otherwise. +  auto getStateForBDV = [&](Value *baseValue) { +    if (isKnownBaseResult(baseValue)) +      return BDVState(baseValue); +    auto I = States.find(baseValue); +    assert(I != States.end() && "lookup failed!"); +    return I->second; +  };    bool progress = true;    while (progress) {  #ifndef NDEBUG -    size_t oldSize = states.size(); +    const size_t oldSize = States.size();  #endif      progress = false; -    // We're only changing keys in this loop, thus safe to keep iterators -    for (auto Pair : states) { -      MeetPhiStates calculateMeet(states); -      Value *v = Pair.first; -      assert(!isKnownBaseResult(v) && "why did it get added?"); -      if (SelectInst *select = dyn_cast<SelectInst>(v)) { -        calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); -        calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); -      } else -        for (Value *Val : cast<PHINode>(v)->incoming_values()) -          calculateMeet.meetWith(findBaseOrBDV(Val, cache)); - -      PhiState oldState = states[v]; -      PhiState newState = calculateMeet.getResult(); +    // We're only changing values in this loop, thus safe to keep iterators. +    // Since this is computing a fixed point, the order of visit does not +    // effect the result.  TODO: We could use a worklist here and make this run +    // much faster. +    for (auto Pair : States) { +      Value *BDV = Pair.first; +      assert(!isKnownBaseResult(BDV) && "why did it get added?"); + +      // Given an input value for the current instruction, return a BDVState +      // instance which represents the BDV of that value. +      auto getStateForInput = [&](Value *V) mutable { +        Value *BDV = findBaseOrBDV(V, cache); +        return getStateForBDV(BDV); +      }; + +      MeetBDVStates calculateMeet; +      if (SelectInst *select = dyn_cast<SelectInst>(BDV)) { +        calculateMeet.meetWith(getStateForInput(select->getTrueValue())); +        calculateMeet.meetWith(getStateForInput(select->getFalseValue())); +      } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) { +        for (Value *Val : Phi->incoming_values()) +          calculateMeet.meetWith(getStateForInput(Val)); +      } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) { +        // The 'meet' for an extractelement is slightly trivial, but it's still +        // useful in that it drives us to conflict if our input is. +        calculateMeet.meetWith(getStateForInput(EE->getVectorOperand())); +      } else { +        // Given there's a inherent type mismatch between the operands, will +        // *always* produce Conflict. 
+        auto *IE = cast<InsertElementInst>(BDV); +        calculateMeet.meetWith(getStateForInput(IE->getOperand(0))); +        calculateMeet.meetWith(getStateForInput(IE->getOperand(1))); +      } + +      BDVState oldState = States[BDV]; +      BDVState newState = calculateMeet.getResult();        if (oldState != newState) {          progress = true; -        states[v] = newState; +        States[BDV] = newState;        }      } -    assert(oldSize <= states.size()); -    assert(oldSize == states.size() || progress); +    assert(oldSize == States.size() && +           "fixed point shouldn't be adding any new nodes to state");    } -  if (TraceLSP) { -    errs() << "States after meet iteration:\n"; -    for (auto Pair : states) { -      Instruction *v = cast<Instruction>(Pair.first); -      PhiState state = Pair.second; -      state.dump(); -      v->dump(); -    } +#ifndef NDEBUG +  DEBUG(dbgs() << "States after meet iteration:\n"); +  for (auto Pair : States) { +    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");    } - +#endif +      // Insert Phis for all conflicts -  // We want to keep naming deterministic in the loop that follows, so -  // sort the keys before iteration.  This is useful in allowing us to -  // write stable tests. Note that there is no invalidation issue here. -  SmallVector<Value *, 16> Keys; -  Keys.reserve(states.size()); -  for (auto Pair : states) { -    Value *V = Pair.first; -    Keys.push_back(V); -  } -  std::sort(Keys.begin(), Keys.end(), order_by_name);    // TODO: adjust naming patterns to avoid this order of iteration dependency -  for (Value *V : Keys) { -    Instruction *v = cast<Instruction>(V); -    PhiState state = states[V]; -    assert(!isKnownBaseResult(v) && "why did it get added?"); -    assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); -    if (!state.isConflict()) +  for (auto Pair : States) { +    Instruction *I = cast<Instruction>(Pair.first); +    BDVState State = Pair.second; +    assert(!isKnownBaseResult(I) && "why did it get added?"); +    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + +    // extractelement instructions are a bit special in that we may need to +    // insert an extract even when we know an exact base for the instruction. +    // The problem is that we need to convert from a vector base to a scalar +    // base for the particular indice we're interested in. +    if (State.isBase() && isa<ExtractElementInst>(I) && +        isa<VectorType>(State.getBase()->getType())) { +      auto *EE = cast<ExtractElementInst>(I); +      // TODO: In many cases, the new instruction is just EE itself.  We should +      // exploit this, but can't do it here since it would break the invariant +      // about the BDV not being known to be a base. +      auto *BaseInst = ExtractElementInst::Create(State.getBase(), +                                                  EE->getIndexOperand(), +                                                  "base_ee", EE); +      BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); +      States[I] = BDVState(BDVState::Base, BaseInst); +    } + +    // Since we're joining a vector and scalar base, they can never be the +    // same.  As a result, we should always see insert element having reached +    // the conflict state. 
+    if (isa<InsertElementInst>(I)) { +      assert(State.isConflict()); +    } +     +    if (!State.isConflict())        continue; -    if (isa<PHINode>(v)) { -      int num_preds = -          std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); -      assert(num_preds > 0 && "how did we reach here"); -      PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); -      // Add metadata marking this as a base value -      auto *const_1 = ConstantInt::get( -          Type::getInt32Ty( -              v->getParent()->getParent()->getParent()->getContext()), -          1); -      auto MDConst = ConstantAsMetadata::get(const_1); -      MDNode *md = MDNode::get( -          v->getParent()->getParent()->getParent()->getContext(), MDConst); -      phi->setMetadata("is_base_value", md); -      states[v] = PhiState(PhiState::Conflict, phi); +    /// Create and insert a new instruction which will represent the base of +    /// the given instruction 'I'. +    auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* { +      if (isa<PHINode>(I)) { +        BasicBlock *BB = I->getParent(); +        int NumPreds = std::distance(pred_begin(BB), pred_end(BB)); +        assert(NumPreds > 0 && "how did we reach here"); +        std::string Name = suffixed_name_or(I, ".base", "base_phi"); +        return PHINode::Create(I->getType(), NumPreds, Name, I); +      } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) { +        // The undef will be replaced later +        UndefValue *Undef = UndefValue::get(Sel->getType()); +        std::string Name = suffixed_name_or(I, ".base", "base_select"); +        return SelectInst::Create(Sel->getCondition(), Undef, +                                  Undef, Name, Sel); +      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { +        UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType()); +        std::string Name = suffixed_name_or(I, ".base", "base_ee"); +        return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name, +                                          EE); +      } else { +        auto *IE = cast<InsertElementInst>(I); +        UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType()); +        UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType()); +        std::string Name = suffixed_name_or(I, ".base", "base_ie"); +        return InsertElementInst::Create(VecUndef, ScalarUndef, +                                         IE->getOperand(2), Name, IE); +      } + +    }; +    Instruction *BaseInst = MakeBaseInstPlaceholder(I); +    // Add metadata marking this as a base value +    BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); +    States[I] = BDVState(BDVState::Conflict, BaseInst); +  } + +  // Returns a instruction which produces the base pointer for a given +  // instruction.  The instruction is assumed to be an input to one of the BDVs +  // seen in the inference algorithm above.  As such, we must either already +  // know it's base defining value is a base, or have inserted a new +  // instruction to propagate the base of it's BDV and have entered that newly +  // introduced instruction into the state table.  In either case, we are +  // assured to be able to determine an instruction which produces it's base +  // pointer.  
+  auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { +    Value *BDV = findBaseOrBDV(Input, cache); +    Value *Base = nullptr; +    if (isKnownBaseResult(BDV)) { +      Base = BDV;      } else { -      SelectInst *sel = cast<SelectInst>(v); -      // The undef will be replaced later -      UndefValue *undef = UndefValue::get(sel->getType()); -      SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, -                                               undef, "base_select", sel); -      // Add metadata marking this as a base value -      auto *const_1 = ConstantInt::get( -          Type::getInt32Ty( -              v->getParent()->getParent()->getParent()->getContext()), -          1); -      auto MDConst = ConstantAsMetadata::get(const_1); -      MDNode *md = MDNode::get( -          v->getParent()->getParent()->getParent()->getContext(), MDConst); -      basesel->setMetadata("is_base_value", md); -      states[v] = PhiState(PhiState::Conflict, basesel); +      // Either conflict or base. +      assert(States.count(BDV)); +      Base = States[BDV].getBase();      } -  } +    assert(Base && "can't be null"); +    // The cast is needed since base traversal may strip away bitcasts +    if (Base->getType() != Input->getType() && +        InsertPt) { +      Base = new BitCastInst(Base, Input->getType(), "cast", +                             InsertPt); +    } +    return Base; +  }; -  // Fixup all the inputs of the new PHIs -  for (auto Pair : states) { -    Instruction *v = cast<Instruction>(Pair.first); -    PhiState state = Pair.second; +  // Fixup all the inputs of the new PHIs.  Visit order needs to be +  // deterministic and predictable because we're naming newly created +  // instructions. +  for (auto Pair : States) { +    Instruction *BDV = cast<Instruction>(Pair.first); +    BDVState State = Pair.second; -    assert(!isKnownBaseResult(v) && "why did it get added?"); -    assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); -    if (!state.isConflict()) +    assert(!isKnownBaseResult(BDV) && "why did it get added?"); +    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); +    if (!State.isConflict())        continue; -    if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { -      PHINode *phi = cast<PHINode>(v); +    if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { +      PHINode *phi = cast<PHINode>(BDV);        unsigned NumPHIValues = phi->getNumIncomingValues();        for (unsigned i = 0; i < NumPHIValues; i++) {          Value *InVal = phi->getIncomingValue(i); @@ -906,104 +1002,145 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {          if (blockIndex != -1) {            Value *oldBase = basephi->getIncomingValue(blockIndex);            basephi->addIncoming(oldBase, InBB); +            #ifndef NDEBUG -          Value *base = findBaseOrBDV(InVal, cache); -          if (!isKnownBaseResult(base)) { -            // Either conflict or base. -            assert(states.count(base)); -            base = states[base].getBase(); -            assert(base != nullptr && "unknown PhiState!"); -          } - -          // In essense this assert states: the only way two +          Value *Base = getBaseForInput(InVal, nullptr); +          // In essence this assert states: the only way two            // values incoming from the same basic block may be            // different is by being different bitcasts of the same            // value.  
A cleanup that remains TODO is changing            // findBaseOrBDV to return an llvm::Value of the correct            // type (and still remain pure).  This will remove the            // need to add bitcasts. -          assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && +          assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() &&                   "sanity -- findBaseOrBDV should be pure!");  #endif            continue;          } -        // Find either the defining value for the PHI or the normal base for -        // a non-phi node -        Value *base = findBaseOrBDV(InVal, cache); -        if (!isKnownBaseResult(base)) { -          // Either conflict or base. -          assert(states.count(base)); -          base = states[base].getBase(); -          assert(base != nullptr && "unknown PhiState!"); -        } -        assert(base && "can't be null"); -        // Must use original input BB since base may not be Instruction -        // The cast is needed since base traversal may strip away bitcasts -        if (base->getType() != basephi->getType()) { -          base = new BitCastInst(base, basephi->getType(), "cast", -                                 InBB->getTerminator()); -        } -        basephi->addIncoming(base, InBB); +        // Find the instruction which produces the base for each input.  We may +        // need to insert a bitcast in the incoming block. +        // TODO: Need to split critical edges if insertion is needed +        Value *Base = getBaseForInput(InVal, InBB->getTerminator()); +        basephi->addIncoming(Base, InBB);        }        assert(basephi->getNumIncomingValues() == NumPHIValues); -    } else { -      SelectInst *basesel = cast<SelectInst>(state.getBase()); -      SelectInst *sel = cast<SelectInst>(v); +    } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) { +      SelectInst *Sel = cast<SelectInst>(BDV);        // Operand 1 & 2 are true, false path respectively. TODO: refactor to        // something more safe and less hacky.        for (int i = 1; i <= 2; i++) { -        Value *InVal = sel->getOperand(i); -        // Find either the defining value for the PHI or the normal base for -        // a non-phi node -        Value *base = findBaseOrBDV(InVal, cache); -        if (!isKnownBaseResult(base)) { -          // Either conflict or base. -          assert(states.count(base)); -          base = states[base].getBase(); -          assert(base != nullptr && "unknown PhiState!"); -        } -        assert(base && "can't be null"); -        // Must use original input BB since base may not be Instruction -        // The cast is needed since base traversal may strip away bitcasts -        if (base->getType() != basesel->getType()) { -          base = new BitCastInst(base, basesel->getType(), "cast", basesel); -        } -        basesel->setOperand(i, base); +        Value *InVal = Sel->getOperand(i); +        // Find the instruction which produces the base for each input.  We may +        // need to insert a bitcast. +        Value *Base = getBaseForInput(InVal, BaseSel); +        BaseSel->setOperand(i, Base);        } +    } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) { +      Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand(); +      // Find the instruction which produces the base for each input.  We may +      // need to insert a bitcast. 
+      Value *Base = getBaseForInput(InVal, BaseEE); +      BaseEE->setOperand(0, Base); +    } else { +      auto *BaseIE = cast<InsertElementInst>(State.getBase()); +      auto *BdvIE = cast<InsertElementInst>(BDV); +      auto UpdateOperand = [&](int OperandIdx) { +        Value *InVal = BdvIE->getOperand(OperandIdx); +        Value *Base = getBaseForInput(InVal, BaseIE); +        BaseIE->setOperand(OperandIdx, Base); +      }; +      UpdateOperand(0); // vector operand +      UpdateOperand(1); // scalar operand +    } + +  } + +  // Now that we're done with the algorithm, see if we can optimize the  +  // results slightly by reducing the number of new instructions needed.  +  // Arguably, this should be integrated into the algorithm above, but  +  // doing as a post process step is easier to reason about for the moment. +  DenseMap<Value *, Value *> ReverseMap; +  SmallPtrSet<Instruction *, 16> NewInsts; +  SmallSetVector<AssertingVH<Instruction>, 16> Worklist; +  // Note: We need to visit the states in a deterministic order.  We uses the +  // Keys we sorted above for this purpose.  Note that we are papering over a +  // bigger problem with the algorithm above - it's visit order is not +  // deterministic.  A larger change is needed to fix this. +  for (auto Pair : States) { +    auto *BDV = Pair.first; +    auto State = Pair.second; +    Value *Base = State.getBase(); +    assert(BDV && Base); +    assert(!isKnownBaseResult(BDV) && "why did it get added?"); +    assert(isKnownBaseResult(Base) && +           "must be something we 'know' is a base pointer"); +    if (!State.isConflict()) +      continue; + +    ReverseMap[Base] = BDV; +    if (auto *BaseI = dyn_cast<Instruction>(Base)) { +      NewInsts.insert(BaseI); +      Worklist.insert(BaseI); +    } +  } +  auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, +                                 Value *Replacement) { +    // Add users which are new instructions (excluding self references) +    for (User *U : BaseI->users()) +      if (auto *UI = dyn_cast<Instruction>(U)) +        if (NewInsts.count(UI) && UI != BaseI) +          Worklist.insert(UI); +    // Then do the actual replacement +    NewInsts.erase(BaseI); +    ReverseMap.erase(BaseI); +    BaseI->replaceAllUsesWith(Replacement); +    assert(States.count(BDV)); +    assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); +    States[BDV] = BDVState(BDVState::Conflict, Replacement); +    BaseI->eraseFromParent(); +  }; +  const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout(); +  while (!Worklist.empty()) { +    Instruction *BaseI = Worklist.pop_back_val(); +    assert(NewInsts.count(BaseI)); +    Value *Bdv = ReverseMap[BaseI]; +    if (auto *BdvI = dyn_cast<Instruction>(Bdv)) +      if (BaseI->isIdenticalTo(BdvI)) { +        DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); +        ReplaceBaseInstWith(Bdv, BaseI, Bdv); +        continue; +      } +    if (Value *V = SimplifyInstruction(BaseI, DL)) { +      DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); +      ReplaceBaseInstWith(Bdv, BaseI, V); +      continue;      }    }    // Cache all of our results so we can cheaply reuse them    // NOTE: This is actually two caches: one of the base defining value    // relation and one of the base pointer relation!  
FIXME -  for (auto item : states) { -    Value *v = item.first; -    Value *base = item.second.getBase(); -    assert(v && base); -    assert(!isKnownBaseResult(v) && "why did it get added?"); - -    if (TraceLSP) { -      std::string fromstr = -          cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") -                         : "none"; -      errs() << "Updating base value cache" -             << " for: " << (v->hasName() ? v->getName() : "") -             << " from: " << fromstr -             << " to: " << (base->hasName() ? base->getName() : "") << "\n"; -    } - -    assert(isKnownBaseResult(base) && -           "must be something we 'know' is a base pointer"); -    if (cache.count(v)) { +  for (auto Pair : States) { +    auto *BDV = Pair.first; +    Value *base = Pair.second.getBase(); +    assert(BDV && base); + +    std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none"; +    DEBUG(dbgs() << "Updating base value cache" +          << " for: " << BDV->getName() +          << " from: " << fromstr +          << " to: " << base->getName() << "\n"); + +    if (cache.count(BDV)) {        // Once we transition from the BDV relation being store in the cache to        // the base relation being stored, it must be stable -      assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && +      assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) &&               "base relation should be stable");      } -    cache[v] = base; +    cache[BDV] = base;    } -  assert(cache.find(def) != cache.end()); +  assert(cache.count(def));    return cache[def];  } @@ -1024,7 +1161,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {  // pointer was a base pointer.  static void  findBasePointers(const StatepointLiveSetTy &live, -                 DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, +                 DenseMap<Value *, Value *> &PointerToBase,                   DominatorTree *DT, DefiningValueMapTy &DVCache) {    // For the naming of values inserted to be deterministic - which makes for    // much cleaner and more stable tests - we need to assign an order to the @@ -1043,7 +1180,7 @@ findBasePointers(const StatepointLiveSetTy &live,      // If you see this trip and like to live really dangerously, the code should      // be correct, just with idioms the verifier can't handle.  You can try -    // disabling the verifier at your own substaintial risk. +    // disabling the verifier at your own substantial risk.      
assert(!isa<ConstantPointerNull>(base) &&             "the relocation code needs adjustment to handle the relocation of "             "a null pointer constant without causing false positives in the " @@ -1056,8 +1193,8 @@ findBasePointers(const StatepointLiveSetTy &live,  static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,                               const CallSite &CS,                               PartiallyConstructedSafepointRecord &result) { -  DenseMap<llvm::Value *, llvm::Value *> PointerToBase; -  findBasePointers(result.liveset, PointerToBase, &DT, DVCache); +  DenseMap<Value *, Value *> PointerToBase; +  findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);    if (PrintBasePointers) {      // Note: Need to print these in a stable order since this is checked in @@ -1071,8 +1208,11 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,      std::sort(Temp.begin(), Temp.end(), order_by_name);      for (Value *Ptr : Temp) {        Value *Base = PointerToBase[Ptr]; -      errs() << " derived %" << Ptr->getName() << " base %" << Base->getName() -             << "\n"; +      errs() << " derived "; +      Ptr->printAsOperand(errs(), false); +      errs() << " base "; +      Base->printAsOperand(errs(), false); +      errs() << "\n";;      }    } @@ -1086,10 +1226,10 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,                                    PartiallyConstructedSafepointRecord &result);  static void recomputeLiveInValues( -    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, +    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,      MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {    // TODO-PERF: reuse the original liveness, then simply run the dataflow -  // again.  The old values are still live and will help it stablize quickly. +  // again.  The old values are still live and will help it stabilize quickly.    GCPtrLivenessData RevisedLivenessData;    computeLiveInValues(DT, F, RevisedLivenessData);    for (size_t i = 0; i < records.size(); i++) { @@ -1099,69 +1239,66 @@ static void recomputeLiveInValues(    }  } -// When inserting gc.relocate calls, we need to ensure there are no uses -// of the original value between the gc.statepoint and the gc.relocate call. -// One case which can arise is a phi node starting one of the successor blocks. -// We also need to be able to insert the gc.relocates only on the path which -// goes through the statepoint.  We might need to split an edge to make this -// possible. +// When inserting gc.relocate and gc.result calls, we need to ensure there are +// no uses of the original value / return value between the gc.statepoint and +// the gc.relocate / gc.result call.  One case which can arise is a phi node +// starting one of the successor blocks.  We also need to be able to insert the +// gc.relocates only on the path which goes through the statepoint.  We might +// need to split an edge to make this possible.  
static BasicBlock *  normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,                              DominatorTree &DT) {    BasicBlock *Ret = BB; -  if (!BB->getUniquePredecessor()) { -    Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT); -  } +  if (!BB->getUniquePredecessor()) +    Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT); -  // Now that 'ret' has unique predecessor we can safely remove all phi nodes +  // Now that 'Ret' has unique predecessor we can safely remove all phi nodes    // from it    FoldSingleEntryPHINodes(Ret); -  assert(!isa<PHINode>(Ret->begin())); +  assert(!isa<PHINode>(Ret->begin()) && +         "All PHI nodes should have been removed!"); -  // At this point, we can safely insert a gc.relocate as the first instruction -  // in Ret if needed. +  // At this point, we can safely insert a gc.relocate or gc.result as the first +  // instruction in Ret if needed.    return Ret;  } -static int find_index(ArrayRef<Value *> livevec, Value *val) { -  auto itr = std::find(livevec.begin(), livevec.end(), val); -  assert(livevec.end() != itr); -  size_t index = std::distance(livevec.begin(), itr); -  assert(index < livevec.size()); -  return index; -} - -// Create new attribute set containing only attributes which can be transfered +// Create new attribute set containing only attributes which can be transferred  // from original call to the safepoint.  static AttributeSet legalizeCallAttributes(AttributeSet AS) { -  AttributeSet ret; +  AttributeSet Ret;    for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { -    unsigned index = AS.getSlotIndex(Slot); +    unsigned Index = AS.getSlotIndex(Slot); -    if (index == AttributeSet::ReturnIndex || -        index == AttributeSet::FunctionIndex) { +    if (Index == AttributeSet::ReturnIndex || +        Index == AttributeSet::FunctionIndex) { -      for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end; -           ++it) { -        Attribute attr = *it; +      for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {          // Do not allow certain attributes - just skip them          // Safepoint can not be read only or read none. -        if (attr.hasAttribute(Attribute::ReadNone) || -            attr.hasAttribute(Attribute::ReadOnly)) +        if (Attr.hasAttribute(Attribute::ReadNone) || +            Attr.hasAttribute(Attribute::ReadOnly)) +          continue; + +        // These attributes control the generation of the gc.statepoint call / +        // invoke itself; and once the gc.statepoint is in place, they're of no +        // use. +        if (Attr.hasAttribute("statepoint-num-patch-bytes") || +            Attr.hasAttribute("statepoint-id"))            continue; -        ret = ret.addAttributes( -            AS.getContext(), index, -            AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); +        Ret = Ret.addAttributes( +            AS.getContext(), Index, +            AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));        }      }      // Just skip parameter attributes for now    } -  return ret; +  return Ret;  }  /// Helper function to place all gc relocates necessary for the given @@ -1173,225 +1310,290 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {  ///   statepointToken - statepoint instruction to which relocates should be  ///   bound.  ///   Builder - Llvm IR builder to be used to construct new calls. 
-static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables, +static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,                                const int LiveStart, -                              ArrayRef<llvm::Value *> BasePtrs, +                              ArrayRef<Value *> BasePtrs,                                Instruction *StatepointToken,                                IRBuilder<> Builder) { -  SmallVector<Instruction *, 64> NewDefs; -  NewDefs.reserve(LiveVariables.size()); +  if (LiveVariables.empty()) +    return; -  Module *M = StatepointToken->getParent()->getParent()->getParent(); +  auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { +    auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); +    assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); +    size_t Index = std::distance(LiveVec.begin(), ValIt); +    assert(Index < LiveVec.size() && "Bug in std::find?"); +    return Index; +  }; -  for (unsigned i = 0; i < LiveVariables.size(); i++) { -    // We generate a (potentially) unique declaration for every pointer type -    // combination.  This results is some blow up the function declarations in -    // the IR, but removes the need for argument bitcasts which shrinks the IR -    // greatly and makes it much more readable. -    SmallVector<Type *, 1> Types;                 // one per 'any' type -    // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid -    // cases where the actual value's type mangling is not supported by llvm. A -    // bitcast is added later to convert gc_relocate to the actual value's type. -    Types.push_back(Type::getInt8PtrTy(M->getContext(), 1)); -    Value *GCRelocateDecl = Intrinsic::getDeclaration( -        M, Intrinsic::experimental_gc_relocate, Types); +  // All gc_relocate are set to i8 addrspace(1)* type. We originally generated +  // unique declarations for each pointer type, but this proved problematic +  // because the intrinsic mangling code is incomplete and fragile.  Since +  // we're moving towards a single unified pointer type anyways, we can just +  // cast everything to an i8* of the right address space.  A bitcast is added +  // later to convert gc_relocate to the actual value's type.  +  Module *M = StatepointToken->getModule(); +  auto AS = cast<PointerType>(LiveVariables[0]->getType())->getAddressSpace(); +  Type *Types[] = {Type::getInt8PtrTy(M->getContext(), AS)}; +  Value *GCRelocateDecl = +    Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); +  for (unsigned i = 0; i < LiveVariables.size(); i++) {      // Generate the gc.relocate call and save the result      Value *BaseIdx = -        ConstantInt::get(Type::getInt32Ty(M->getContext()), -                         LiveStart + find_index(LiveVariables, BasePtrs[i])); -    Value *LiveIdx = ConstantInt::get( -        Type::getInt32Ty(M->getContext()), -        LiveStart + find_index(LiveVariables, LiveVariables[i])); +      Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i])); +    Value *LiveIdx = Builder.getInt32(LiveStart + i);      // only specify a debug name if we can give a useful one -    Value *Reloc = Builder.CreateCall( +    CallInst *Reloc = Builder.CreateCall(          GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx}, -        LiveVariables[i]->hasName() ? 
LiveVariables[i]->getName() + ".relocated" -                                    : ""); +        suffixed_name_or(LiveVariables[i], ".relocated", ""));      // Trick CodeGen into thinking there are lots of free registers at this      // fake call. -    cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold); +    Reloc->setCallingConv(CallingConv::Cold); +  } +} -    NewDefs.push_back(cast<Instruction>(Reloc)); +namespace { + +/// This struct is used to defer RAUWs and `eraseFromParent` s.  Using this +/// avoids having to worry about keeping around dangling pointers to Values. +class DeferredReplacement { +  AssertingVH<Instruction> Old; +  AssertingVH<Instruction> New; + +public: +  explicit DeferredReplacement(Instruction *Old, Instruction *New) : +    Old(Old), New(New) { +    assert(Old != New && "Not allowed!");    } -  assert(NewDefs.size() == LiveVariables.size() && -         "missing or extra redefinition at safepoint"); + +  /// Does the task represented by this instance. +  void doReplacement() { +    Instruction *OldI = Old; +    Instruction *NewI = New; + +    assert(OldI != NewI && "Disallowed at construction?!"); + +    Old = nullptr; +    New = nullptr; + +    if (NewI) +      OldI->replaceAllUsesWith(NewI); +    OldI->eraseFromParent(); +  } +};  }  static void -makeStatepointExplicitImpl(const CallSite &CS, /* to replace */ -                           const SmallVectorImpl<llvm::Value *> &basePtrs, -                           const SmallVectorImpl<llvm::Value *> &liveVariables, -                           Pass *P, -                           PartiallyConstructedSafepointRecord &result) { -  assert(basePtrs.size() == liveVariables.size()); -  assert(isStatepoint(CS) && +makeStatepointExplicitImpl(const CallSite CS, /* to replace */ +                           const SmallVectorImpl<Value *> &BasePtrs, +                           const SmallVectorImpl<Value *> &LiveVariables, +                           PartiallyConstructedSafepointRecord &Result, +                           std::vector<DeferredReplacement> &Replacements) { +  assert(BasePtrs.size() == LiveVariables.size()); +  assert((UseDeoptBundles || isStatepoint(CS)) &&           "This method expects to be rewriting a statepoint"); -  BasicBlock *BB = CS.getInstruction()->getParent(); -  assert(BB); -  Function *F = BB->getParent(); -  assert(F && "must be set"); -  Module *M = F->getParent(); -  (void)M; -  assert(M && "must be set"); - -  // We're not changing the function signature of the statepoint since the gc -  // arguments go into the var args section. -  Function *gc_statepoint_decl = CS.getCalledFunction(); -    // Then go ahead and use the builder do actually do the inserts.  We insert    // immediately before the previous instruction under the assumption that all    // arguments will be available here.  We can't insert afterwards since we may    // be replacing a terminator. 
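The DeferredReplacement helper added above exists purely to postpone the replace-all-uses-with plus eraseFromParent until no other data structure still holds a raw pointer to the old instruction. A standalone sketch of the same defer-then-apply idiom; Node and its hand-rolled use list below are simplified stand-ins, not LLVM classes:

#include <cassert>
#include <string>
#include <vector>

// Toy "value": a name plus every pointer slot that currently refers to it.
struct Node {
  explicit Node(std::string N) : Name(std::move(N)) {}
  std::string Name;
  std::vector<Node **> Uses;

  void replaceAllUsesWith(Node *New) {
    for (Node **Slot : Uses) {
      *Slot = New;
      if (New)
        New->Uses.push_back(Slot);
    }
    Uses.clear();
  }
};

// Record an intended "RAUW then delete" now, perform it later.  While the
// replacement is pending, other bookkeeping may keep raw pointers to Old
// without any risk of them dangling.
class DeferredReplacement {
  Node *Old;
  Node *New;

public:
  DeferredReplacement(Node *Old, Node *New) : Old(Old), New(New) {
    assert(Old != New && "Not allowed!");
  }
  void doReplacement() {
    if (New)
      Old->replaceAllUsesWith(New);
    delete Old;
    Old = New = nullptr;
  }
};

int main() {
  Node *OldCall = new Node("old.call");
  Node *GCResult = new Node("gc.result");
  Node *Operand = OldCall;            // some user still refers to OldCall
  OldCall->Uses.push_back(&Operand);

  std::vector<DeferredReplacement> Pending;
  Pending.emplace_back(OldCall, GCResult); // decide now ...
  for (auto &R : Pending)                  // ... apply once nobody holds raw
    R.doReplacement();                     //     pointers to OldCall anymore

  assert(Operand == GCResult);
  delete GCResult;
}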
-  Instruction *insertBefore = CS.getInstruction(); -  IRBuilder<> Builder(insertBefore); -  // Copy all of the arguments from the original statepoint - this includes the -  // target, call args, and deopt args -  SmallVector<llvm::Value *, 64> args; -  args.insert(args.end(), CS.arg_begin(), CS.arg_end()); -  // TODO: Clear the 'needs rewrite' flag - -  // add all the pointers to be relocated (gc arguments) -  // Capture the start of the live variable list for use in the gc_relocates -  const int live_start = args.size(); -  args.insert(args.end(), liveVariables.begin(), liveVariables.end()); +  Instruction *InsertBefore = CS.getInstruction(); +  IRBuilder<> Builder(InsertBefore); + +  ArrayRef<Value *> GCArgs(LiveVariables); +  uint64_t StatepointID = 0xABCDEF00; +  uint32_t NumPatchBytes = 0; +  uint32_t Flags = uint32_t(StatepointFlags::None); + +  ArrayRef<Use> CallArgs; +  ArrayRef<Use> DeoptArgs; +  ArrayRef<Use> TransitionArgs; + +  Value *CallTarget = nullptr; + +  if (UseDeoptBundles) { +    CallArgs = {CS.arg_begin(), CS.arg_end()}; +    DeoptArgs = GetDeoptBundleOperands(CS); +    // TODO: we don't fill in TransitionArgs or Flags in this branch, but we +    // could have an operand bundle for that too. +    AttributeSet OriginalAttrs = CS.getAttributes(); + +    Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, +                                                  "statepoint-id"); +    if (AttrID.isStringAttribute()) +      AttrID.getValueAsString().getAsInteger(10, StatepointID); + +    Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( +        AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); +    if (AttrNumPatchBytes.isStringAttribute()) +      AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); + +    CallTarget = CS.getCalledValue(); +  } else { +    // This branch will be gone soon, and we will soon only support the +    // UseDeoptBundles == true configuration. +    Statepoint OldSP(CS); +    StatepointID = OldSP.getID(); +    NumPatchBytes = OldSP.getNumPatchBytes(); +    Flags = OldSP.getFlags(); + +    CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; +    DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; +    TransitionArgs = {OldSP.gc_transition_args_begin(), +                      OldSP.gc_transition_args_end()}; +    CallTarget = OldSP.getCalledValue(); +  }    // Create the statepoint given all the arguments -  Instruction *token = nullptr; -  AttributeSet return_attributes; +  Instruction *Token = nullptr; +  AttributeSet ReturnAttrs;    if (CS.isCall()) { -    CallInst *toReplace = cast<CallInst>(CS.getInstruction()); -    CallInst *call = -        Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token"); -    call->setTailCall(toReplace->isTailCall()); -    call->setCallingConv(toReplace->getCallingConv()); +    CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); +    CallInst *Call = Builder.CreateGCStatepointCall( +        StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, +        TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); + +    Call->setTailCall(ToReplace->isTailCall()); +    Call->setCallingConv(ToReplace->getCallingConv());      // Currently we will fail on parameter attributes and on certain      // function attributes. 
-    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); -    // In case if we can handle this set of sttributes - set up function attrs +    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); +    // In case if we can handle this set of attributes - set up function attrs      // directly on statepoint and return attrs later for gc_result intrinsic. -    call->setAttributes(new_attrs.getFnAttributes()); -    return_attributes = new_attrs.getRetAttributes(); +    Call->setAttributes(NewAttrs.getFnAttributes()); +    ReturnAttrs = NewAttrs.getRetAttributes(); -    token = call; +    Token = Call;      // Put the following gc_result and gc_relocate calls immediately after the      // the old call (which we're about to delete) -    BasicBlock::iterator next(toReplace); -    assert(BB->end() != next && "not a terminator, must have next"); -    next++; -    Instruction *IP = &*(next); -    Builder.SetInsertPoint(IP); -    Builder.SetCurrentDebugLocation(IP->getDebugLoc()); - +    assert(ToReplace->getNextNode() && "Not a terminator, must have next!"); +    Builder.SetInsertPoint(ToReplace->getNextNode()); +    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());    } else { -    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); +    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());      // Insert the new invoke into the old block.  We'll remove the old one in a      // moment at which point this will become the new terminator for the      // original block. -    InvokeInst *invoke = InvokeInst::Create( -        gc_statepoint_decl, toReplace->getNormalDest(), -        toReplace->getUnwindDest(), args, "", toReplace->getParent()); -    invoke->setCallingConv(toReplace->getCallingConv()); +    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( +        StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(), +        ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, +        GCArgs, "statepoint_token"); + +    Invoke->setCallingConv(ToReplace->getCallingConv());      // Currently we will fail on parameter attributes and on certain      // function attributes. -    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); -    // In case if we can handle this set of sttributes - set up function attrs +    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); +    // In case if we can handle this set of attributes - set up function attrs      // directly on statepoint and return attrs later for gc_result intrinsic. 
-    invoke->setAttributes(new_attrs.getFnAttributes()); -    return_attributes = new_attrs.getRetAttributes(); +    Invoke->setAttributes(NewAttrs.getFnAttributes()); +    ReturnAttrs = NewAttrs.getRetAttributes(); -    token = invoke; +    Token = Invoke;      // Generate gc relocates in exceptional path -    BasicBlock *unwindBlock = toReplace->getUnwindDest(); -    assert(!isa<PHINode>(unwindBlock->begin()) && -           unwindBlock->getUniquePredecessor() && +    BasicBlock *UnwindBlock = ToReplace->getUnwindDest(); +    assert(!isa<PHINode>(UnwindBlock->begin()) && +           UnwindBlock->getUniquePredecessor() &&             "can't safely insert in this block!"); -    Instruction *IP = &*(unwindBlock->getFirstInsertionPt()); -    Builder.SetInsertPoint(IP); -    Builder.SetCurrentDebugLocation(toReplace->getDebugLoc()); +    Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt()); +    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); -    // Extract second element from landingpad return value. We will attach -    // exceptional gc relocates to it. -    const unsigned idx = 1; -    Instruction *exceptional_token = -        cast<Instruction>(Builder.CreateExtractValue( -            unwindBlock->getLandingPadInst(), idx, "relocate_token")); -    result.UnwindToken = exceptional_token; +    // Attach exceptional gc relocates to the landingpad. +    Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst(); +    Result.UnwindToken = ExceptionalToken; -    // Just throw away return value. We will use the one we got for normal -    // block. -    (void)CreateGCRelocates(liveVariables, live_start, basePtrs, -                            exceptional_token, Builder); +    const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); +    CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, +                      Builder);      // Generate gc relocates and returns for normal block -    BasicBlock *normalDest = toReplace->getNormalDest(); -    assert(!isa<PHINode>(normalDest->begin()) && -           normalDest->getUniquePredecessor() && +    BasicBlock *NormalDest = ToReplace->getNormalDest(); +    assert(!isa<PHINode>(NormalDest->begin()) && +           NormalDest->getUniquePredecessor() &&             "can't safely insert in this block!"); -    IP = &*(normalDest->getFirstInsertionPt()); -    Builder.SetInsertPoint(IP); +    Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());      // gc relocates will be generated later as if it were regular call      // statepoint    } -  assert(token); - -  // Take the name of the original value call if it had one. -  token->takeName(CS.getInstruction()); +  assert(Token && "Should be set in one of the above branches!"); + +  if (UseDeoptBundles) { +    Token->setName("statepoint_token"); +    if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { +      StringRef Name = +          CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; +      CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); +      GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + +      // We cannot RAUW or delete CS.getInstruction() because it could be in the +      // live set of some other safepoint, in which case that safepoint's +      // PartiallyConstructedSafepointRecord will hold a raw pointer to this +      // llvm::Instruction.  
Instead, we defer the replacement and deletion to +      // after the live sets have been made explicit in the IR, and we no longer +      // have raw pointers to worry about. +      Replacements.emplace_back(CS.getInstruction(), GCResult); +    } else { +      Replacements.emplace_back(CS.getInstruction(), nullptr); +    } +  } else { +    assert(!CS.getInstruction()->hasNUsesOrMore(2) && +           "only valid use before rewrite is gc.result"); +    assert(!CS.getInstruction()->hasOneUse() || +           isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); -// The GCResult is already inserted, we just need to find it -#ifndef NDEBUG -  Instruction *toReplace = CS.getInstruction(); -  assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && -         "only valid use before rewrite is gc.result"); -  assert(!toReplace->hasOneUse() || -         isGCResult(cast<Instruction>(*toReplace->user_begin()))); -#endif +    // Take the name of the original statepoint token if there was one. +    Token->takeName(CS.getInstruction()); -  // Update the gc.result of the original statepoint (if any) to use the newly -  // inserted statepoint.  This is safe to do here since the token can't be -  // considered a live reference. -  CS.getInstruction()->replaceAllUsesWith(token); +    // Update the gc.result of the original statepoint (if any) to use the newly +    // inserted statepoint.  This is safe to do here since the token can't be +    // considered a live reference. +    CS.getInstruction()->replaceAllUsesWith(Token); +    CS.getInstruction()->eraseFromParent(); +  } -  result.StatepointToken = token; +  Result.StatepointToken = Token;    // Second, create a gc.relocate for every live variable -  CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); +  const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); +  CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);  }  namespace { -struct name_ordering { -  Value *base; -  Value *derived; -  bool operator()(name_ordering const &a, name_ordering const &b) { -    return -1 == a.derived->getName().compare(b.derived->getName()); +struct NameOrdering { +  Value *Base; +  Value *Derived; + +  bool operator()(NameOrdering const &a, NameOrdering const &b) { +    return -1 == a.Derived->getName().compare(b.Derived->getName());    }  };  } -static void stablize_order(SmallVectorImpl<Value *> &basevec, -                           SmallVectorImpl<Value *> &livevec) { -  assert(basevec.size() == livevec.size()); - -  SmallVector<name_ordering, 64> temp; -  for (size_t i = 0; i < basevec.size(); i++) { -    name_ordering v; -    v.base = basevec[i]; -    v.derived = livevec[i]; -    temp.push_back(v); -  } -  std::sort(temp.begin(), temp.end(), name_ordering()); -  for (size_t i = 0; i < basevec.size(); i++) { -    basevec[i] = temp[i].base; -    livevec[i] = temp[i].derived; + +static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, +                           SmallVectorImpl<Value *> &LiveVec) { +  assert(BaseVec.size() == LiveVec.size()); + +  SmallVector<NameOrdering, 64> Temp; +  for (size_t i = 0; i < BaseVec.size(); i++) { +    NameOrdering v; +    v.Base = BaseVec[i]; +    v.Derived = LiveVec[i]; +    Temp.push_back(v); +  } + +  std::sort(Temp.begin(), Temp.end(), NameOrdering()); +  for (size_t i = 0; i < BaseVec.size(); i++) { +    BaseVec[i] = Temp[i].Base; +    LiveVec[i] = Temp[i].Derived;    }  } @@ -1401,40 +1603,39 @@ static void stablize_order(SmallVectorImpl<Value *> 
&basevec,  // WARNING: Does not do any fixup to adjust users of the original live  // values.  That's the callers responsibility.  static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, -                       PartiallyConstructedSafepointRecord &result) { -  auto liveset = result.liveset; -  auto PointerToBase = result.PointerToBase; +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, +                       PartiallyConstructedSafepointRecord &Result, +                       std::vector<DeferredReplacement> &Replacements) { +  const auto &LiveSet = Result.LiveSet; +  const auto &PointerToBase = Result.PointerToBase;    // Convert to vector for efficient cross referencing. -  SmallVector<Value *, 64> basevec, livevec; -  livevec.reserve(liveset.size()); -  basevec.reserve(liveset.size()); -  for (Value *L : liveset) { -    livevec.push_back(L); - -    assert(PointerToBase.find(L) != PointerToBase.end()); -    Value *base = PointerToBase[L]; -    basevec.push_back(base); +  SmallVector<Value *, 64> BaseVec, LiveVec; +  LiveVec.reserve(LiveSet.size()); +  BaseVec.reserve(LiveSet.size()); +  for (Value *L : LiveSet) { +    LiveVec.push_back(L); +    assert(PointerToBase.count(L)); +    Value *Base = PointerToBase.find(L)->second; +    BaseVec.push_back(Base);    } -  assert(livevec.size() == basevec.size()); +  assert(LiveVec.size() == BaseVec.size());    // To make the output IR slightly more stable (for use in diffs), ensure a    // fixed order of the values in the safepoint (by sorting the value name).    // The order is otherwise meaningless. -  stablize_order(basevec, livevec); +  StabilizeOrder(BaseVec, LiveVec);    // Do the actual rewriting and delete the old statepoint -  makeStatepointExplicitImpl(CS, basevec, livevec, P, result); -  CS.getInstruction()->eraseFromParent(); +  makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);  }  // Helper function for the relocationViaAlloca. -// It receives iterator to the statepoint gc relocates and emits store to the -// assigned -// location (via allocaMap) for the each one of them. -// Add visited values into the visitedLiveValues set we will later use them -// for sanity check. +// +// It receives iterator to the statepoint gc relocates and emits a store to the +// assigned location (via allocaMap) for the each one of them.  It adds the +// visited values into the visitedLiveValues set, which we will later use them +// for sanity checking.  static void  insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,                         DenseMap<Value *, Value *> &AllocaMap, @@ -1459,13 +1660,15 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,      Value *Alloca = AllocaMap[OriginalValue];      // Emit store into the related alloca -    // All gc_relocate are i8 addrspace(1)* typed, and it must be bitcasted to +    // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to      // the correct type according to alloca. -    assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator"); +    assert(RelocatedValue->getNextNode() && +           "Should always have one since it's not a terminator");      IRBuilder<> Builder(RelocatedValue->getNextNode());      Value *CastedRelocatedValue = -        Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(), -        RelocatedValue->hasName() ? 
RelocatedValue->getName() + ".casted" : ""); +      Builder.CreateBitCast(RelocatedValue, +                            cast<AllocaInst>(Alloca)->getAllocatedType(), +                            suffixed_name_or(RelocatedValue, ".casted", ""));      StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);      Store->insertAfter(cast<Instruction>(CastedRelocatedValue)); @@ -1501,10 +1704,10 @@ insertRematerializationStores(    }  } -/// do all the relocation update via allocas and mem2reg +/// Do all the relocation update via allocas and mem2reg  static void relocationViaAlloca(      Function &F, DominatorTree &DT, ArrayRef<Value *> Live, -    ArrayRef<struct PartiallyConstructedSafepointRecord> Records) { +    ArrayRef<PartiallyConstructedSafepointRecord> Records) {  #ifndef NDEBUG    // record initial number of (static) allocas; we'll check we have the same    // number when we get done. @@ -1531,15 +1734,12 @@ static void relocationViaAlloca(      PromotableAllocas.push_back(Alloca);    }; -  // emit alloca for each live gc pointer -  for (unsigned i = 0; i < Live.size(); i++) { -    emitAllocaFor(Live[i]); -  } - -  // emit allocas for rematerialized values -  for (size_t i = 0; i < Records.size(); i++) { -    const struct PartiallyConstructedSafepointRecord &Info = Records[i]; +  // Emit alloca for each live gc pointer +  for (Value *V : Live) +    emitAllocaFor(V); +  // Emit allocas for rematerialized values +  for (const auto &Info : Records)      for (auto RematerializedValuePair : Info.RematerializedValues) {        Value *OriginalValue = RematerializedValuePair.second;        if (AllocaMap.count(OriginalValue) != 0) @@ -1548,20 +1748,17 @@ static void relocationViaAlloca(        emitAllocaFor(OriginalValue);        ++NumRematerializedValues;      } -  }    // The next two loops are part of the same conceptual operation.  We need to    // insert a store to the alloca after the original def and at each    // redefinition.  We need to insert a load before each use.  These are split    // into distinct loops for performance reasons. -  // update gc pointer after each statepoint -  // either store a relocated value or null (if no relocated value found for -  // this gc pointer and it is not a gc_result) -  // this must happen before we update the statepoint with load of alloca -  // otherwise we lose the link between statepoint and old def -  for (size_t i = 0; i < Records.size(); i++) { -    const struct PartiallyConstructedSafepointRecord &Info = Records[i]; +  // Update gc pointer after each statepoint: either store a relocated value or +  // null (if no relocated value was found for this gc pointer and it is not a +  // gc_result).  This must happen before we update the statepoint with load of +  // alloca otherwise we lose the link between statepoint and old def. +  for (const auto &Info : Records) {      Value *Statepoint = Info.StatepointToken;      // This will be used for consistency check @@ -1582,7 +1779,7 @@ static void relocationViaAlloca(                                    VisitedLiveValues);      if (ClobberNonLive) { -      // As a debuging aid, pretend that an unrelocated pointer becomes null at +      // As a debugging aid, pretend that an unrelocated pointer becomes null at        // the gc.statepoint.  This will turn some subtle GC problems into        // slightly easier to debug SEGVs.  
Note that on large IR files with        // lots of gc.statepoints this is extremely costly both memory and time @@ -1612,23 +1809,22 @@ static void relocationViaAlloca(        // Insert the clobbering stores.  These may get intermixed with the        // gc.results and gc.relocates, but that's fine.        if (auto II = dyn_cast<InvokeInst>(Statepoint)) { -        InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); -        InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); +        InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); +        InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());        } else { -        BasicBlock::iterator Next(cast<CallInst>(Statepoint)); -        Next++; -        InsertClobbersAt(Next); +        InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());        }      }    } -  // update use with load allocas and add store for gc_relocated + +  // Update use with load allocas and add store for gc_relocated.    for (auto Pair : AllocaMap) {      Value *Def = Pair.first;      Value *Alloca = Pair.second; -    // we pre-record the uses of allocas so that we dont have to worry about -    // later update -    // that change the user information. +    // We pre-record the uses of allocas so that we dont have to worry about +    // later update that changes the user information.. +      SmallVector<Instruction *, 20> Uses;      // PERF: trade a linear scan for repeated reallocation      Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); @@ -1663,9 +1859,9 @@ static void relocationViaAlloca(        }      } -    // emit store for the initial gc value -    // store must be inserted after load, otherwise store will be in alloca's -    // use list and an extra load will be inserted before it +    // Emit store for the initial gc value.  Store must be inserted after load, +    // otherwise store will be in alloca's use list and an extra load will be +    // inserted before it.      
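The "pre-record the uses" remark above is the usual snapshot-before-mutate pattern: the rewrite itself inserts new users of the alloca (the loads, and the store discussed here), so walking the live use list while rewriting would also visit the instructions just added. A minimal standalone illustration with plain containers instead of LLVM use lists:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Pretend these are the current users of one alloca.
  std::vector<std::string> Users = {"use1", "use2"};

  // Snapshot the user list *before* rewriting; the rewrite below appends new
  // entries, and without the snapshot those freshly added users would be
  // visited (and rewritten) as well, or worse, invalidate the iteration.
  std::vector<std::string> Snapshot(Users.begin(), Users.end());

  for (const std::string &U : Snapshot)
    Users.push_back("load.before." + U); // the "rewrite" adds new users

  for (const std::string &U : Users)
    std::cout << U << "\n";
}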
StoreInst *Store = new StoreInst(Def, Alloca);      if (Instruction *Inst = dyn_cast<Instruction>(Def)) {        if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) { @@ -1688,14 +1884,13 @@ static void relocationViaAlloca(    assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&           "we must have the same allocas with lives");    if (!PromotableAllocas.empty()) { -    // apply mem2reg to promote alloca to SSA +    // Apply mem2reg to promote alloca to SSA      PromoteMemToReg(PromotableAllocas, DT);    }  #ifndef NDEBUG -  for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; -       I++) -    if (isa<AllocaInst>(*I)) +  for (auto &I : F.getEntryBlock()) +    if (isa<AllocaInst>(I))        InitialAllocaNum--;    assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");  #endif @@ -1719,28 +1914,27 @@ static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,      // No values to hold live, might as well not insert the empty holder      return; -  Module *M = CS.getInstruction()->getParent()->getParent()->getParent(); +  Module *M = CS.getInstruction()->getModule();    // Use a dummy vararg function to actually hold the values live    Function *Func = cast<Function>(M->getOrInsertFunction(        "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));    if (CS.isCall()) {      // For call safepoints insert dummy calls right after safepoint -    BasicBlock::iterator Next(CS.getInstruction()); -    Next++; -    Holders.push_back(CallInst::Create(Func, Values, "", Next)); +    Holders.push_back(CallInst::Create(Func, Values, "", +                                       &*++CS.getInstruction()->getIterator()));      return;    }    // For invoke safepooints insert dummy calls both in normal and    // exceptional destination blocks    auto *II = cast<InvokeInst>(CS.getInstruction());    Holders.push_back(CallInst::Create( -      Func, Values, "", II->getNormalDest()->getFirstInsertionPt())); +      Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));    Holders.push_back(CallInst::Create( -      Func, Values, "", II->getUnwindDest()->getFirstInsertionPt())); +      Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));  }  static void findLiveReferences( -    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, +    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,      MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {    GCPtrLivenessData OriginalLivenessData;    computeLiveInValues(DT, F, OriginalLivenessData); @@ -1751,12 +1945,12 @@ static void findLiveReferences(    }  } -/// Remove any vector of pointers from the liveset by scalarizing them over the -/// statepoint instruction.  Adds the scalarized pieces to the liveset.  It -/// would be preferrable to include the vector in the statepoint itself, but +/// Remove any vector of pointers from the live set by scalarizing them over the +/// statepoint instruction.  Adds the scalarized pieces to the live set.  It +/// would be preferable to include the vector in the statepoint itself, but  /// the lowering code currently does not handle that.  Extending it would be  /// slightly non-trivial since it requires a format change.  Given how rare -/// such cases are (for the moment?) scalarizing is an acceptable comprimise. +/// such cases are (for the moment?) scalarizing is an acceptable compromise.  
static void splitVectorValues(Instruction *StatepointInst,                                StatepointLiveSetTy &LiveSet,                                DenseMap<Value *, Value *>& PointerToBase, @@ -1887,7 +2081,7 @@ static void splitVectorValues(Instruction *StatepointInst,  // Helper function for the "rematerializeLiveValues". It walks use chain  // starting from the "CurrentValue" until it meets "BaseValue". Only "simple"  // values are visited (currently it is GEP's and casts). Returns true if it -// sucessfully reached "BaseValue" and false otherwise. +// successfully reached "BaseValue" and false otherwise.  // Fills "ChainToBase" array with all visited values. "BaseValue" is not  // recorded.  static bool findRematerializableChainToBasePointer( @@ -1907,16 +2101,12 @@ static bool findRematerializableChainToBasePointer(    }    if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) { -    Value *Def = CI->stripPointerCasts(); - -    // This two checks are basically similar. First one is here for the -    // consistency with findBasePointers logic. -    assert(!isa<CastInst>(Def) && "not a pointer cast found");      if (!CI->isNoopCast(CI->getModule()->getDataLayout()))        return false;      ChainToBase.push_back(CI); -    return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue); +    return findRematerializableChainToBasePointer(ChainToBase, +                                                  CI->getOperand(0), BaseValue);    }    // Not supported instruction in the chain @@ -1957,8 +2147,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,    return Cost;  } -// From the statepoint liveset pick values that are cheaper to recompute then to -// relocate. Remove this values from the liveset, rematerialize them after +// From the statepoint live set pick values that are cheaper to recompute then +// to relocate. Remove this values from the live set, rematerialize them after  // statepoint and record them in "Info" structure. Note that similar to  // relocated values we don't do any user adjustments here.  static void rematerializeLiveValues(CallSite CS, @@ -1970,10 +2160,10 @@ static void rematerializeLiveValues(CallSite CS,    // We can not di this in following loop due to iterator invalidation.    
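The rematerialization logic that follows boils down to a cost comparison: walk the chain of simple instructions (no-op casts and GEPs) that derives the live pointer from its base, total their estimated cost, and recompute that chain after the statepoint only when the total stays under a small threshold; otherwise relocate as usual. A hedged standalone sketch of that accounting, where the per-step costs and the threshold are illustrative numbers, not what TargetTransformInfo would actually report:

#include <vector>

enum class StepKind { NoopCast, GEP };

struct ChainStep {
  StepKind Kind;
  bool AllConstantIndices; // only meaningful for GEPs
};

// Rough per-step costs: free casts cost nothing, address arithmetic costs a
// little, dynamic indices a little more.  Purely illustrative values.
static unsigned stepCost(const ChainStep &S) {
  if (S.Kind == StepKind::NoopCast)
    return 0;
  return S.AllConstantIndices ? 1 : 2;
}

// Rematerialize (recompute after the safepoint) only if redoing the whole
// derivation chain is cheap; otherwise leave the value to be relocated.
bool shouldRematerialize(const std::vector<ChainStep> &ChainToBase,
                         unsigned RematThreshold = 2) {
  unsigned Cost = 0;
  for (const ChainStep &S : ChainToBase)
    Cost += stepCost(S);
  return Cost <= RematThreshold;
}

int main() {
  // A pointer derived as: bitcast, gep (constant indices), gep (dynamic).
  std::vector<ChainStep> Chain = {{StepKind::NoopCast, false},
                                  {StepKind::GEP, true},
                                  {StepKind::GEP, false}};
  return shouldRematerialize(Chain) ? 0 : 1; // cost 3 > 2, so relocate
}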
SmallVector<Value *, 32> LiveValuesToBeDeleted; -  for (Value *LiveValue: Info.liveset) { +  for (Value *LiveValue: Info.LiveSet) {      // For each live pointer find it's defining chain      SmallVector<Instruction *, 3> ChainToBase; -    assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end()); +    assert(Info.PointerToBase.count(LiveValue));      bool FoundChain =        findRematerializableChainToBasePointer(ChainToBase,                                               LiveValue, @@ -2059,9 +2249,9 @@ static void rematerializeLiveValues(CallSite CS,        InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());        Instruction *NormalInsertBefore = -          Invoke->getNormalDest()->getFirstInsertionPt(); +          &*Invoke->getNormalDest()->getFirstInsertionPt();        Instruction *UnwindInsertBefore = -          Invoke->getUnwindDest()->getFirstInsertionPt(); +          &*Invoke->getUnwindDest()->getFirstInsertionPt();        Instruction *NormalRematerializedValue =            rematerializeChain(NormalInsertBefore); @@ -2075,22 +2265,23 @@ static void rematerializeLiveValues(CallSite CS,    // Remove rematerializaed values from the live set    for (auto LiveValue: LiveValuesToBeDeleted) { -    Info.liveset.erase(LiveValue); +    Info.LiveSet.erase(LiveValue);    }  } -static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, -                              SmallVectorImpl<CallSite> &toUpdate) { +static bool insertParsePoints(Function &F, DominatorTree &DT, +                              TargetTransformInfo &TTI, +                              SmallVectorImpl<CallSite> &ToUpdate) {  #ifndef NDEBUG    // sanity check the input -  std::set<CallSite> uniqued; -  uniqued.insert(toUpdate.begin(), toUpdate.end()); -  assert(uniqued.size() == toUpdate.size() && "no duplicates please!"); +  std::set<CallSite> Uniqued; +  Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); +  assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); -  for (size_t i = 0; i < toUpdate.size(); i++) { -    CallSite &CS = toUpdate[i]; +  for (CallSite CS : ToUpdate) {      assert(CS.getInstruction()->getParent()->getParent() == &F); -    assert(isStatepoint(CS) && "expected to already be a deopt statepoint"); +    assert((UseDeoptBundles || isStatepoint(CS)) && +           "expected to already be a deopt statepoint");    }  #endif @@ -2098,50 +2289,45 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,    // the top of the successor blocks.  See the comment on    // normalForInvokeSafepoint on exactly what is needed.  Note that this step    // may restructure the CFG. -  for (CallSite CS : toUpdate) { +  for (CallSite CS : ToUpdate) {      if (!CS.isInvoke())        continue; -    InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction()); -    normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(), -                                DT); -    normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(), -                                DT); +    auto *II = cast<InvokeInst>(CS.getInstruction()); +    normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT); +    normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);    }    // A list of dummy calls added to the IR to keep various values obviously    // live in the IR.  We'll remove all of these when done. 
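The dummy holders declared just below are calls to a throwaway vararg function whose only purpose is to give each deopt value one extra, obvious use, so the liveness data-flow keeps it alive across the safepoint; the holders are erased again once liveness has been recomputed. A standalone sketch of the same keep-a-dummy-use trick (tmpUse and the surrounding names are made up for illustration; in the pass the holder is a real call instruction in the IR):

#include <cstdio>
#include <vector>

// Stand-in for "__tmp_use": a call that does nothing except mention its
// arguments, giving them one more (temporary) use that an analysis has to
// respect until the holder is removed again.
void tmpUse(const std::vector<const void *> &) {}

int main() {
  int DeoptState = 42;
  std::vector<std::vector<const void *>> Holders;

  // "Insert" a holder right after the would-be safepoint ...
  Holders.push_back({&DeoptState});
  tmpUse(Holders.back());

  // ... run whatever analysis needed the extra uses ...
  std::printf("%zu holder(s) active\n", Holders.size());

  // ... then erase the holders, exactly like the pass does afterwards.
  Holders.clear();
}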
-  SmallVector<CallInst *, 64> holders; +  SmallVector<CallInst *, 64> Holders;    // Insert a dummy call with all of the arguments to the vm_state we'll need    // for the actual safepoint insertion.  This ensures reference arguments in    // the deopt argument list are considered live through the safepoint (and    // thus makes sure they get relocated.) -  for (size_t i = 0; i < toUpdate.size(); i++) { -    CallSite &CS = toUpdate[i]; -    Statepoint StatepointCS(CS); - +  for (CallSite CS : ToUpdate) {      SmallVector<Value *, 64> DeoptValues; -    for (Use &U : StatepointCS.vm_state_args()) { -      Value *Arg = cast<Value>(&U); + +    iterator_range<const Use *> DeoptStateRange = +        UseDeoptBundles +            ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) +            : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); + +    for (Value *Arg : DeoptStateRange) {        assert(!isUnhandledGCPointerType(Arg->getType()) &&               "support for FCA unimplemented");        if (isHandledGCPointerType(Arg->getType()))          DeoptValues.push_back(Arg);      } -    insertUseHolderAfter(CS, DeoptValues, holders); -  } -  SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; -  records.reserve(toUpdate.size()); -  for (size_t i = 0; i < toUpdate.size(); i++) { -    struct PartiallyConstructedSafepointRecord info; -    records.push_back(info); +    insertUseHolderAfter(CS, DeoptValues, Holders);    } -  assert(records.size() == toUpdate.size()); -  // A) Identify all gc pointers which are staticly live at the given call +  SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size()); + +  // A) Identify all gc pointers which are statically live at the given call    // site. -  findLiveReferences(F, DT, P, toUpdate, records); +  findLiveReferences(F, DT, ToUpdate, Records);    // B) Find the base pointers for each live pointer    /* scope for caching */ { @@ -2150,10 +2336,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,      // large numbers of duplicate base_phis.      DefiningValueMapTy DVCache; -    for (size_t i = 0; i < records.size(); i++) { -      struct PartiallyConstructedSafepointRecord &info = records[i]; -      CallSite &CS = toUpdate[i]; -      findBasePointers(DT, DVCache, CS, info); +    for (size_t i = 0; i < Records.size(); i++) { +      PartiallyConstructedSafepointRecord &info = Records[i]; +      findBasePointers(DT, DVCache, ToUpdate[i], info);      }    } // end of cache scope @@ -2170,63 +2355,75 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,    // the base pointers which were identified for that safepoint.  We'll then    // ask liveness for _every_ base inserted to see what is now live.  Then we    // remove the dummy calls. -  holders.reserve(holders.size() + records.size()); -  for (size_t i = 0; i < records.size(); i++) { -    struct PartiallyConstructedSafepointRecord &info = records[i]; -    CallSite &CS = toUpdate[i]; +  Holders.reserve(Holders.size() + Records.size()); +  for (size_t i = 0; i < Records.size(); i++) { +    PartiallyConstructedSafepointRecord &Info = Records[i];      SmallVector<Value *, 128> Bases; -    for (auto Pair : info.PointerToBase) { +    for (auto Pair : Info.PointerToBase)        Bases.push_back(Pair.second); -    } -    insertUseHolderAfter(CS, Bases, holders); + +    insertUseHolderAfter(ToUpdate[i], Bases, Holders);    }    // By selecting base pointers, we've effectively inserted new uses. 
Thus, we    // need to rerun liveness.  We may *also* have inserted new defs, but that's    // not the key issue. -  recomputeLiveInValues(F, DT, P, toUpdate, records); +  recomputeLiveInValues(F, DT, ToUpdate, Records);    if (PrintBasePointers) { -    for (size_t i = 0; i < records.size(); i++) { -      struct PartiallyConstructedSafepointRecord &info = records[i]; +    for (auto &Info : Records) {        errs() << "Base Pairs: (w/Relocation)\n"; -      for (auto Pair : info.PointerToBase) { -        errs() << " derived %" << Pair.first->getName() << " base %" -               << Pair.second->getName() << "\n"; +      for (auto Pair : Info.PointerToBase) { +        errs() << " derived "; +        Pair.first->printAsOperand(errs(), false); +        errs() << " base "; +        Pair.second->printAsOperand(errs(), false); +        errs() << "\n";        }      }    } -  for (size_t i = 0; i < holders.size(); i++) { -    holders[i]->eraseFromParent(); -    holders[i] = nullptr; -  } -  holders.clear(); + +  // It is possible that non-constant live variables have a constant base.  For +  // example, a GEP with a variable offset from a global.  In this case we can +  // remove it from the liveset.  We already don't add constants to the liveset +  // because we assume they won't move at runtime and the GC doesn't need to be +  // informed about them.  The same reasoning applies if the base is constant. +  // Note that the relocation placement code relies on this filtering for +  // correctness as it expects the base to be in the liveset, which isn't true +  // if the base is constant. +  for (auto &Info : Records) +    for (auto &BasePair : Info.PointerToBase) +      if (isa<Constant>(BasePair.second)) +        Info.LiveSet.erase(BasePair.first); + +  for (CallInst *CI : Holders) +    CI->eraseFromParent(); + +  Holders.clear();    // Do a limited scalarization of any live at safepoint vector values which    // contain pointers.  This enables this pass to run after vectorization at    // the cost of some possible performance loss.  TODO: it would be nice to    // natively support vectors all the way through the backend so we don't need    // to scalarize here. -  for (size_t i = 0; i < records.size(); i++) { -    struct PartiallyConstructedSafepointRecord &info = records[i]; -    Instruction *statepoint = toUpdate[i].getInstruction(); -    splitVectorValues(cast<Instruction>(statepoint), info.liveset, -                      info.PointerToBase, DT); +  for (size_t i = 0; i < Records.size(); i++) { +    PartiallyConstructedSafepointRecord &Info = Records[i]; +    Instruction *Statepoint = ToUpdate[i].getInstruction(); +    splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, +                      Info.PointerToBase, DT);    }    // In order to reduce live set of statepoint we might choose to rematerialize -  // some values instead of relocating them. This is purelly an optimization and +  // some values instead of relocating them. This is purely an optimization and    // does not influence correctness. 
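The constant-base filtering above covers cases like a pointer computed at a variable offset into a global: the derived pointer itself is not a constant, but its base is, and a moving collector does not need to hear about either of them. In C-level terms (purely illustrative):

// 'table' lives at an address fixed at link time, so its address is a
// Constant in IR terms and is assumed not to move at runtime.
static int table[64];

int *elementFor(unsigned Idx) {
  // 'P' is not a constant (it depends on 'Idx'), but its base pointer
  // ('table') is, so the pass can drop it from the statepoint live set.
  int *P = &table[Idx % 64];
  return P;
}

int main() { return *elementFor(3); } // table is zero-initialized, returns 0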
-  TargetTransformInfo &TTI = -    P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); +  for (size_t i = 0; i < Records.size(); i++) +    rematerializeLiveValues(ToUpdate[i], Records[i], TTI); -  for (size_t i = 0; i < records.size(); i++) { -    struct PartiallyConstructedSafepointRecord &info = records[i]; -    CallSite &CS = toUpdate[i]; - -    rematerializeLiveValues(CS, info, TTI); -  } +  // We need this to safely RAUW and delete call or invoke return values that +  // may themselves be live over a statepoint.  For details, please see usage in +  // makeStatepointExplicitImpl. +  std::vector<DeferredReplacement> Replacements;    // Now run through and replace the existing statepoints with new ones with    // the live variables listed.  We do not yet update uses of the values being @@ -2234,61 +2431,77 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,    // survive to the last iteration of this loop.  (By construction, the    // previous statepoint can not be a live variable, thus we can and remove    // the old statepoint calls as we go.) -  for (size_t i = 0; i < records.size(); i++) { -    struct PartiallyConstructedSafepointRecord &info = records[i]; -    CallSite &CS = toUpdate[i]; -    makeStatepointExplicit(DT, CS, P, info); +  for (size_t i = 0; i < Records.size(); i++) +    makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements); + +  ToUpdate.clear(); // prevent accident use of invalid CallSites + +  for (auto &PR : Replacements) +    PR.doReplacement(); + +  Replacements.clear(); + +  for (auto &Info : Records) { +    // These live sets may contain state Value pointers, since we replaced calls +    // with operand bundles with calls wrapped in gc.statepoint, and some of +    // those calls may have been def'ing live gc pointers.  Clear these out to +    // avoid accidentally using them. +    // +    // TODO: We should create a separate data structure that does not contain +    // these live sets, and migrate to using that data structure from this point +    // onward. +    Info.LiveSet.clear(); +    Info.PointerToBase.clear();    } -  toUpdate.clear(); // prevent accident use of invalid CallSites    // Do all the fixups of the original live variables to their relocated selves -  SmallVector<Value *, 128> live; -  for (size_t i = 0; i < records.size(); i++) { -    struct PartiallyConstructedSafepointRecord &info = records[i]; +  SmallVector<Value *, 128> Live; +  for (size_t i = 0; i < Records.size(); i++) { +    PartiallyConstructedSafepointRecord &Info = Records[i]; +      // We can't simply save the live set from the original insertion.  One of      // the live values might be the result of a call which needs a safepoint.      // That Value* no longer exists and we need to use the new gc_result. -    // Thankfully, the liveset is embedded in the statepoint (and updated), so +    // Thankfully, the live set is embedded in the statepoint (and updated), so      // we just grab that. -    Statepoint statepoint(info.StatepointToken); -    live.insert(live.end(), statepoint.gc_args_begin(), -                statepoint.gc_args_end()); +    Statepoint Statepoint(Info.StatepointToken); +    Live.insert(Live.end(), Statepoint.gc_args_begin(), +                Statepoint.gc_args_end());  #ifndef NDEBUG      // Do some basic sanity checks on our liveness results before performing      // relocation.  Relocation can and will turn mistakes in liveness results      // into non-sensical code which is must harder to debug.      
// TODO: It would be nice to test consistency as well -    assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) && +    assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&             "statepoint must be reachable or liveness is meaningless"); -    for (Value *V : statepoint.gc_args()) { +    for (Value *V : Statepoint.gc_args()) {        if (!isa<Instruction>(V))          // Non-instruction values trivial dominate all possible uses          continue; -      auto LiveInst = cast<Instruction>(V); +      auto *LiveInst = cast<Instruction>(V);        assert(DT.isReachableFromEntry(LiveInst->getParent()) &&               "unreachable values should never be live"); -      assert(DT.dominates(LiveInst, info.StatepointToken) && +      assert(DT.dominates(LiveInst, Info.StatepointToken) &&               "basic SSA liveness expectation violated by liveness analysis");      }  #endif    } -  unique_unsorted(live); +  unique_unsorted(Live);  #ifndef NDEBUG    // sanity check -  for (auto ptr : live) { -    assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type"); -  } +  for (auto *Ptr : Live) +    assert(isGCPointerType(Ptr->getType()) && "must be a gc pointer type");  #endif -  relocationViaAlloca(F, DT, live, records); -  return !records.empty(); +  relocationViaAlloca(F, DT, Live, Records); +  return !Records.empty();  }  // Handles both return values and arguments for Functions and CallSites.  template <typename AttrHolder> -static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, -                                   unsigned Index) { +static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, +                                      unsigned Index) {    AttrBuilder R;    if (AH.getDereferenceableBytes(Index))      R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, @@ -2296,6 +2509,8 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,    if (AH.getDereferenceableOrNullBytes(Index))      R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,                                    AH.getDereferenceableOrNullBytes(Index))); +  if (AH.doesNotAlias(Index)) +    R.addAttribute(Attribute::NoAlias);    if (!R.empty())      AH.setAttributes(AH.getAttributes().removeAttributes( @@ -2303,25 +2518,25 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,  }  void -RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) { +RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {    LLVMContext &Ctx = F.getContext();    for (Argument &A : F.args())      if (isa<PointerType>(A.getType())) -      RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1); +      RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);    if (isa<PointerType>(F.getReturnType())) -    RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); +    RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);  } -void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {    if (F.empty())      return;    LLVMContext &Ctx = F.getContext();    MDBuilder Builder(Ctx); -  for (Instruction &I : inst_range(F)) { +  for (Instruction &I : instructions(F)) {      if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {        assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");        bool IsImmutableTBAA = @@ -2344,9 +2559,9 @@ void 
RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) {      if (CallSite CS = CallSite(&I)) {        for (int i = 0, e = CS.arg_size(); i != e; i++)          if (isa<PointerType>(CS.getArgument(i)->getType())) -          RemoveDerefAttrAtIndex(Ctx, CS, i + 1); +          RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);        if (isa<PointerType>(CS.getType())) -        RemoveDerefAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); +        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);      }    }  } @@ -2365,17 +2580,17 @@ static bool shouldRewriteStatepointsIn(Function &F) {      return false;  } -void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {  #ifndef NDEBUG    assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) &&           "precondition!");  #endif    for (Function &F : M) -    stripDereferenceabilityInfoFromPrototype(F); +    stripNonValidAttributesFromPrototype(F);    for (Function &F : M) -    stripDereferenceabilityInfoFromBody(F); +    stripNonValidAttributesFromBody(F);  }  bool RewriteStatepointsForGC::runOnFunction(Function &F) { @@ -2389,15 +2604,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {      return false;    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); +  TargetTransformInfo &TTI = +      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + +  auto NeedsRewrite = [](Instruction &I) { +    if (UseDeoptBundles) { +      if (ImmutableCallSite CS = ImmutableCallSite(&I)) +        return !callsGCLeafFunction(CS); +      return false; +    } + +    return isStatepoint(I); +  };    // Gather all the statepoints which need rewritten.  Be careful to only    // consider those in reachable code since we need to ask dominance queries    // when rewriting.  We'll delete the unreachable ones in a moment.    SmallVector<CallSite, 64> ParsePointNeeded;    bool HasUnreachableStatepoint = false; -  for (Instruction &I : inst_range(F)) { +  for (Instruction &I : instructions(F)) {      // TODO: only the ones with the flag set! -    if (isStatepoint(I)) { +    if (NeedsRewrite(I)) {        if (DT.isReachableFromEntry(I.getParent()))          ParsePointNeeded.push_back(CallSite(&I));        else @@ -2428,7 +2655,38 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {        FoldSingleEntryPHINodes(&BB);      } -  MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded); +  // Before we start introducing relocations, we want to tweak the IR a bit to +  // avoid unfortunate code generation effects.  The main example is that we  +  // want to try to make sure the comparison feeding a branch is after any +  // safepoints.  Otherwise, we end up with a comparison of pre-relocation +  // values feeding a branch after relocation.  This is semantically correct, +  // but results in extra register pressure since both the pre-relocation and +  // post-relocation copies must be available in registers.  For code without +  // relocations this is handled elsewhere, but teaching the scheduler to +  // reverse the transform we're about to do would be slightly complex. +  // Note: This may extend the live range of the inputs to the icmp and thus +  // increase the liveset of any statepoint we move over.  This is profitable +  // as long as all statepoints are in rare blocks.  If we had in-register +  // lowering for live values this would be a much safer transform. 
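The reordering described above is small: when a conditional branch is fed by a single-use compare computed earlier in the block, the compare is moved down to sit immediately before the branch, so a safepoint in between sees the compare's pointer operands (which it can relocate) rather than an already computed flag plus the original pointers. A standalone sketch of just the reordering, on a toy instruction list; the structs below are made-up stand-ins, not LLVM classes:

#include <iostream>
#include <iterator>
#include <list>
#include <string>

struct Inst {
  std::string Text;
  bool IsCompare;
  bool SingleUse;
};

// Move the first single-use compare (assumed here to be the branch condition)
// down to the slot immediately before the block terminator.
void sinkCondition(std::list<Inst> &Block) {
  if (Block.size() < 2)
    return;
  auto Term = std::prev(Block.end()); // the branch
  for (auto It = Block.begin(); It != Term; ++It)
    if (It->IsCompare && It->SingleUse) {
      Block.splice(Term, Block, It); // re-link the compare before the branch
      return;
    }
}

int main() {
  std::list<Inst> Block = {{"%c = icmp eq i64* %p, %q", true, true},
                           {"call void @safepoint()", false, false},
                           {"br i1 %c, ...", false, false}};
  sinkCondition(Block);
  for (const Inst &I : Block)
    std::cout << I.Text << "\n"; // the icmp now sits right above the br
}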
+  auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { +    if (auto *BI = dyn_cast<BranchInst>(TI)) +      if (BI->isConditional()) +        return dyn_cast<Instruction>(BI->getCondition()); +    // TODO: Extend this to handle switches +    return nullptr; +  }; +  for (BasicBlock &BB : F) { +    TerminatorInst *TI = BB.getTerminator(); +    if (auto *Cond = getConditionInst(TI)) +      // TODO: Handle more than just ICmps here.  We should be able to move +      // most instructions without side effects or memory access.   +      if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) { +        MadeChange = true; +        Cond->moveBefore(TI); +      } +  } + +  MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);    return MadeChange;  } @@ -2461,7 +2719,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,               "support for FCA unimplemented");        if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {          // The choice to exclude all things constant here is slightly subtle. -        // There are two idependent reasons: +        // There are two independent reasons:          // - We assume that things which are constant (from LLVM's definition)          // do not move at runtime.  For example, the address of a global          // variable is fixed, even though it's contents may not be. @@ -2599,7 +2857,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,    } // while( !worklist.empty() )  #ifndef NDEBUG -  // Sanity check our ouput against SSA properties.  This helps catch any +  // Sanity check our output against SSA properties.  This helps catch any    // missing kills during the above iteration.    for (BasicBlock &BB : F) {      checkBasicSSA(DT, Data, BB); @@ -2620,7 +2878,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,    // call result is not live (normal), nor are it's arguments    // (unless they're used again later).  This adjustment is    // specifically what we need to relocate -  BasicBlock::reverse_iterator rend(Inst); +  BasicBlock::reverse_iterator rend(Inst->getIterator());    computeLiveInValues(BB->rbegin(), rend, LiveOut);    LiveOut.erase(Inst);    Out.insert(LiveOut.begin(), LiveOut.end()); @@ -2669,5 +2927,5 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,      assert(Updated.count(KVPair.first) && "record for non-live value");  #endif -  Info.liveset = Updated; +  Info.LiveSet = Updated;  } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 4d3a708fa20e..2fca803adde8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -24,6 +24,7 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/ConstantFolding.h"  #include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/IR/CallSite.h" @@ -479,6 +480,13 @@ private:    void visitExtractValueInst(ExtractValueInst &EVI);    void visitInsertValueInst(InsertValueInst &IVI);    void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } +  void visitFuncletPadInst(FuncletPadInst &FPI) { +    markAnythingOverdefined(&FPI); +  } +  void visitCatchSwitchInst(CatchSwitchInst &CPI) { +    markAnythingOverdefined(&CPI); +    visitTerminatorInst(CPI); +  }    // Instructions that cannot be folded away.    
void visitStoreInst     (StoreInst &I); @@ -539,9 +547,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,      return;    } -  if (isa<InvokeInst>(TI)) { -    // Invoke instructions successors are always executable. -    Succs[0] = Succs[1] = true; +  // Unwinding instructions successors are always executable. +  if (TI.isExceptional()) { +    Succs.assign(TI.getNumSuccessors(), true);      return;    } @@ -605,8 +613,8 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {      return BI->getSuccessor(CI->isZero()) == To;    } -  // Invoke instructions successors are always executable. -  if (isa<InvokeInst>(TI)) +  // Unwinding instructions successors are always executable. +  if (TI->isExceptional())      return true;    if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { @@ -630,7 +638,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {  #ifndef NDEBUG    dbgs() << "Unknown terminator instruction: " << *TI << '\n';  #endif -  llvm_unreachable(nullptr); +  llvm_unreachable("SCCP: Don't know how to handle this terminator!");  }  // visit Implementations - Something changed in this instruction, either an @@ -1126,7 +1134,7 @@ CallOverdefined:    // entry block executable and merge in the actual arguments to the call into    // the formal arguments of the function.    if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ -    MarkBlockExecutable(F->begin()); +    MarkBlockExecutable(&F->front());      // Propagate information from this call site into the callee.      CallSite::arg_iterator CAI = CS.arg_begin(); @@ -1135,17 +1143,17 @@ CallOverdefined:        // If this argument is byval, and if the function is not readonly, there        // will be an implicit copy formed of the input aggregate.        if (AI->hasByValAttr() && !F->onlyReadsMemory()) { -        markOverdefined(AI); +        markOverdefined(&*AI);          continue;        }        if (StructType *STy = dyn_cast<StructType>(AI->getType())) {          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {            LatticeVal CallArg = getStructValueState(*CAI, i); -          mergeInValue(getStructValueState(AI, i), AI, CallArg); +          mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);          }        } else { -        mergeInValue(AI, getValueState(*CAI)); +        mergeInValue(&*AI, getValueState(*CAI));        }      }    } @@ -1246,18 +1254,18 @@ void SCCPSolver::Solve() {  /// even if X isn't defined.  bool SCCPSolver::ResolvedUndefsIn(Function &F) {    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { -    if (!BBExecutable.count(BB)) +    if (!BBExecutable.count(&*BB))        continue; -    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { +    for (Instruction &I : *BB) {        // Look for instructions which produce undef values. -      if (I->getType()->isVoidTy()) continue; +      if (I.getType()->isVoidTy()) continue; -      if (StructType *STy = dyn_cast<StructType>(I->getType())) { +      if (StructType *STy = dyn_cast<StructType>(I.getType())) {          // Only a few things that can be structs matter for undef.          // Tracked calls must never be marked overdefined in ResolvedUndefsIn. 
-        if (CallSite CS = CallSite(I)) +        if (CallSite CS = CallSite(&I))            if (Function *F = CS.getCalledFunction())              if (MRVFunctionsTracked.count(F))                continue; @@ -1270,14 +1278,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          // Send the results of everything else to overdefined.  We could be          // more precise than this but it isn't worth bothering.          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { -          LatticeVal &LV = getStructValueState(I, i); +          LatticeVal &LV = getStructValueState(&I, i);            if (LV.isUndefined()) -            markOverdefined(LV, I); +            markOverdefined(LV, &I);          }          continue;        } -      LatticeVal &LV = getValueState(I); +      LatticeVal &LV = getValueState(&I);        if (!LV.isUndefined()) continue;        // extractvalue is safe; check here because the argument is a struct. @@ -1287,24 +1295,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {        // Compute the operand LatticeVals, for convenience below.        // Anything taking a struct is conservatively assumed to require        // overdefined markings. -      if (I->getOperand(0)->getType()->isStructTy()) { -        markOverdefined(I); +      if (I.getOperand(0)->getType()->isStructTy()) { +        markOverdefined(&I);          return true;        } -      LatticeVal Op0LV = getValueState(I->getOperand(0)); +      LatticeVal Op0LV = getValueState(I.getOperand(0));        LatticeVal Op1LV; -      if (I->getNumOperands() == 2) { -        if (I->getOperand(1)->getType()->isStructTy()) { -          markOverdefined(I); +      if (I.getNumOperands() == 2) { +        if (I.getOperand(1)->getType()->isStructTy()) { +          markOverdefined(&I);            return true;          } -        Op1LV = getValueState(I->getOperand(1)); +        Op1LV = getValueState(I.getOperand(1));        }        // If this is an instructions whose result is defined even if the input is        // not fully defined, propagate the information. -      Type *ITy = I->getType(); -      switch (I->getOpcode()) { +      Type *ITy = I.getType(); +      switch (I.getOpcode()) {        case Instruction::Add:        case Instruction::Sub:        case Instruction::Trunc: @@ -1318,9 +1326,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {        case Instruction::FRem:          // Floating-point binary operation: be conservative.          if (Op0LV.isUndefined() && Op1LV.isUndefined()) -          markForcedConstant(I, Constant::getNullValue(ITy)); +          markForcedConstant(&I, Constant::getNullValue(ITy));          else -          markOverdefined(I); +          markOverdefined(&I);          return true;        case Instruction::ZExt:        case Instruction::SExt: @@ -1332,7 +1340,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {        case Instruction::SIToFP:        case Instruction::UIToFP:          // undef -> 0; some outputs are impossible -        markForcedConstant(I, Constant::getNullValue(ITy)); +        markForcedConstant(&I, Constant::getNullValue(ITy));          return true;        case Instruction::Mul:        case Instruction::And: @@ -1341,7 +1349,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {            break;          // undef * X -> 0.   X could be zero.          // undef & X -> 0.   X could be zero. 
-        markForcedConstant(I, Constant::getNullValue(ITy)); +        markForcedConstant(&I, Constant::getNullValue(ITy));          return true;        case Instruction::Or: @@ -1349,7 +1357,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          if (Op0LV.isUndefined() && Op1LV.isUndefined())            break;          // undef | X -> -1.   X could be -1. -        markForcedConstant(I, Constant::getAllOnesValue(ITy)); +        markForcedConstant(&I, Constant::getAllOnesValue(ITy));          return true;        case Instruction::Xor: @@ -1357,7 +1365,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          // necessary, but we try to be nice to people who expect this          // behavior in simple cases          if (Op0LV.isUndefined() && Op1LV.isUndefined()) { -          markForcedConstant(I, Constant::getNullValue(ITy)); +          markForcedConstant(&I, Constant::getNullValue(ITy));            return true;          }          // undef ^ X -> undef @@ -1373,7 +1381,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          // undef / X -> 0.   X could be maxint.          // undef % X -> 0.   X could be 1. -        markForcedConstant(I, Constant::getNullValue(ITy)); +        markForcedConstant(&I, Constant::getNullValue(ITy));          return true;        case Instruction::AShr: @@ -1381,7 +1389,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          if (Op1LV.isUndefined()) break;          // undef >>a X -> all ones -        markForcedConstant(I, Constant::getAllOnesValue(ITy)); +        markForcedConstant(&I, Constant::getAllOnesValue(ITy));          return true;        case Instruction::LShr:        case Instruction::Shl: @@ -1391,17 +1399,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          // undef << X -> 0          // undef >> X -> 0 -        markForcedConstant(I, Constant::getNullValue(ITy)); +        markForcedConstant(&I, Constant::getNullValue(ITy));          return true;        case Instruction::Select: -        Op1LV = getValueState(I->getOperand(1)); +        Op1LV = getValueState(I.getOperand(1));          // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.          if (Op0LV.isUndefined()) {            if (!Op1LV.isConstant())  // Pick the constant one if there is any. -            Op1LV = getValueState(I->getOperand(2)); +            Op1LV = getValueState(I.getOperand(2));          } else if (Op1LV.isUndefined()) {            // c ? undef : undef -> undef.  No change. -          Op1LV = getValueState(I->getOperand(2)); +          Op1LV = getValueState(I.getOperand(2));            if (Op1LV.isUndefined())              break;            // Otherwise, c ? undef : x -> x. @@ -1410,9 +1418,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          }          if (Op1LV.isConstant()) -          markForcedConstant(I, Op1LV.getConstant()); +          markForcedConstant(&I, Op1LV.getConstant());          else -          markOverdefined(I); +          markOverdefined(&I);          return true;        case Instruction::Load:          // A load here means one of two things: a load of undef from a global, @@ -1421,9 +1429,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          break;        case Instruction::ICmp:          // X == undef -> undef.  Other comparisons get more complicated. 
-        if (cast<ICmpInst>(I)->isEquality()) +        if (cast<ICmpInst>(&I)->isEquality())            break; -        markOverdefined(I); +        markOverdefined(&I);          return true;        case Instruction::Call:        case Instruction::Invoke: { @@ -1432,19 +1440,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {          // 2. It could be constant-foldable.          // Because of the way we solve return values, tracked calls must          // never be marked overdefined in ResolvedUndefsIn. -        if (Function *F = CallSite(I).getCalledFunction()) +        if (Function *F = CallSite(&I).getCalledFunction())            if (TrackedRetVals.count(F))              break;          // If the call is constant-foldable, we mark it overdefined because          // we do not know what return values are valid. -        markOverdefined(I); +        markOverdefined(&I);          return true;        }        default:          // If we don't know what should happen here, conservatively mark it          // overdefined. -        markOverdefined(I); +        markOverdefined(&I);          return true;        }      } @@ -1462,7 +1470,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {        // false.        if (isa<UndefValue>(BI->getCondition())) {          BI->setCondition(ConstantInt::getFalse(BI->getContext())); -        markEdgeExecutable(BB, TI->getSuccessor(1)); +        markEdgeExecutable(&*BB, TI->getSuccessor(1));          return true;        } @@ -1484,7 +1492,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {        // the first constant.        if (isa<UndefValue>(SI->getCondition())) {          SI->setCondition(SI->case_begin().getCaseValue()); -        markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); +        markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor());          return true;        } @@ -1506,6 +1514,7 @@ namespace {    struct SCCP : public FunctionPass {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.addRequired<TargetLibraryInfoWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>();      }      static char ID; // Pass identification, replacement for typeid      SCCP() : FunctionPass(ID) { @@ -1541,11 +1550,10 @@ static void DeleteInstructionInBlock(BasicBlock *BB) {    Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.    while (EndInst != BB->begin()) {      // Delete the next to last instruction. -    BasicBlock::iterator I = EndInst; -    Instruction *Inst = --I; +    Instruction *Inst = &*--EndInst->getIterator();      if (!Inst->use_empty())        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); -    if (isa<LandingPadInst>(Inst)) { +    if (Inst->isEHPad()) {        EndInst = Inst;        continue;      } @@ -1568,11 +1576,11 @@ bool SCCP::runOnFunction(Function &F) {    SCCPSolver Solver(DL, TLI);    // Mark the first block of the function as being executable. -  Solver.MarkBlockExecutable(F.begin()); +  Solver.MarkBlockExecutable(&F.front());    // Mark all arguments to the function as being overdefined. -  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) -    Solver.markAnythingOverdefined(AI); +  for (Argument &AI : F.args()) +    Solver.markAnythingOverdefined(&AI);    // Solve for constants.    bool ResolvedUndefs = true; @@ -1589,8 +1597,8 @@ bool SCCP::runOnFunction(Function &F) {    // as we cannot modify the CFG of the function.    
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { -    if (!Solver.isBlockExecutable(BB)) { -      DeleteInstructionInBlock(BB); +    if (!Solver.isBlockExecutable(&*BB)) { +      DeleteInstructionInBlock(&*BB);        MadeChanges = true;        continue;      } @@ -1599,7 +1607,7 @@ bool SCCP::runOnFunction(Function &F) {      // constants if we have found them to be of constant values.      //      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { -      Instruction *Inst = BI++; +      Instruction *Inst = &*BI++;        if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))          continue; @@ -1713,36 +1721,34 @@ bool IPSCCP::runOnModule(Module &M) {      // If this is a strong or ODR definition of this function, then we can      // propagate information about its result into callsites of it.      if (!F->mayBeOverridden()) -      Solver.AddTrackedFunction(F); +      Solver.AddTrackedFunction(&*F);      // If this function only has direct calls that we can see, we can track its      // arguments and return value aggressively, and can assume it is not called      // unless we see evidence to the contrary.      if (F->hasLocalLinkage()) { -      if (AddressIsTaken(F)) -        AddressTakenFunctions.insert(F); +      if (AddressIsTaken(&*F)) +        AddressTakenFunctions.insert(&*F);        else { -        Solver.AddArgumentTrackedFunction(F); +        Solver.AddArgumentTrackedFunction(&*F);          continue;        }      }      // Assume the function is called. -    Solver.MarkBlockExecutable(F->begin()); +    Solver.MarkBlockExecutable(&F->front());      // Assume nothing about the incoming arguments. -    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); -         AI != E; ++AI) -      Solver.markAnythingOverdefined(AI); +    for (Argument &AI : F->args()) +      Solver.markAnythingOverdefined(&AI);    }    // Loop over global variables.  We inform the solver about any internal global    // variables that do not have their 'addresses taken'.  If they don't have    // their addresses taken, we can propagate constants through them. -  for (Module::global_iterator G = M.global_begin(), E = M.global_end(); -       G != E; ++G) -    if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) -      Solver.TrackValueOfGlobalVariable(G); +  for (GlobalVariable &G : M.globals()) +    if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) +      Solver.TrackValueOfGlobalVariable(&G);    // Solve for constants.    bool ResolvedUndefs = true; @@ -1763,7 +1769,10 @@ bool IPSCCP::runOnModule(Module &M) {    SmallVector<BasicBlock*, 512> BlocksToErase;    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { -    if (Solver.isBlockExecutable(F->begin())) { +    if (F->isDeclaration()) +      continue; + +    if (Solver.isBlockExecutable(&F->front())) {        for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();             AI != E; ++AI) {          if (AI->use_empty() || AI->getType()->isStructTy()) continue; @@ -1771,7 +1780,7 @@ bool IPSCCP::runOnModule(Module &M) {          // TODO: Could use getStructLatticeValueFor to find out if the entire          // result is a constant and replace it entirely if so. -        LatticeVal IV = Solver.getLatticeValueFor(AI); +        LatticeVal IV = Solver.getLatticeValueFor(&*AI);          if (IV.isOverdefined()) continue;          Constant *CST = IV.isConstant() ? 
@@ -1786,28 +1795,27 @@ bool IPSCCP::runOnModule(Module &M) {      }      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { -      if (!Solver.isBlockExecutable(BB)) { -        DeleteInstructionInBlock(BB); +      if (!Solver.isBlockExecutable(&*BB)) { +        DeleteInstructionInBlock(&*BB);          MadeChanges = true;          TerminatorInst *TI = BB->getTerminator(); -        for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { -          BasicBlock *Succ = TI->getSuccessor(i); +        for (BasicBlock *Succ : TI->successors()) {            if (!Succ->empty() && isa<PHINode>(Succ->begin())) -            TI->getSuccessor(i)->removePredecessor(BB); +            Succ->removePredecessor(&*BB);          }          if (!TI->use_empty())            TI->replaceAllUsesWith(UndefValue::get(TI->getType()));          TI->eraseFromParent(); -        new UnreachableInst(M.getContext(), BB); +        new UnreachableInst(M.getContext(), &*BB);          if (&*BB != &F->front()) -          BlocksToErase.push_back(BB); +          BlocksToErase.push_back(&*BB);          continue;        }        for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { -        Instruction *Inst = BI++; +        Instruction *Inst = &*BI++;          if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())            continue; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 947513a36572..a7361b5fe083 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,12 +23,12 @@  ///  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h"  #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/Loads.h"  #include "llvm/Analysis/PtrUseVisitor.h"  #include "llvm/Analysis/ValueTracking.h" @@ -37,8 +37,6 @@  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DebugInfo.h"  #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h"  #include "llvm/IR/IRBuilder.h"  #include "llvm/IR/InstVisitor.h"  #include "llvm/IR/Instructions.h" @@ -53,9 +51,9 @@  #include "llvm/Support/MathExtras.h"  #include "llvm/Support/TimeValue.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h"  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h"  #if __cplusplus >= 201103L && !defined(NDEBUG)  // We only use this for a debug check in C++11 @@ -63,6 +61,7 @@  #endif  using namespace llvm; +using namespace llvm::sroa;  #define DEBUG_TYPE "sroa" @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");  STATISTIC(NumDeleted, "Number of instructions deleted");  STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), -                                     cl::Hidden); -  /// Hidden option to enable randomly shuffling the slices to help uncover  /// instability in their order.  
static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -205,7 +199,6 @@ template <typename T> struct isPodLike;  template <> struct isPodLike<Slice> { static const bool value = true; };  } -namespace {  /// \brief Representation of the alloca slices.  ///  /// This class represents the slices of an alloca which are formed by its @@ -213,7 +206,7 @@ namespace {  /// for the slices used and we reflect that in this structure. The uses are  /// stored, sorted by increasing beginning offset and with unsplittable slices  /// starting at a particular offset before splittable slices. -class AllocaSlices { +class llvm::sroa::AllocaSlices {  public:    /// \brief Construct the slices of a particular alloca.    AllocaSlices(const DataLayout &DL, AllocaInst &AI); @@ -253,281 +246,10 @@ public:      std::inplace_merge(Slices.begin(), SliceI, Slices.end());    } -  // Forward declare an iterator to befriend it. +  // Forward declare the iterator and range accessor for walking the +  // partitions.    class partition_iterator; - -  /// \brief A partition of the slices. -  /// -  /// An ephemeral representation for a range of slices which can be viewed as -  /// a partition of the alloca. This range represents a span of the alloca's -  /// memory which cannot be split, and provides access to all of the slices -  /// overlapping some part of the partition. -  /// -  /// Objects of this type are produced by traversing the alloca's slices, but -  /// are only ephemeral and not persistent. -  class Partition { -  private: -    friend class AllocaSlices; -    friend class AllocaSlices::partition_iterator; - -    /// \brief The begining and ending offsets of the alloca for this partition. -    uint64_t BeginOffset, EndOffset; - -    /// \brief The start end end iterators of this partition. -    iterator SI, SJ; - -    /// \brief A collection of split slice tails overlapping the partition. -    SmallVector<Slice *, 4> SplitTails; - -    /// \brief Raw constructor builds an empty partition starting and ending at -    /// the given iterator. -    Partition(iterator SI) : SI(SI), SJ(SI) {} - -  public: -    /// \brief The start offset of this partition. -    /// -    /// All of the contained slices start at or after this offset. -    uint64_t beginOffset() const { return BeginOffset; } - -    /// \brief The end offset of this partition. -    /// -    /// All of the contained slices end at or before this offset. -    uint64_t endOffset() const { return EndOffset; } - -    /// \brief The size of the partition. -    /// -    /// Note that this can never be zero. -    uint64_t size() const { -      assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); -      return EndOffset - BeginOffset; -    } - -    /// \brief Test whether this partition contains no slices, and merely spans -    /// a region occupied by split slices. -    bool empty() const { return SI == SJ; } - -    /// \name Iterate slices that start within the partition. -    /// These may be splittable or unsplittable. They have a begin offset >= the -    /// partition begin offset. -    /// @{ -    // FIXME: We should probably define a "concat_iterator" helper and use that -    // to stitch together pointee_iterators over the split tails and the -    // contiguous iterators of the partition. That would give a much nicer -    // interface here. We could then additionally expose filtered iterators for -    // split, unsplit, and unsplittable splices based on the usage patterns. 
-    iterator begin() const { return SI; } -    iterator end() const { return SJ; } -    /// @} - -    /// \brief Get the sequence of split slice tails. -    /// -    /// These tails are of slices which start before this partition but are -    /// split and overlap into the partition. We accumulate these while forming -    /// partitions. -    ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } -  }; - -  /// \brief An iterator over partitions of the alloca's slices. -  /// -  /// This iterator implements the core algorithm for partitioning the alloca's -  /// slices. It is a forward iterator as we don't support backtracking for -  /// efficiency reasons, and re-use a single storage area to maintain the -  /// current set of split slices. -  /// -  /// It is templated on the slice iterator type to use so that it can operate -  /// with either const or non-const slice iterators. -  class partition_iterator -      : public iterator_facade_base<partition_iterator, -                                    std::forward_iterator_tag, Partition> { -    friend class AllocaSlices; - -    /// \brief Most of the state for walking the partitions is held in a class -    /// with a nice interface for examining them. -    Partition P; - -    /// \brief We need to keep the end of the slices to know when to stop. -    AllocaSlices::iterator SE; - -    /// \brief We also need to keep track of the maximum split end offset seen. -    /// FIXME: Do we really? -    uint64_t MaxSplitSliceEndOffset; - -    /// \brief Sets the partition to be empty at given iterator, and sets the -    /// end iterator. -    partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) -        : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { -      // If not already at the end, advance our state to form the initial -      // partition. -      if (SI != SE) -        advance(); -    } - -    /// \brief Advance the iterator to the next partition. -    /// -    /// Requires that the iterator not be at the end of the slices. -    void advance() { -      assert((P.SI != SE || !P.SplitTails.empty()) && -             "Cannot advance past the end of the slices!"); - -      // Clear out any split uses which have ended. -      if (!P.SplitTails.empty()) { -        if (P.EndOffset >= MaxSplitSliceEndOffset) { -          // If we've finished all splits, this is easy. -          P.SplitTails.clear(); -          MaxSplitSliceEndOffset = 0; -        } else { -          // Remove the uses which have ended in the prior partition. This -          // cannot change the max split slice end because we just checked that -          // the prior partition ended prior to that max. 
-          P.SplitTails.erase( -              std::remove_if( -                  P.SplitTails.begin(), P.SplitTails.end(), -                  [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), -              P.SplitTails.end()); -          assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), -                             [&](Slice *S) { -                               return S->endOffset() == MaxSplitSliceEndOffset; -                             }) && -                 "Could not find the current max split slice offset!"); -          assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), -                             [&](Slice *S) { -                               return S->endOffset() <= MaxSplitSliceEndOffset; -                             }) && -                 "Max split slice end offset is not actually the max!"); -        } -      } - -      // If P.SI is already at the end, then we've cleared the split tail and -      // now have an end iterator. -      if (P.SI == SE) { -        assert(P.SplitTails.empty() && "Failed to clear the split slices!"); -        return; -      } - -      // If we had a non-empty partition previously, set up the state for -      // subsequent partitions. -      if (P.SI != P.SJ) { -        // Accumulate all the splittable slices which started in the old -        // partition into the split list. -        for (Slice &S : P) -          if (S.isSplittable() && S.endOffset() > P.EndOffset) { -            P.SplitTails.push_back(&S); -            MaxSplitSliceEndOffset = -                std::max(S.endOffset(), MaxSplitSliceEndOffset); -          } - -        // Start from the end of the previous partition. -        P.SI = P.SJ; - -        // If P.SI is now at the end, we at most have a tail of split slices. -        if (P.SI == SE) { -          P.BeginOffset = P.EndOffset; -          P.EndOffset = MaxSplitSliceEndOffset; -          return; -        } - -        // If the we have split slices and the next slice is after a gap and is -        // not splittable immediately form an empty partition for the split -        // slices up until the next slice begins. -        if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && -            !P.SI->isSplittable()) { -          P.BeginOffset = P.EndOffset; -          P.EndOffset = P.SI->beginOffset(); -          return; -        } -      } - -      // OK, we need to consume new slices. Set the end offset based on the -      // current slice, and step SJ past it. The beginning offset of the -      // parttion is the beginning offset of the next slice unless we have -      // pre-existing split slices that are continuing, in which case we begin -      // at the prior end offset. -      P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; -      P.EndOffset = P.SI->endOffset(); -      ++P.SJ; - -      // There are two strategies to form a partition based on whether the -      // partition starts with an unsplittable slice or a splittable slice. -      if (!P.SI->isSplittable()) { -        // When we're forming an unsplittable region, it must always start at -        // the first slice and will extend through its end. -        assert(P.BeginOffset == P.SI->beginOffset()); - -        // Form a partition including all of the overlapping slices with this -        // unsplittable slice. 
-        while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { -          if (!P.SJ->isSplittable()) -            P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); -          ++P.SJ; -        } - -        // We have a partition across a set of overlapping unsplittable -        // partitions. -        return; -      } - -      // If we're starting with a splittable slice, then we need to form -      // a synthetic partition spanning it and any other overlapping splittable -      // splices. -      assert(P.SI->isSplittable() && "Forming a splittable partition!"); - -      // Collect all of the overlapping splittable slices. -      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && -             P.SJ->isSplittable()) { -        P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); -        ++P.SJ; -      } - -      // Back upiP.EndOffset if we ended the span early when encountering an -      // unsplittable slice. This synthesizes the early end offset of -      // a partition spanning only splittable slices. -      if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { -        assert(!P.SJ->isSplittable()); -        P.EndOffset = P.SJ->beginOffset(); -      } -    } - -  public: -    bool operator==(const partition_iterator &RHS) const { -      assert(SE == RHS.SE && -             "End iterators don't match between compared partition iterators!"); - -      // The observed positions of partitions is marked by the P.SI iterator and -      // the emptyness of the split slices. The latter is only relevant when -      // P.SI == SE, as the end iterator will additionally have an empty split -      // slices list, but the prior may have the same P.SI and a tail of split -      // slices. -      if (P.SI == RHS.P.SI && -          P.SplitTails.empty() == RHS.P.SplitTails.empty()) { -        assert(P.SJ == RHS.P.SJ && -               "Same set of slices formed two different sized partitions!"); -        assert(P.SplitTails.size() == RHS.P.SplitTails.size() && -               "Same slice position with differently sized non-empty split " -               "slice tails!"); -        return true; -      } -      return false; -    } - -    partition_iterator &operator++() { -      advance(); -      return *this; -    } - -    Partition &operator*() { return P; } -  }; - -  /// \brief A forward range over the partitions of the alloca's slices. -  /// -  /// This accesses an iterator range over the partitions of the alloca's -  /// slices. It computes these partitions on the fly based on the overlapping -  /// offsets of the slices and the ability to split them. It will visit "empty" -  /// partitions to cover regions of the alloca only accessed via split -  /// slices. -  iterator_range<partition_iterator> partitions() { -    return make_range(partition_iterator(begin(), end()), -                      partition_iterator(end(), end())); -  } +  iterator_range<partition_iterator> partitions();    /// \brief Access the dead users for this alloca.    ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -595,6 +317,280 @@ private:    /// the alloca.    SmallVector<Use *, 8> DeadOperands;  }; + +/// \brief A partition of the slices. +/// +/// An ephemeral representation for a range of slices which can be viewed as +/// a partition of the alloca. This range represents a span of the alloca's +/// memory which cannot be split, and provides access to all of the slices +/// overlapping some part of the partition. 
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+  friend class AllocaSlices;
+  friend class AllocaSlices::partition_iterator;
+
+  typedef AllocaSlices::iterator iterator;
+
+  /// \brief The beginning and ending offsets of the alloca for this
+  /// partition.
+  uint64_t BeginOffset, EndOffset;
+
+  /// \brief The start and end iterators of this partition.
+  iterator SI, SJ;
+
+  /// \brief A collection of split slice tails overlapping the partition.
+  SmallVector<Slice *, 4> SplitTails;
+
+  /// \brief Raw constructor builds an empty partition starting and ending at
+  /// the given iterator.
+  Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+  /// \brief The start offset of this partition.
+  ///
+  /// All of the contained slices start at or after this offset.
+  uint64_t beginOffset() const { return BeginOffset; }
+
+  /// \brief The end offset of this partition.
+  ///
+  /// All of the contained slices end at or before this offset.
+  uint64_t endOffset() const { return EndOffset; }
+
+  /// \brief The size of the partition.
+  ///
+  /// Note that this can never be zero.
+  uint64_t size() const {
+    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+    return EndOffset - BeginOffset;
+  }
+
+  /// \brief Test whether this partition contains no slices, and merely spans
+  /// a region occupied by split slices.
+  bool empty() const { return SI == SJ; }
+
+  /// \name Iterate slices that start within the partition.
+  /// These may be splittable or unsplittable. They have a begin offset >= the
+  /// partition begin offset.
+  /// @{
+  // FIXME: We should probably define a "concat_iterator" helper and use that
+  // to stitch together pointee_iterators over the split tails and the
+  // contiguous iterators of the partition. That would give a much nicer
+  // interface here. We could then additionally expose filtered iterators for
+  // split, unsplit, and unsplittable slices based on the usage patterns.
+  iterator begin() const { return SI; }
+  iterator end() const { return SJ; }
+  /// @}
+
+  /// \brief Get the sequence of split slice tails.
+  ///
+  /// These tails are of slices which start before this partition but are
+  /// split and overlap into the partition. We accumulate these while forming
+  /// partitions.
+  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// \brief An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is templated on the slice iterator type to use so that it can operate
+/// with either const or non-const slice iterators.
+class AllocaSlices::partition_iterator
+    : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+                                  Partition> {
+  friend class AllocaSlices;
+
+  /// \brief Most of the state for walking the partitions is held in a class
+  /// with a nice interface for examining them.
+  Partition P;
+
+  /// \brief We need to keep the end of the slices to know when to stop.
+  AllocaSlices::iterator SE;
+
+  /// \brief We also need to keep track of the maximum split end offset seen.
+  /// FIXME: Do we really?
+  uint64_t MaxSplitSliceEndOffset;
+
+  /// \brief Sets the partition to be empty at given iterator, and sets the
+  /// end iterator.
+  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+      : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+    // If not already at the end, advance our state to form the initial
+    // partition.
+    if (SI != SE)
+      advance();
+  }
+
+  /// \brief Advance the iterator to the next partition.
+  ///
+  /// Requires that the iterator not be at the end of the slices.
+  void advance() {
+    assert((P.SI != SE || !P.SplitTails.empty()) &&
+           "Cannot advance past the end of the slices!");
+
+    // Clear out any split uses which have ended.
+    if (!P.SplitTails.empty()) {
+      if (P.EndOffset >= MaxSplitSliceEndOffset) {
+        // If we've finished all splits, this is easy.
+        P.SplitTails.clear();
+        MaxSplitSliceEndOffset = 0;
+      } else {
+        // Remove the uses which have ended in the prior partition. This
+        // cannot change the max split slice end because we just checked that
+        // the prior partition ended prior to that max.
+        P.SplitTails.erase(
+            std::remove_if(
+                P.SplitTails.begin(), P.SplitTails.end(),
+                [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+            P.SplitTails.end());
+        assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+                           [&](Slice *S) {
+                             return S->endOffset() == MaxSplitSliceEndOffset;
+                           }) &&
+               "Could not find the current max split slice offset!");
+        assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+                           [&](Slice *S) {
+                             return S->endOffset() <= MaxSplitSliceEndOffset;
+                           }) &&
+               "Max split slice end offset is not actually the max!");
+      }
+    }
+
+    // If P.SI is already at the end, then we've cleared the split tail and
+    // now have an end iterator.
+    if (P.SI == SE) {
+      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+      return;
+    }
+
+    // If we had a non-empty partition previously, set up the state for
+    // subsequent partitions.
+    if (P.SI != P.SJ) {
+      // Accumulate all the splittable slices which started in the old
+      // partition into the split list.
+      for (Slice &S : P)
+        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+          P.SplitTails.push_back(&S);
+          MaxSplitSliceEndOffset =
+              std::max(S.endOffset(), MaxSplitSliceEndOffset);
+        }
+
+      // Start from the end of the previous partition.
+      P.SI = P.SJ;
+
+      // If P.SI is now at the end, we at most have a tail of split slices.
+      if (P.SI == SE) {
+        P.BeginOffset = P.EndOffset;
+        P.EndOffset = MaxSplitSliceEndOffset;
+        return;
+      }
+
+      // If we have split slices and the next slice is after a gap and is
+      // not splittable immediately form an empty partition for the split
+      // slices up until the next slice begins.
+      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+          !P.SI->isSplittable()) {
+        P.BeginOffset = P.EndOffset;
+        P.EndOffset = P.SI->beginOffset();
+        return;
+      }
+    }
+
+    // OK, we need to consume new slices. Set the end offset based on the
+    // current slice, and step SJ past it. The beginning offset of the
+    // partition is the beginning offset of the next slice unless we have
+    // pre-existing split slices that are continuing, in which case we begin
+    // at the prior end offset.
+    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+    P.EndOffset = P.SI->endOffset();
+    ++P.SJ;
+
+    // There are two strategies to form a partition based on whether the
+    // partition starts with an unsplittable slice or a splittable slice.
+    if (!P.SI->isSplittable()) {
+      // When we're forming an unsplittable region, it must always start at
+      // the first slice and will extend through its end.
+      assert(P.BeginOffset == P.SI->beginOffset());
+
+      // Form a partition including all of the overlapping slices with this
+      // unsplittable slice.
+      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+        if (!P.SJ->isSplittable())
+          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+        ++P.SJ;
+      }
+
+      // We have a partition across a set of overlapping unsplittable
+      // partitions.
+      return;
+    }
+
+    // If we're starting with a splittable slice, then we need to form
+    // a synthetic partition spanning it and any other overlapping splittable
+    // slices.
+    assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+    // Collect all of the overlapping splittable slices.
+    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+           P.SJ->isSplittable()) {
+      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+      ++P.SJ;
+    }
+
+    // Back up P.EndOffset if we ended the span early when encountering an
+    // unsplittable slice. This synthesizes the early end offset of
+    // a partition spanning only splittable slices.
+    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+      assert(!P.SJ->isSplittable());
+      P.EndOffset = P.SJ->beginOffset();
+    }
+  }
+
+public:
+  bool operator==(const partition_iterator &RHS) const {
+    assert(SE == RHS.SE &&
+           "End iterators don't match between compared partition iterators!");
+
+    // The observed positions of partitions are marked by the P.SI iterator and
+    // the emptiness of the split slices. The latter is only relevant when
+    // P.SI == SE, as the end iterator will additionally have an empty split
+    // slices list, but the prior may have the same P.SI and a tail of split
+    // slices.
+    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+      assert(P.SJ == RHS.P.SJ &&
+             "Same set of slices formed two different sized partitions!");
+      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+             "Same slice position with differently sized non-empty split "
+             "slice tails!");
+      return true;
+    }
+    return false;
+  }
+
+  partition_iterator &operator++() {
+    advance();
+    return *this;
+  }
+
+  Partition &operator*() { return P; }
+};
+
+/// \brief A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() { +  return make_range(partition_iterator(begin(), end()), +                    partition_iterator(end(), end()));  }  static Value *foldSelectInst(SelectInst &SI) { @@ -1072,217 +1068,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }  #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { -  AllocaInst &AI; -  DIBuilder &DIB; - -  SmallVector<DbgDeclareInst *, 4> DDIs; -  SmallVector<DbgValueInst *, 4> DVIs; - -public: -  AllocaPromoter(ArrayRef<const Instruction *> Insts, -                 SSAUpdater &S, -                 AllocaInst &AI, DIBuilder &DIB) -      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - -  void run(const SmallVectorImpl<Instruction *> &Insts) { -    // Retain the debug information attached to the alloca for use when -    // rewriting loads and stores. -    if (auto *L = LocalAsMetadata::getIfExists(&AI)) { -      if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) { -        for (User *U : DINode->users()) -          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) -            DDIs.push_back(DDI); -          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) -            DVIs.push_back(DVI); -      } -    } - -    LoadAndStorePromoter::run(Insts); - -    // While we have the debug information, clear it off of the alloca. The -    // caller takes care of deleting the alloca. -    while (!DDIs.empty()) -      DDIs.pop_back_val()->eraseFromParent(); -    while (!DVIs.empty()) -      DVIs.pop_back_val()->eraseFromParent(); -  } - -  bool -  isInstInList(Instruction *I, -               const SmallVectorImpl<Instruction *> &Insts) const override { -    Value *Ptr; -    if (LoadInst *LI = dyn_cast<LoadInst>(I)) -      Ptr = LI->getOperand(0); -    else -      Ptr = cast<StoreInst>(I)->getPointerOperand(); - -    // Only used to detect cycles, which will be rare and quickly found as -    // we're walking up a chain of defs rather than down through uses. -    SmallPtrSet<Value *, 4> Visited; - -    do { -      if (Ptr == &AI) -        return true; - -      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) -        Ptr = BCI->getOperand(0); -      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) -        Ptr = GEPI->getPointerOperand(); -      else -        return false; - -    } while (Visited.insert(Ptr).second); - -    return false; -  } - -  void updateDebugInfo(Instruction *Inst) const override { -    for (DbgDeclareInst *DDI : DDIs) -      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) -        ConvertDebugDeclareToDebugValue(DDI, SI, DIB); -      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) -        ConvertDebugDeclareToDebugValue(DDI, LI, DIB); -    for (DbgValueInst *DVI : DVIs) { -      Value *Arg = nullptr; -      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { -        // If an argument is zero extended then use argument directly. The ZExt -        // may be zapped by an optimization pass in future. 
-        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) -          Arg = dyn_cast<Argument>(ZExt->getOperand(0)); -        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) -          Arg = dyn_cast<Argument>(SExt->getOperand(0)); -        if (!Arg) -          Arg = SI->getValueOperand(); -      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { -        Arg = LI->getPointerOperand(); -      } else { -        continue; -      } -      DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), -                                  DVI->getExpression(), DVI->getDebugLoc(), -                                  Inst); -    } -  } -}; -} // end anon namespace - -namespace { -/// \brief An optimization pass providing Scalar Replacement of Aggregates. -/// -/// This pass takes allocations which can be completely analyzed (that is, they -/// don't escape) and tries to turn them into scalar SSA values. There are -/// a few steps to this process. -/// -/// 1) It takes allocations of aggregates and analyzes the ways in which they -///    are used to try to split them into smaller allocations, ideally of -///    a single scalar data type. It will split up memcpy and memset accesses -///    as necessary and try to isolate individual scalar accesses. -/// 2) It will transform accesses into forms which are suitable for SSA value -///    promotion. This can be replacing a memset with a scalar store of an -///    integer value, or it can involve speculating operations on a PHI or -///    select to be a PHI or select of the results. -/// 3) Finally, this will try to detect a pattern of accesses which map cleanly -///    onto insert and extract operations on a vector value, and convert them to -///    this form. By doing so, it will enable promotion of vector aggregates to -///    SSA vector values. -class SROA : public FunctionPass { -  const bool RequiresDomTree; - -  LLVMContext *C; -  DominatorTree *DT; -  AssumptionCache *AC; - -  /// \brief Worklist of alloca instructions to simplify. -  /// -  /// Each alloca in the function is added to this. Each new alloca formed gets -  /// added to it as well to recursively simplify unless that alloca can be -  /// directly promoted. Finally, each time we rewrite a use of an alloca other -  /// the one being actively rewritten, we add it back onto the list if not -  /// already present to ensure it is re-visited. -  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; - -  /// \brief A collection of instructions to delete. -  /// We try to batch deletions to simplify code and make things a bit more -  /// efficient. -  SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; - -  /// \brief Post-promotion worklist. -  /// -  /// Sometimes we discover an alloca which has a high probability of becoming -  /// viable for SROA after a round of promotion takes place. In those cases, -  /// the alloca is enqueued here for re-processing. -  /// -  /// Note that we have to be very careful to clear allocas out of this list in -  /// the event they are deleted. -  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; - -  /// \brief A collection of alloca instructions we can directly promote. -  std::vector<AllocaInst *> PromotableAllocas; - -  /// \brief A worklist of PHIs to speculate prior to promoting allocas. -  /// -  /// All of these PHIs have been checked for the safety of speculation and by -  /// being speculated will allow promoting allocas currently in the promotable -  /// queue. 
-  SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; - -  /// \brief A worklist of select instructions to speculate prior to promoting -  /// allocas. -  /// -  /// All of these select instructions have been checked for the safety of -  /// speculation and by being speculated will allow promoting allocas -  /// currently in the promotable queue. -  SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; - -public: -  SROA(bool RequiresDomTree = true) -      : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), -        DT(nullptr) { -    initializeSROAPass(*PassRegistry::getPassRegistry()); -  } -  bool runOnFunction(Function &F) override; -  void getAnalysisUsage(AnalysisUsage &AU) const override; - -  const char *getPassName() const override { return "SROA"; } -  static char ID; - -private: -  friend class PHIOrSelectSpeculator; -  friend class AllocaSliceRewriter; - -  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); -  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, -                               AllocaSlices::Partition &P); -  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); -  bool runOnAlloca(AllocaInst &AI); -  void clobberUse(Use &U); -  void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); -  bool promoteAllocas(Function &F); -}; -} - -char SROA::ID = 0; - -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { -  return new SROA(RequiresDomTree); -} - -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, -                      false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, -                    false) -  /// Walk the range of a partitioning looking for a common type to cover this  /// sequence of slices.  static Type *findCommonType(AllocaSlices::const_iterator B, @@ -1373,7 +1158,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {      // Ensure that there are no instructions between the PHI and the load that      // could store. -    for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) +    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)        if (BBI->mayWriteToMemory())          return false; @@ -1934,10 +1719,10 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,  /// \brief Test whether the given slice use can be promoted to a vector.  /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated  /// for a single slice. -static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, -                                            const Slice &S, VectorType *Ty, +static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, +                                            VectorType *Ty,                                              uint64_t ElementSize,                                              const DataLayout &DL) {    // First validate the slice offsets. @@ -2012,8 +1797,7 @@ static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,  /// SSA value. We only can ensure this for a limited set of operations, and we  /// don't want to do the rewrites unless we are confident that the result will  /// be promotable, so we have an early test here. 
-static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, -                                           const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {    // Collect the candidate types for vector-based promotion. Also track whether    // we have different element types.    SmallVector<VectorType *, 4> CandidateTys; @@ -2130,7 +1914,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,    uint64_t RelEnd = S.endOffset() - AllocBeginOffset;    // We can't reasonably handle cases where the load or store extends past -  // the end of the aloca's type and into its padding. +  // the end of the alloca's type and into its padding.    if (RelEnd > Size)      return false; @@ -2199,7 +1983,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,  /// This is a quick test to check whether we can rewrite the integer loads and  /// stores to a particular alloca into wider loads and stores and be able to  /// promote the resulting alloca. -static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, +static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,                                      const DataLayout &DL) {    uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);    // Don't create integer types larger than the maximum bitwidth. @@ -2368,14 +2152,14 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,    return V;  } -namespace {  /// \brief Visitor to rewrite instructions using p particular slice of an alloca  /// to use a new alloca.  ///  /// Also implements the rewriting to vector-based accesses when the partition  /// passes the isVectorPromotionViable predicate. Most of the rewriting logic  /// lives here. -class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { +class llvm::sroa::AllocaSliceRewriter +    : public InstVisitor<AllocaSliceRewriter, bool> {    // Befriend the base class so it can delegate to private visit methods.    friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;    typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; @@ -2583,9 +2367,19 @@ private:      V = convertValue(DL, IRB, V, IntTy);      assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; -    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) -      V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, -                         "extract"); +    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { +      IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); +      V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); +    } +    // It is possible that the extracted type is not the load type. This +    // happens if there is a load past the end of the alloca, and as +    // a consequence the slice is narrower but still a candidate for integer +    // lowering. To handle this case, we just zero extend the extracted +    // integer. 
+    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && +           "Can only handle an extract for an overly wide load"); +    if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) +      V = IRB.CreateZExt(V, LI.getType());      return V;    } @@ -2648,7 +2442,7 @@ private:                   DL.getTypeStoreSizeInBits(LI.getType()) &&               "Non-byte-multiple bit width");        // Move the insertion point just past the load so that we can refer to it. -      IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); +      IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));        // Create a placeholder value with the same type as LI to use as the        // basis for the new value. This allows us to replace the uses of LI with        // the computed value, and then replace the placeholder with LI, leaving @@ -3126,7 +2920,7 @@ private:      // dominate the PHI.      IRBuilderTy PtrBuilder(IRB);      if (isa<PHINode>(OldPtr)) -      PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); +      PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());      else        PtrBuilder.SetInsertPoint(OldPtr);      PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); @@ -3169,7 +2963,6 @@ private:      return true;    }  }; -}  namespace {  /// \brief Visitor to rewrite aggregate loads and stores as scalar. @@ -3181,8 +2974,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {    // Befriend the base class so it can delegate to private visit methods.    friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; -  const DataLayout &DL; -    /// Queue of pointer uses to analyze and potentially rewrite.    SmallVector<Use *, 8> Queue; @@ -3194,8 +2985,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {    Use *U;  public: -  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} -    /// Rewrite loads and stores through a pointer and all pointers derived from    /// it.    bool rewrite(Instruction &I) { @@ -3711,7 +3500,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {                         return true;                       }),        Stores.end()); -  // Now we have to go *back* through all te stores, because a later store may +  // Now we have to go *back* through all the stores, because a later store may    // have caused an earlier store's load to become unsplittable and if it is    // unsplittable for the later store, then we can't rely on it being split in    // the earlier store either. 
@@ -3773,7 +3562,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {             "Cannot represent alloca access size using 64-bit integers!");      Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); -    IRB.SetInsertPoint(BasicBlock::iterator(LI)); +    IRB.SetInsertPoint(LI);      DEBUG(dbgs() << "  Splitting load: " << *LI << "\n"); @@ -3825,7 +3614,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {        }        Value *StoreBasePtr = SI->getPointerOperand(); -      IRB.SetInsertPoint(BasicBlock::iterator(SI)); +      IRB.SetInsertPoint(SI);        DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n"); @@ -3914,7 +3703,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {        if (SplitLoads) {          PLoad = (*SplitLoads)[Idx];        } else { -        IRB.SetInsertPoint(BasicBlock::iterator(LI)); +        IRB.SetInsertPoint(LI);          PLoad = IRB.CreateAlignedLoad(              getAdjustedPtr(IRB, DL, LoadBasePtr,                             APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3924,7 +3713,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {        }        // And store this partition. -      IRB.SetInsertPoint(BasicBlock::iterator(SI)); +      IRB.SetInsertPoint(SI);        StoreInst *PStore = IRB.CreateAlignedStore(            PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,                                  APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3972,7 +3761,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {      // Mark the original store as dead now that we've split it up and kill its      // slice. Note that we leave the original load in place unless this store -    // was its ownly use. It may in turn be split up if it is an alloca load +    // was its only use. It may in turn be split up if it is an alloca load      // for some other alloca, but it may be a normal load. This may introduce      // redundant loads, but where those can be merged the rest of the optimizer      // should handle the merging, and this uncovers SSA splits which is more @@ -4024,7 +3813,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {  /// at enabling promotion and if it was successful queues the alloca to be  /// promoted.  AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, -                                   AllocaSlices::Partition &P) { +                                   Partition &P) {    // Try to compute a friendly type for this partition of the alloca. This    // won't always succeed, in which case we fall back to a legal integer type    // or an i8 array of an appropriate size. @@ -4230,12 +4019,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {        std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);    // Migrate debug information from the old alloca to the new alloca(s) -  // and the individial partitions. +  // and the individual partitions.    
if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {      auto *Var = DbgDecl->getVariable();      auto *Expr = DbgDecl->getExpression(); -    DIBuilder DIB(*AI.getParent()->getParent()->getParent(), -                  /*AllowUnresolved*/ false); +    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);      bool IsSplit = Pieces.size() > 1;      for (auto Piece : Pieces) {        // Create a piece expression describing the new partition or reuse AI's @@ -4308,7 +4096,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {    // First, split any FCA loads and stores touching this alloca to promote    // better splitting and promotion opportunities. -  AggLoadStoreRewriter AggRewriter(DL); +  AggLoadStoreRewriter AggRewriter;    Changed |= AggRewriter.rewrite(AI);    // Build the slices using a recursive instruction-visiting builder. @@ -4388,107 +4176,29 @@ void SROA::deleteDeadInstructions(    }  } -static void enqueueUsersInWorklist(Instruction &I, -                                   SmallVectorImpl<Instruction *> &Worklist, -                                   SmallPtrSetImpl<Instruction *> &Visited) { -  for (User *U : I.users()) -    if (Visited.insert(cast<Instruction>(U)).second) -      Worklist.push_back(cast<Instruction>(U)); -} -  /// \brief Promote the allocas, using the best available technique.  ///  /// This attempts to promote whatever allocas have been identified as viable in  /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred.  bool SROA::promoteAllocas(Function &F) {    if (PromotableAllocas.empty())      return false;    NumPromoted += PromotableAllocas.size(); -  if (DT && !ForceSSAUpdater) { -    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); -    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); -    PromotableAllocas.clear(); -    return true; -  } - -  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); -  SSAUpdater SSA; -  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); -  SmallVector<Instruction *, 64> Insts; - -  // We need a worklist to walk the uses of each alloca. -  SmallVector<Instruction *, 8> Worklist; -  SmallPtrSet<Instruction *, 8> Visited; -  SmallVector<Instruction *, 32> DeadInsts; - -  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { -    AllocaInst *AI = PromotableAllocas[Idx]; -    Insts.clear(); -    Worklist.clear(); -    Visited.clear(); - -    enqueueUsersInWorklist(*AI, Worklist, Visited); - -    while (!Worklist.empty()) { -      Instruction *I = Worklist.pop_back_val(); - -      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about -      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs -      // leading to them) here. Eventually it should use them to optimize the -      // scalar values produced. -      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { -        assert(II->getIntrinsicID() == Intrinsic::lifetime_start || -               II->getIntrinsicID() == Intrinsic::lifetime_end); -        II->eraseFromParent(); -        continue; -      } - -      // Push the loads and stores we find onto the list. 
SROA will already -      // have validated that all loads and stores are viable candidates for -      // promotion. -      if (LoadInst *LI = dyn_cast<LoadInst>(I)) { -        assert(LI->getType() == AI->getAllocatedType()); -        Insts.push_back(LI); -        continue; -      } -      if (StoreInst *SI = dyn_cast<StoreInst>(I)) { -        assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); -        Insts.push_back(SI); -        continue; -      } - -      // For everything else, we know that only no-op bitcasts and GEPs will -      // make it this far, just recurse through them and recall them for later -      // removal. -      DeadInsts.push_back(I); -      enqueueUsersInWorklist(*I, Worklist, Visited); -    } -    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); -    while (!DeadInsts.empty()) -      DeadInsts.pop_back_val()->eraseFromParent(); -    AI->eraseFromParent(); -  } - +  DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); +  PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);    PromotableAllocas.clear();    return true;  } -bool SROA::runOnFunction(Function &F) { -  if (skipOptnoneFunction(F)) -    return false; - +PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, +                                AssumptionCache &RunAC) {    DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");    C = &F.getContext(); -  DominatorTreeWrapperPass *DTWP = -      getAnalysisIfAvailable<DominatorTreeWrapperPass>(); -  DT = DTWP ? &DTWP->getDomTree() : nullptr; -  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); +  DT = &RunDT; +  AC = &RunAC;    BasicBlock &EntryBB = F.getEntryBlock();    for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4527,12 +4237,55 @@ bool SROA::runOnFunction(Function &F) {      PostPromotionWorklist.clear();    } while (!Worklist.empty()); -  return Changed; +  // FIXME: Even when promoting allocas we should preserve some abstract set of +  // CFG-specific analyses. +  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();  } -void SROA::getAnalysisUsage(AnalysisUsage &AU) const { -  AU.addRequired<AssumptionCacheTracker>(); -  if (RequiresDomTree) -    AU.addRequired<DominatorTreeWrapperPass>(); -  AU.setPreservesCFG(); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { +  return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), +                 AM->getResult<AssumptionAnalysis>(F));  } + +/// A legacy pass for the legacy pass manager that wraps the \c SROA pass. +/// +/// This is in the llvm namespace purely to allow it to be a friend of the \c +/// SROA pass. +class llvm::sroa::SROALegacyPass : public FunctionPass { +  /// The SROA implementation. 
+  SROA Impl; + +public: +  SROALegacyPass() : FunctionPass(ID) { +    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); +  } +  bool runOnFunction(Function &F) override { +    if (skipOptnoneFunction(F)) +      return false; + +    auto PA = Impl.runImpl( +        F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), +        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); +    return !PA.areAllPreserved(); +  } +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<AssumptionCacheTracker>(); +    AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>(); +    AU.setPreservesCFG(); +  } + +  const char *getPassName() const override { return "SROA"; } +  static char ID; +}; + +char SROALegacyPass::ID = 0; + +FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); } + +INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", +                      "Scalar Replacement Of Aggregates", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", +                    false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index d5d360571f88..52d477cc9573 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,7 +16,10 @@  #include "llvm/Transforms/Scalar.h"  #include "llvm-c/Initialization.h"  #include "llvm-c/Transforms/Scalar.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Verifier.h"  #include "llvm/InitializePasses.h" @@ -27,10 +30,9 @@ using namespace llvm;  /// initializeScalarOptsPasses - Initialize all passes linked into the  /// ScalarOpts library.  
void llvm::initializeScalarOpts(PassRegistry &Registry) { -  initializeADCEPass(Registry); +  initializeADCELegacyPassPass(Registry);    initializeBDCEPass(Registry);    initializeAlignmentFromAssumptionsPass(Registry); -  initializeSampleProfileLoaderPass(Registry);    initializeConstantHoistingPass(Registry);    initializeConstantPropagationPass(Registry);    initializeCorrelatedValuePropagationPass(Registry); @@ -66,7 +68,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {    initializeRewriteStatepointsForGCPass(Registry);    initializeSCCPPass(Registry);    initializeIPSCCPPass(Registry); -  initializeSROAPass(Registry); +  initializeSROALegacyPassPass(Registry);    initializeSROA_DTPass(Registry);    initializeSROA_SSAUpPass(Registry);    initializeCFGSimplifyPassPass(Registry); @@ -81,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {    initializePlaceSafepointsPass(Registry);    initializeFloat2IntPass(Registry);    initializeLoopDistributePass(Registry); +  initializeLoopLoadEliminationPass(Registry);  }  void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -225,15 +228,15 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {  }  void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { -  unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); +  unwrap(PM)->add(createTypeBasedAAWrapperPass());  }  void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { -  unwrap(PM)->add(createScopedNoAliasAAPass()); +  unwrap(PM)->add(createScopedNoAliasAAWrapperPass());  }  void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { -  unwrap(PM)->add(createBasicAliasAnalysisPass()); +  unwrap(PM)->add(createBasicAAWrapperPass());  }  void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d955da7ce75d..114d22ddf2e4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -60,6 +60,7 @@ STATISTIC(NumAdjusted,  "Number of scalar allocas adjusted to allow promotion");  STATISTIC(NumConverted, "Number of aggregates converted to scalar");  namespace { +#define SROA SROA_    struct SROA : public FunctionPass {      SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)        : FunctionPass(ID), HasDomTree(hasDT) { @@ -382,8 +383,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {      // Create and insert the integer alloca.      NewTy = IntegerType::get(AI->getContext(), BitWidth);    } -  AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", -                                     AI->getParent()->begin()); +  AllocaInst *NewAI = +      new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front());    ConvertUsesToScalar(AI, NewAI, 0, nullptr);    return NewAI;  } @@ -1195,7 +1196,7 @@ static bool isSafePHIToSpeculate(PHINode *PN) {      // Ensure that there are no instructions between the PHI and the load that      // could store. 
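The BasicBlock::iterator BBI(PN) form in the hunk below recurs throughout this patch: with the ilist iterator changes, an Instruction pointer no longer converts implicitly to an iterator, so the iterator is constructed explicitly and turned back into a pointer with &*. A minimal sketch of the idiom (hypothetical helper mirroring the loop that follows; assumes the iterator constructors available at this point in the tree):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Walks from From (inclusive) up to, but not including, To within one
    // block and reports whether anything in between may write to memory.
    static bool noStoreBetween(Instruction *From, Instruction *To) {
      for (BasicBlock::iterator BBI(From); &*BBI != To; ++BBI)
        if (BBI->mayWriteToMemory())
          return false;
      return true;
    }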
-    for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) +    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)        if (BBI->mayWriteToMemory())          return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 049300350857..054bacdc706b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -253,10 +253,10 @@ bool Scalarizer::doInitialization(Module &M) {  }  bool Scalarizer::runOnFunction(Function &F) { -  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { -    BasicBlock *BB = BBI; -    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { -      Instruction *I = II; +  assert(Gathered.empty() && Scattered.empty()); +  for (BasicBlock &BB : F) { +    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { +      Instruction *I = &*II;        bool Done = visit(I);        ++II;        if (Done && I->getType()->isVoidTy()) @@ -285,7 +285,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {    }    // In the fallback case, just put the scattered before Point and    // keep the result local to Point. -  return Scatterer(Point->getParent(), Point, V); +  return Scatterer(Point->getParent(), Point->getIterator(), V);  }  // Replace Op with the gathered form of the components in CV.  Defer the @@ -377,7 +377,7 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {      return false;    unsigned NumElems = VT->getNumElements(); -  IRBuilder<> Builder(I.getParent(), &I); +  IRBuilder<> Builder(&I);    Scatterer Op0 = scatter(&I, I.getOperand(0));    Scatterer Op1 = scatter(&I, I.getOperand(1));    assert(Op0.size() == NumElems && "Mismatched binary operation"); @@ -397,7 +397,7 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) {      return false;    unsigned NumElems = VT->getNumElements(); -  IRBuilder<> Builder(SI.getParent(), &SI); +  IRBuilder<> Builder(&SI);    Scatterer Op1 = scatter(&SI, SI.getOperand(1));    Scatterer Op2 = scatter(&SI, SI.getOperand(2));    assert(Op1.size() == NumElems && "Mismatched select"); @@ -438,7 +438,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {    if (!VT)      return false; -  IRBuilder<> Builder(GEPI.getParent(), &GEPI); +  IRBuilder<> Builder(&GEPI);    unsigned NumElems = VT->getNumElements();    unsigned NumIndices = GEPI.getNumIndices(); @@ -472,7 +472,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) {      return false;    unsigned NumElems = VT->getNumElements(); -  IRBuilder<> Builder(CI.getParent(), &CI); +  IRBuilder<> Builder(&CI);    Scatterer Op0 = scatter(&CI, CI.getOperand(0));    assert(Op0.size() == NumElems && "Mismatched cast");    ValueVector Res; @@ -492,7 +492,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {    unsigned DstNumElems = DstVT->getNumElements();    unsigned SrcNumElems = SrcVT->getNumElements(); -  IRBuilder<> Builder(BCI.getParent(), &BCI); +  IRBuilder<> Builder(&BCI);    Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));    ValueVector Res;    Res.resize(DstNumElems); @@ -569,7 +569,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) {      return false;    unsigned NumElems = VT->getNumElements(); -  IRBuilder<> Builder(PHI.getParent(), &PHI); +  IRBuilder<> Builder(&PHI);    ValueVector Res;    Res.resize(NumElems); @@ -600,7 +600,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) {      return false;    unsigned NumElems = 
Layout.VecTy->getNumElements(); -  IRBuilder<> Builder(LI.getParent(), &LI); +  IRBuilder<> Builder(&LI);    Scatterer Ptr = scatter(&LI, LI.getPointerOperand());    ValueVector Res;    Res.resize(NumElems); @@ -625,7 +625,7 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) {      return false;    unsigned NumElems = Layout.VecTy->getNumElements(); -  IRBuilder<> Builder(SI.getParent(), &SI); +  IRBuilder<> Builder(&SI);    Scatterer Ptr = scatter(&SI, SI.getPointerOperand());    Scatterer Val = scatter(&SI, FullValue); @@ -642,7 +642,9 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) {  // Delete the instructions that we scalarized.  If a full vector result  // is still needed, recreate it using InsertElements.  bool Scalarizer::finish() { -  if (Gathered.empty()) +  // The presence of data in Gathered or Scattered indicates changes +  // made to the Function. +  if (Gathered.empty() && Scattered.empty())      return false;    for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end();         GMI != GME; ++GMI) { @@ -655,7 +657,7 @@ bool Scalarizer::finish() {        Value *Res = UndefValue::get(Ty);        BasicBlock *BB = Op->getParent();        unsigned Count = Ty->getVectorNumElements(); -      IRBuilder<> Builder(BB, Op); +      IRBuilder<> Builder(Op);        if (isa<PHINode>(Op))          Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());        for (unsigned I = 0; I < Count; ++I) diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a875311881a..86a10d2a1612 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,10 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Constants.h" @@ -164,6 +168,7 @@  #include "llvm/IR/Instructions.h"  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h"  #include "llvm/IR/Operator.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/raw_ostream.h" @@ -174,6 +179,7 @@  #include "llvm/IR/IRBuilder.h"  using namespace llvm; +using namespace llvm::PatternMatch;  static cl::opt<bool> DisableSeparateConstOffsetFromGEP(      "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,8 +325,11 @@ public:    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addRequired<LoopInfoWrapperPass>();      AU.setPreservesCFG(); +    AU.addRequired<TargetLibraryInfoWrapperPass>();    }    bool doInitialization(Module &M) override { @@ -373,15 +382,42 @@ private:    ///    /// Verified in @i32_add in split-gep.ll    bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); +  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. +  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting +  /// the constant offset. After extraction, it becomes desirable to reunion the +  /// distributed sexts. 
For example, +  /// +  ///                              &a[sext(i +nsw (j +nsw 5)] +  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)] +  ///   => constant extraction     &a[sext(i) + sext(j)] + 5 +  ///   => reunion                 &a[sext(i +nsw j)] + 5 +  bool reuniteExts(Function &F); +  /// A helper that reunites sexts in an instruction. +  bool reuniteExts(Instruction *I); +  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>. +  Instruction *findClosestMatchingDominator(const SCEV *Key, +                                            Instruction *Dominatee);    /// Verify F is free of dead code.    void verifyNoDeadCode(Function &F); +  bool hasMoreThanOneUseInLoop(Value *v, Loop *L); +  // Swap the index operand of two GEP. +  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second); +  // Check if it is safe to swap operand of two GEP. +  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second, +                            Loop *CurLoop); +    const DataLayout *DL; -  const DominatorTree *DT; +  DominatorTree *DT; +  ScalarEvolution *SE;    const TargetMachine *TM; + +  LoopInfo *LI; +  TargetLibraryInfo *TLI;    /// Whether to lower a GEP with multiple indices into arithmetic operations or    /// multiple GEPs with a single index.    bool LowerGEP; +  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;  };  }  // anonymous namespace @@ -391,7 +427,10 @@ INITIALIZE_PASS_BEGIN(      "Split GEPs to a variadic base and a constant offset for better CSE", false,      false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_END(      SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",      "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -734,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(    Type *I8PtrTy =        Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());    Value *ResultPtr = Variadic->getOperand(0); +  Loop *L = LI->getLoopFor(Variadic->getParent()); +  // Check if the base is not loop invariant or used more than once. +  bool isSwapCandidate = +      L && L->isLoopInvariant(ResultPtr) && +      !hasMoreThanOneUseInLoop(ResultPtr, L); +  Value *FirstResult = nullptr; +    if (ResultPtr->getType() != I8PtrTy)      ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); @@ -762,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(        // Create an ugly GEP with a single index for each index.        ResultPtr =            Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); +      if (FirstResult == nullptr) +        FirstResult = ResultPtr;      }    } @@ -770,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(      Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);      ResultPtr =          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); -  } +  } else +    isSwapCandidate = false; + +  // If we created a GEP with constant index, and the base is loop invariant, +  // then we swap the first one with it, so LICM can move constant GEP out +  // later. 
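To make the hoisting argument concrete: the lowered address is computed as p + i + 256 by two chained single-index i8 GEPs, and exchanging their index operands yields p + 256 + i, so the constant GEP depends only on loop-invariant values and LICM can hoist it. A simplified sketch of the swap (hypothetical helper; the patch's swapGEPOperand further below additionally re-evaluates the inbounds flags):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static void swapSingleIndexGEPs(GetElementPtrInst *First,
                                    GetElementPtrInst *Second) {
      // Both are i8 GEPs with pointer-sized indices, so the exchange is
      // type-safe.
      Value *Idx1 = First->getOperand(1);
      Value *Idx2 = Second->getOperand(1);
      First->setOperand(1, Idx2);
      Second->setOperand(1, Idx1);
    }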
+  GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); +  GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); +  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) +    swapGEPOperand(FirstGEP, SecondGEP); +    if (ResultPtr->getType() != Variadic->getType())      ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -891,13 +949,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {    // Clear the inbounds attribute because the new index may be off-bound.    // e.g.,    // -  // b = add i64 a, 5 -  // addr = gep inbounds float* p, i64 b +  //   b     = add i64 a, 5 +  //   addr  = gep inbounds float, float* p, i64 b    //    // is transformed to:    // -  // addr2 = gep float* p, i64 a -  // addr = gep float* addr2, i64 5 +  //   addr2 = gep float, float* p, i64 a ; inbounds removed +  //   addr  = gep inbounds float, float* addr2, i64 5    //    // If a is -4, although the old index b is in bounds, the new index a is    // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -907,6 +965,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {    //    // TODO(jingyue): do some range analysis to keep as many inbounds as    // possible. GEPs with inbounds are more friendly to alias analysis. +  bool GEPWasInBounds = GEP->isInBounds();    GEP->setIsInBounds(false);    // Lowers a GEP to either GEPs with a single index or arithmetic operations. @@ -968,6 +1027,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {      NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,                                         ConstantInt::get(IntPtrTy, Index, true),                                         GEP->getName(), GEP); +    // Inherit the inbounds attribute of the original GEP. +    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);    } else {      // Unlikely but possible. For example,      // #pragma pack(1) @@ -990,6 +1051,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {          Type::getInt8Ty(GEP->getContext()), NewGEP,          ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",          GEP); +    // Inherit the inbounds attribute of the original GEP. +    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);      if (GEP->getType() != I8PtrTy)        NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);    } @@ -1008,24 +1071,96 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {      return false;    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();    bool Changed = false;    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { -    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { -      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { +    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) +      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))          Changed |= splitGEP(GEP); -      } -      // No need to split GEP ConstantExprs because all its indices are constant -      // already. -    } +    // No need to split GEP ConstantExprs because all its indices are constant +    // already.    
} +  Changed |= reuniteExts(F); +    if (VerifyNoDeadCode)      verifyNoDeadCode(F);    return Changed;  } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( +    const SCEV *Key, Instruction *Dominatee) { +  auto Pos = DominatingExprs.find(Key); +  if (Pos == DominatingExprs.end()) +    return nullptr; + +  auto &Candidates = Pos->second; +  // Because we process the basic blocks in pre-order of the dominator tree, a +  // candidate that doesn't dominate the current instruction won't dominate any +  // future instruction either. Therefore, we pop it out of the stack. This +  // optimization makes the algorithm O(n). +  while (!Candidates.empty()) { +    Instruction *Candidate = Candidates.back(); +    if (DT->dominates(Candidate, Dominatee)) +      return Candidate; +    Candidates.pop_back(); +  } +  return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { +  if (!SE->isSCEVable(I->getType())) +    return false; + +  //   Dom: LHS+RHS +  //   I: sext(LHS)+sext(RHS) +  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). +  // TODO: handle zext +  Value *LHS = nullptr, *RHS = nullptr; +  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || +      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { +    if (LHS->getType() == RHS->getType()) { +      const SCEV *Key = +          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); +      if (auto *Dom = findClosestMatchingDominator(Key, I)) { +        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); +        NewSExt->takeName(I); +        I->replaceAllUsesWith(NewSExt); +        RecursivelyDeleteTriviallyDeadInstructions(I); +        return true; +      } +    } +  } + +  // Add I to DominatingExprs if it's an add/sub that can't sign overflow. +  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || +      match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { +    if (isKnownNotFullPoison(I)) { +      const SCEV *Key = +          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); +      DominatingExprs[Key].push_back(I); +    } +  } +  return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { +  bool Changed = false; +  DominatingExprs.clear(); +  for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); +       Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { +    BasicBlock *BB = Node->getBlock(); +    for (auto I = BB->begin(); I != BB->end(); ) { +      Instruction *Cur = &*I++; +      Changed |= reuniteExts(Cur); +    } +  } +  return Changed; +} +  void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {    for (auto &B : F) {      for (auto &I : B) { @@ -1038,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {      }    }  } + +bool SeparateConstOffsetFromGEP::isLegalToSwapOperand( +    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) { +  if (!FirstGEP || !FirstGEP->hasOneUse()) +    return false; + +  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent()) +    return false; + +  if (FirstGEP == SecondGEP) +    return false; + +  unsigned FirstNum = FirstGEP->getNumOperands(); +  unsigned SecondNum = SecondGEP->getNumOperands(); +  // Give up if the number of operands are not 2. 
+  if (FirstNum != SecondNum || FirstNum != 2) +    return false; + +  Value *FirstBase = FirstGEP->getOperand(0); +  Value *SecondBase = SecondGEP->getOperand(0); +  Value *FirstOffset = FirstGEP->getOperand(1); +  // Give up if the index of the first GEP is loop invariant. +  if (CurLoop->isLoopInvariant(FirstOffset)) +    return false; + +  // Give up if base doesn't have same type. +  if (FirstBase->getType() != SecondBase->getType()) +    return false; + +  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset); + +  // Check if the second operand of first GEP has constant coefficient. +  // For an example, for the following code,  we won't gain anything by +  // hoisting the second GEP out because the second GEP can be folded away. +  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256 +  //   %67 = shl i64 %scevgep.sum.ur159, 2 +  //   %uglygep160 = getelementptr i8* %65, i64 %67 +  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024 + +  // Skip constant shift instruction which may be generated by Splitting GEPs. +  if (FirstOffsetDef && FirstOffsetDef->isShift() && +      isa<ConstantInt>(FirstOffsetDef->getOperand(1))) +    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0)); + +  // Give up if FirstOffsetDef is an Add or Sub with constant. +  // Because it may not profitable at all due to constant folding. +  if (FirstOffsetDef) +    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) { +      unsigned opc = BO->getOpcode(); +      if ((opc == Instruction::Add || opc == Instruction::Sub) && +          (isa<ConstantInt>(BO->getOperand(0)) || +           isa<ConstantInt>(BO->getOperand(1)))) +        return false; +    } +  return true; +} + +bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) { +  int UsesInLoop = 0; +  for (User *U : V->users()) { +    if (Instruction *User = dyn_cast<Instruction>(U)) +      if (L->contains(User)) +        if (++UsesInLoop > 1) +          return true; +  } +  return false; +} + +void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, +                                                GetElementPtrInst *Second) { +  Value *Offset1 = First->getOperand(1); +  Value *Offset2 = Second->getOperand(1); +  First->setOperand(1, Offset2); +  Second->setOperand(1, Offset1); + +  // We changed p+o+c to p+c+o, p+c may not be inbound anymore. 
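A worked example of the inbounds fix-up that follows, with hypothetical numbers: if the underlying object is 16 bytes, then after the swap the constant GEP computes p + c, so c == 8 stays within the object and inbounds can be kept, while c == 24 points past it and both GEPs must drop the flag.

    #include <cstdint>

    // Mirrors the Offset.ugt(ObjectSize) test below: keep inbounds only when
    // the accumulated constant offset does not run past the object.
    static bool constantGEPStaysInBounds(uint64_t ObjectSize,
                                         uint64_t AccumulatedOffset) {
      return AccumulatedOffset <= ObjectSize;
    }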
+  const DataLayout &DAL = First->getModule()->getDataLayout(); +  APInt Offset(DAL.getPointerSizeInBits( +                   cast<PointerType>(First->getType())->getAddressSpace()), +               0); +  Value *NewBase = +      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset); +  uint64_t ObjectSize; +  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || +     Offset.ugt(ObjectSize)) { +    First->setIsInBounds(false); +    Second->setIsInBounds(false); +  } else +    First->setIsInBounds(true); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 231411a16c05..63c8836bf381 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/IR/Attributes.h" @@ -67,15 +68,14 @@ static bool mergeEmptyReturnBlocks(Function &F) {      // single PHI node that is the operand to the return.      if (Ret != &BB.front()) {        // Check for something else in the block. -      BasicBlock::iterator I = Ret; +      BasicBlock::iterator I(Ret);        --I;        // Skip over debug info.        while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())          --I;        if (!isa<DbgInfoIntrinsic>(I) && -          (!isa<PHINode>(I) || I != BB.begin() || -           Ret->getNumOperands() == 0 || -           Ret->getOperand(0) != I)) +          (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 || +           Ret->getOperand(0) != &*I))          continue;      } @@ -136,7 +136,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,      // Loop over all of the basic blocks and remove them if they are unneeded.      
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { -      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { +      if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) {          LocalChange = true;          ++NumSimpl;        } @@ -217,6 +217,7 @@ struct CFGSimplifyPass : public FunctionPass {    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<AssumptionCacheTracker>();      AU.addRequired<TargetTransformInfoWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();    }  };  } diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index f49f4eaaedcb..64109b2df117 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -48,7 +48,7 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        AU.setPreservesCFG();        FunctionPass::getAnalysisUsage(AU); -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addRequired<DominatorTreeWrapperPass>();        AU.addRequired<LoopInfoWrapperPass>();        AU.addPreserved<DominatorTreeWrapperPass>(); @@ -66,7 +66,7 @@ char Sinking::ID = 0;  INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)  FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -99,7 +99,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,  bool Sinking::runOnFunction(Function &F) {    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  AA = &getAnalysis<AliasAnalysis>(); +  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();    bool MadeChange, EverMadeChange = false; @@ -119,7 +119,7 @@ bool Sinking::runOnFunction(Function &F) {  bool Sinking::ProcessBlock(BasicBlock &BB) {    // Can't sink anything out of a block that has less than two successors. -  if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; +  if (BB.getTerminator()->getNumSuccessors() <= 1) return false;    // Don't bother sinking code out of unreachable blocks. In addition to being    // unprofitable, it can also lead to infinite looping, because in an @@ -134,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {    bool ProcessedBegin = false;    SmallPtrSet<Instruction *, 8> Stores;    do { -    Instruction *Inst = I;  // The instruction to sink. +    Instruction *Inst = &*I; // The instruction to sink.      // Predecrement I (if it's not begin) so that it isn't invalidated by      // sinking. @@ -165,14 +165,16 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,    if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {      MemoryLocation Loc = MemoryLocation::get(L);      for (Instruction *S : Stores) -      if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod) +      if (AA->getModRefInfo(S, Loc) & MRI_Mod)          return false;    } -  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) +  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() || +      Inst->mayThrow())      return false; -  // Convergent operations can only be moved to control equivalent blocks. +  // Convergent operations cannot be made control-dependent on additional +  // values.    
if (auto CS = CallSite(Inst)) {      if (CS.hasFnAttr(Attribute::Convergent))        return false; @@ -193,6 +195,11 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,    if (Inst->getParent() == SuccToSinkTo)      return false; +  // It's never legal to sink an instruction into a block which terminates in an +  // EH-pad. +  if (SuccToSinkTo->getTerminator()->isExceptional()) +    return false; +    // If the block has multiple predecessors, this would introduce computation    // on different code paths.  We could split the critical edge, but for now we    // just punt. @@ -278,6 +285,6 @@ bool Sinking::SinkInstruction(Instruction *Inst,          dbgs() << ")\n");    // Move the instruction. -  Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); +  Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());    return true;  } diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ff3f00a2e2f8..147d615488ff 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,7 +227,7 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,      // changes the list that I is iterating through.      auto Current = I;      ++I; -    if (!NotHoisted.count(Current)) { +    if (!NotHoisted.count(&*Current)) {        Current->moveBefore(ToBlock.getTerminator());      }    } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417ef943..1faa65eb3417 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -131,7 +131,7 @@ public:    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<DominatorTreeWrapperPass>(); -    AU.addRequired<ScalarEvolution>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>();      // We do not modify the shape of the CFG.      AU.setPreservesCFG(); @@ -212,7 +212,7 @@ char StraightLineStrengthReduce::ID = 0;  INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",                        "Straight line strength reduction", false, false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",                      "Straight line strength reduction", false, false) @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,            Basis.CandidateKind == C.CandidateKind);  } +// TODO: use TTI->getGEPCost.  static bool isGEPFoldable(GetElementPtrInst *GEP,                            const TargetTransformInfo *TTI,                            const DataLayout *DL) { @@ -523,7 +524,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(        continue;      const SCEV *OrigIndexExpr = IndexExprs[I - 1]; -    IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); +    IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());      // The base of this candidate is GEP's base plus the offsets of all      // indices except this current one. 
@@ -689,7 +690,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  SE = &getAnalysis<ScalarEvolution>(); +  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();    // Traverse the dominator tree in the depth-first order. This order makes sure    // all bases of a candidate are in Candidates when we process it.    for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 4f23e20d251d..662513c7d8ae 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -358,13 +358,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {      BasicBlock *BB = N->getNodeAs<BasicBlock>();      BranchInst *Term = cast<BranchInst>(BB->getTerminator()); -    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { -      BasicBlock *Succ = Term->getSuccessor(i); - -      if (Visited.count(Succ)) { +    for (BasicBlock *Succ : Term->successors()) +      if (Visited.count(Succ))          Loops[Succ] = BB; -      } -    }    }  } @@ -903,14 +899,14 @@ void StructurizeCFG::rebuildSSA() {              continue;          } -        if (DT->dominates(II, User)) +        if (DT->dominates(&*II, User))            continue;          if (!Initialized) {            Value *Undef = UndefValue::get(II->getType());            Updater.Initialize(II->getType(), "");            Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); -          Updater.AddAvailableValue(BB, II); +          Updater.AddAvailableValue(BB, &*II);            Initialized = true;          }          Updater.RewriteUseAfterInsertions(U); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c7de2e2965c7..0e0b00df85bb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -54,6 +54,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/CaptureTracking.h"  #include "llvm/Analysis/InlineCost.h" @@ -136,6 +137,7 @@ FunctionPass *llvm::createTailCallEliminationPass() {  void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {    AU.addRequired<TargetTransformInfoWrapperPass>(); +  AU.addPreserved<GlobalsAAWrapperPass>();  }  /// \brief Scan the specified function for alloca instructions. 
@@ -195,8 +197,8 @@ struct AllocaDerivedValueTracker {        case Instruction::Call:        case Instruction::Invoke: {          CallSite CS(I); -        bool IsNocapture = !CS.isCallee(U) && -                           CS.doesNotCapture(CS.getArgumentNo(U)); +        bool IsNocapture = +            CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));          callUsesLocalStack(CS, IsNocapture);          if (IsNocapture) {            // If the alloca-derived argument is passed in as nocapture, then it @@ -302,7 +304,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {        if (!CI || CI->isTailCall())          continue; -      if (CI->doesNotAccessMemory()) { +      bool IsNoTail = CI->isNoTailCall(); + +      if (!IsNoTail && CI->doesNotAccessMemory()) {          // A call to a readnone function whose arguments are all things computed          // outside this function can be marked tail. Even if you stored the          // alloca address into a global, a readnone function can't load the @@ -330,7 +334,7 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {          }        } -      if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { +      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {          DeferredTails.push_back(CI);        } else {          AllCallsAreTailCalls = false; @@ -404,7 +408,7 @@ bool TailCallElim::runTRE(Function &F) {    // Until this is resolved, disable this transformation if that would ever    // happen.  This bug is PR962.    for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { -    BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. +    BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB.      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {        bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,                                            ArgumentPHIs, !CanTRETailMarkedCall); @@ -574,7 +578,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,    // Scan backwards from the return, checking to see if there is a tail call in    // this block.  If so, set CI to it.    CallInst *CI = nullptr; -  BasicBlock::iterator BBI = TI; +  BasicBlock::iterator BBI(TI);    while (true) {      CI = dyn_cast<CallInst>(BBI);      if (CI && CI->getCalledFunction() == F) @@ -595,9 +599,8 @@ TailCallElim::FindTRECandidate(Instruction *TI,    // and disable this xform in this case, because the code generator will    // lower the call to fabs into inline code.    if (BB == &F->getEntryBlock() && -      FirstNonDbg(BB->front()) == CI && -      FirstNonDbg(std::next(BB->begin())) == TI && -      CI->getCalledFunction() && +      FirstNonDbg(BB->front().getIterator()) == CI && +      FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&        !TTI->isLoweredToCall(CI->getCalledFunction())) {      // A single-block function with just a call and a return. Check that      // the arguments match. @@ -636,19 +639,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,    // tail call if all of the instructions between the call and the return are    // movable to above the call itself, leaving the call next to the return.    // Check that this is the case now. 
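A concrete example of the accumulator-recursion case handled below (sample code, not taken from the patch):

    // The multiply sits between the recursive call and the return, so it
    // cannot simply be moved above the call; but it is associative and
    // commutative, so tail recursion elimination introduces an accumulator
    // instead.
    static int factorial(int N) {
      return N <= 1 ? 1 : N * factorial(N - 1);
    }

    // Roughly what the function becomes after accumulator recursion
    // elimination: the accumulator takes the place of the pending multiplies.
    static int factorialLoop(int N) {
      int Acc = 1;
      for (; N > 1; --N)
        Acc = N * Acc;
      return Acc;
    }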
-  BasicBlock::iterator BBI = CI; +  BasicBlock::iterator BBI(CI);    for (++BBI; &*BBI != Ret; ++BBI) { -    if (CanMoveAboveCall(BBI, CI)) continue; +    if (CanMoveAboveCall(&*BBI, CI)) continue;      // If we can't move the instruction above the call, it might be because it      // is an associative and commutative operation that could be transformed      // using accumulator recursion elimination.  Check to see if this is the      // case, and if so, remember the initial accumulator value for later.      if ((AccumulatorRecursionEliminationInitVal = -                           CanTransformAccumulatorRecursion(BBI, CI))) { +             CanTransformAccumulatorRecursion(&*BBI, CI))) {        // Yes, this is accumulator recursion.  Remember which instruction        // accumulates. -      AccumulatorRecursionInstr = BBI; +      AccumulatorRecursionInstr = &*BBI;      } else {        return false;   // Otherwise, we cannot eliminate the tail recursion!      } @@ -698,19 +701,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,               NEBI = NewEntry->begin(); OEBI != E; )          if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))            if (isa<ConstantInt>(AI->getArraySize())) -            AI->moveBefore(NEBI); +            AI->moveBefore(&*NEBI);      // Now that we have created a new block, which jumps to the entry      // block, insert a PHI node for each argument of the function.      // For now, we initialize each PHI to only have the real arguments      // which are passed in. -    Instruction *InsertPos = OldEntry->begin(); +    Instruction *InsertPos = &OldEntry->front();      for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();           I != E; ++I) {        PHINode *PN = PHINode::Create(I->getType(), 2,                                      I->getName() + ".tr", InsertPos);        I->replaceAllUsesWith(PN); // Everyone use the PHI node now! -      PN->addIncoming(I, NewEntry); +      PN->addIncoming(&*I, NewEntry);        ArgumentPHIs.push_back(PN);      }    } @@ -739,10 +742,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,      Instruction *AccRecInstr = AccumulatorRecursionInstr;      // Start by inserting a new PHI node for the accumulator.      pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); -    PHINode *AccPN = -      PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), -                      std::distance(PB, PE) + 1, -                      "accumulator.tr", OldEntry->begin()); +    PHINode *AccPN = PHINode::Create( +        AccumulatorRecursionEliminationInitVal->getType(), +        std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front());      // Loop over all of the predecessors of the tail recursion block.  
For the      // real entry into the function we seed the PHI with the initial value, diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index 03c3a80170a3..409326eba401 100644 --- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -12,8 +12,8 @@  //===----------------------------------------------------------------------===//  #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"  #include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h"  #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h"  #include <algorithm>  namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index e9f62391a44f..0262358fa3d5 100644 --- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -52,32 +52,34 @@  // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators  //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DIBuilder.h"  #include "llvm/IR/DebugInfo.h"  #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Module.h"  #include "llvm/Pass.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h"  using namespace llvm;  #define DEBUG_TYPE "add-discriminators"  namespace { -  struct AddDiscriminators : public FunctionPass { -    static char ID; // Pass identification, replacement for typeid -    AddDiscriminators() : FunctionPass(ID) { -      initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); -    } +struct AddDiscriminators : public FunctionPass { +  static char ID; // Pass identification, replacement for typeid +  AddDiscriminators() : FunctionPass(ID) { +    initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); +  } -    bool runOnFunction(Function &F) override; -  }; +  bool runOnFunction(Function &F) override; +};  }  char AddDiscriminators::ID = 0; @@ -89,17 +91,17 @@ INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators",  // Command line option to disable discriminator generation even in the  // presence of debug information. This is only needed when debugging  // debug info generation issues. -static cl::opt<bool> -NoDiscriminators("no-discriminators", cl::init(false), -                 cl::desc("Disable generation of discriminator information.")); +static cl::opt<bool> NoDiscriminators( +    "no-discriminators", cl::init(false), +    cl::desc("Disable generation of discriminator information."));  FunctionPass *llvm::createAddDiscriminatorsPass() {    return new AddDiscriminators();  }  static bool hasDebugInfo(const Function &F) { -  NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); -  return CUNodes != nullptr; +  DISubprogram *S = getDISubprogram(&F); +  return S != nullptr;  }  /// \brief Assign DWARF discriminators. @@ -159,8 +161,7 @@ bool AddDiscriminators::runOnFunction(Function &F) {    // Simlarly, if the function has no debug info, do nothing.    
// Finally, if this module is built with dwarf versions earlier than 4,    // do nothing (discriminator support is a DWARF 4 feature). -  if (NoDiscriminators || -      !hasDebugInfo(F) || +  if (NoDiscriminators || !hasDebugInfo(F) ||        F.getParent()->getDwarfVersion() < 4)      return false; @@ -169,59 +170,77 @@ bool AddDiscriminators::runOnFunction(Function &F) {    LLVMContext &Ctx = M->getContext();    DIBuilder Builder(*M, /*AllowUnresolved*/ false); -  // Traverse all the blocks looking for instructions in different -  // blocks that are at the same file:line location. -  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { -    BasicBlock *B = I; -    TerminatorInst *Last = B->getTerminator(); -    const DILocation *LastDIL = Last->getDebugLoc(); -    if (!LastDIL) -      continue; - -    for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { -      BasicBlock *Succ = Last->getSuccessor(I); -      Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); -      const DILocation *FirstDIL = First->getDebugLoc(); -      if (!FirstDIL) +  typedef std::pair<StringRef, unsigned> Location; +  typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap; +  typedef DenseMap<Location, BBScopeMap> LocationBBMap; + +  LocationBBMap LBM; + +  // Traverse all instructions in the function. If the source line location +  // of the instruction appears in other basic block, assign a new +  // discriminator for this instruction. +  for (BasicBlock &B : F) { +    for (auto &I : B.getInstList()) { +      if (isa<DbgInfoIntrinsic>(&I)) +        continue; +      const DILocation *DIL = I.getDebugLoc(); +      if (!DIL) +        continue; +      Location L = std::make_pair(DIL->getFilename(), DIL->getLine()); +      auto &BBMap = LBM[L]; +      auto R = BBMap.insert(std::make_pair(&B, (Metadata *)nullptr)); +      if (BBMap.size() == 1) +        continue; +      bool InsertSuccess = R.second; +      Metadata *&NewScope = R.first->second; +      // If we could insert a different block in the same location, a +      // discriminator is needed to distinguish both instructions. +      if (InsertSuccess) { +        auto *Scope = DIL->getScope(); +        auto *File = +            Builder.createFile(DIL->getFilename(), Scope->getDirectory()); +        NewScope = Builder.createLexicalBlockFile( +            Scope, File, DIL->computeNewDiscriminator()); +      } +      I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(), +                                    NewScope, DIL->getInlinedAt())); +      DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" +                   << DIL->getColumn() << ":" +                   << dyn_cast<DILexicalBlockFile>(NewScope)->getDiscriminator() +                   << I << "\n"); +      Changed = true; +    } +  } + +  // Traverse all instructions and assign new discriminators to call +  // instructions with the same lineno that are in the same basic block. +  // Sample base profile needs to distinguish different function calls within +  // a same source line for correct profile annotation. 
+  for (BasicBlock &B : F) { +    const DILocation *FirstDIL = NULL; +    for (auto &I : B.getInstList()) { +      CallInst *Current = dyn_cast<CallInst>(&I); +      if (!Current || isa<DbgInfoIntrinsic>(&I))          continue; -      // If the first instruction (First) of Succ is at the same file -      // location as B's last instruction (Last), add a new -      // discriminator for First's location and all the instructions -      // in Succ that share the same location with First. -      if (!FirstDIL->canDiscriminate(*LastDIL)) { -        // Create a new lexical scope and compute a new discriminator -        // number for it. -        StringRef Filename = FirstDIL->getFilename(); -        auto *Scope = FirstDIL->getScope(); -        auto *File = Builder.createFile(Filename, Scope->getDirectory()); - -        // FIXME: Calculate the discriminator here, based on local information, -        // and delete DILocation::computeNewDiscriminator().  The current -        // solution gives different results depending on other modules in the -        // same context.  All we really need is to discriminate between -        // FirstDIL and LastDIL -- a local map would suffice. -        unsigned Discriminator = FirstDIL->computeNewDiscriminator(); -        auto *NewScope = -            Builder.createLexicalBlockFile(Scope, File, Discriminator); -        auto *NewDIL = -            DILocation::get(Ctx, FirstDIL->getLine(), FirstDIL->getColumn(), -                            NewScope, FirstDIL->getInlinedAt()); -        DebugLoc newDebugLoc = NewDIL; - -        // Attach this new debug location to First and every -        // instruction following First that shares the same location. -        for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; -             ++I1) { -          if (I1->getDebugLoc().get() != FirstDIL) -            break; -          I1->setDebugLoc(newDebugLoc); -          DEBUG(dbgs() << NewDIL->getFilename() << ":" << NewDIL->getLine() -                       << ":" << NewDIL->getColumn() << ":" -                       << NewDIL->getDiscriminator() << *I1 << "\n"); +      DILocation *CurrentDIL = Current->getDebugLoc(); +      if (FirstDIL) { +        if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() && +            CurrentDIL->getFilename() == FirstDIL->getFilename()) { +          auto *Scope = FirstDIL->getScope(); +          auto *File = Builder.createFile(FirstDIL->getFilename(), +                                          Scope->getDirectory()); +          auto *NewScope = Builder.createLexicalBlockFile( +              Scope, File, FirstDIL->computeNewDiscriminator()); +          Current->setDebugLoc(DILocation::get( +              Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope, +              CurrentDIL->getInlinedAt())); +          Changed = true; +        } else { +          FirstDIL = CurrentDIL;          } -        DEBUG(dbgs() << "\n"); -        Changed = true; +      } else { +        FirstDIL = CurrentDIL;        }      }    } diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index ef7dacac79cb..a5137e933e83 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -41,8 +41,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {    // Loop through all of our successors and make sure they know that one    // of their predecessors is going away. 
-  for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) -    BBTerm->getSuccessor(i)->removePredecessor(BB); +  for (BasicBlock *Succ : BBTerm->successors()) +    Succ->removePredecessor(BB);    // Zap all the instructions in the block.    while (!BB->empty()) { @@ -65,7 +65,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {  /// any single-entry PHI nodes in it, fold them away.  This handles the case  /// when all entries to the PHI nodes in a block are guaranteed equal, such as  /// when the block has exactly one predecessor. -void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,                                     MemoryDependenceAnalysis *MemDep) {    if (!isa<PHINode>(BB->begin())) return; @@ -77,8 +77,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA,      if (MemDep)        MemDep->removeInstruction(PN);  // Memdep updates AA itself. -    else if (AA && isa<PointerType>(PN->getType())) -      AA->deleteValue(PN);      PN->eraseFromParent();    } @@ -108,7 +106,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {  /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,  /// if possible.  The return value indicates success or failure.  bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, -                                     LoopInfo *LI, AliasAnalysis *AA, +                                     LoopInfo *LI,                                       MemoryDependenceAnalysis *MemDep) {    // Don't merge away blocks who have their address taken.    if (BB->hasAddressTaken()) return false; @@ -119,8 +117,9 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,    // Don't break self-loops.    if (PredBB == BB) return false; -  // Don't break invokes. -  if (isa<InvokeInst>(PredBB->getTerminator())) return false; +  // Don't break unwinding instructions. +  if (PredBB->getTerminator()->isExceptional()) +    return false;    succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));    BasicBlock *OnlySucc = BB; @@ -145,7 +144,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,    // Begin by getting rid of unneeded PHIs.    if (isa<PHINode>(BB->front())) -    FoldSingleEntryPHINodes(BB, AA, MemDep); +    FoldSingleEntryPHINodes(BB, MemDep);    // Delete the unconditional branch from the predecessor...    PredBB->getInstList().pop_back(); @@ -253,7 +252,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,      // block.      assert(SP == BB && "CFG broken");      SP = nullptr; -    return SplitBlock(Succ, Succ->begin(), DT, LI); +    return SplitBlock(Succ, &Succ->front(), DT, LI);    }    // Otherwise, if BB has a single successor, split it at the bottom of the @@ -284,8 +283,8 @@ llvm::SplitAllCriticalEdges(Function &F,  ///  BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,                               DominatorTree *DT, LoopInfo *LI) { -  BasicBlock::iterator SplitIt = SplitPt; -  while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) +  BasicBlock::iterator SplitIt = SplitPt->getIterator(); +  while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())      ++SplitIt;    BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); @@ -393,7 +392,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,  /// from NewBB. This also updates AliasAnalysis, if available.  
static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,                             ArrayRef<BasicBlock *> Preds, BranchInst *BI, -                           AliasAnalysis *AA, bool HasLoopExit) { +                           bool HasLoopExit) {    // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.    SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());    for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { @@ -474,17 +473,20 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,  ///  BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,                                           ArrayRef<BasicBlock *> Preds, -                                         const char *Suffix, AliasAnalysis *AA, -                                         DominatorTree *DT, LoopInfo *LI, -                                         bool PreserveLCSSA) { +                                         const char *Suffix, DominatorTree *DT, +                                         LoopInfo *LI, bool PreserveLCSSA) { +  // Do not attempt to split that which cannot be split. +  if (!BB->canSplitPredecessors()) +    return nullptr; +    // For the landingpads we need to act a bit differently.    // Delegate this work to the SplitLandingPadPredecessors.    if (BB->isLandingPad()) {      SmallVector<BasicBlock*, 2> NewBBs;      std::string NewName = std::string(Suffix) + ".split-lp"; -    SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), -                                NewBBs, AA, DT, LI, PreserveLCSSA); +    SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs, DT, +                                LI, PreserveLCSSA);      return NewBBs[0];    } @@ -523,7 +525,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,                              HasLoopExit);    // Update the PHI nodes in BB with the values coming from NewBB. -  UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); +  UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);    return NewBB;  } @@ -544,8 +546,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,                                         ArrayRef<BasicBlock *> Preds,                                         const char *Suffix1, const char *Suffix2,                                         SmallVectorImpl<BasicBlock *> &NewBBs, -                                       AliasAnalysis *AA, DominatorTree *DT, -                                       LoopInfo *LI, bool PreserveLCSSA) { +                                       DominatorTree *DT, LoopInfo *LI, +                                       bool PreserveLCSSA) {    assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");    // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -574,7 +576,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,                              HasLoopExit);    // Update the PHI nodes in OrigBB with the values coming from NewBB1. -  UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); +  UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);    // Move the remaining edges from OrigBB to point to NewBB2.    SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -611,7 +613,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,                                PreserveLCSSA, HasLoopExit);      // Update the PHI nodes in OrigBB with the values coming from NewBB2. 
-    UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); +    UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);    }    LandingPadInst *LPad = OrigBB->getLandingPadInst(); @@ -661,7 +663,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,        // return instruction.        V = BCI->getOperand(0);        NewBC = BCI->clone(); -      Pred->getInstList().insert(NewRet, NewBC); +      Pred->getInstList().insert(NewRet->getIterator(), NewBC);        *i = NewBC;      }      if (PHINode *PN = dyn_cast<PHINode>(V)) { @@ -707,7 +709,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,                                                  MDNode *BranchWeights,                                                  DominatorTree *DT) {    BasicBlock *Head = SplitBefore->getParent(); -  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); +  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());    TerminatorInst *HeadOldTerm = Head->getTerminator();    LLVMContext &C = Head->getContext();    BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); @@ -757,7 +759,7 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,                                           TerminatorInst **ElseTerm,                                           MDNode *BranchWeights) {    BasicBlock *Head = SplitBefore->getParent(); -  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); +  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());    TerminatorInst *HeadOldTerm = Head->getTerminator();    LLVMContext &C = Head->getContext();    BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 7e83c9eeceb7..95825991cee9 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -101,10 +101,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,          continue;      // Otherwise a new PHI is needed. Create one and populate it. -    PHINode *NewPN = -      PHINode::Create(PN->getType(), Preds.size(), "split", -                      SplitBB->isLandingPad() ? -                      SplitBB->begin() : SplitBB->getTerminator()); +    PHINode *NewPN = PHINode::Create( +        PN->getType(), Preds.size(), "split", +        SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());      for (unsigned i = 0, e = Preds.size(); i != e; ++i)        NewPN->addIncoming(V, Preds[i]); @@ -141,9 +140,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,    BasicBlock *TIBB = TI->getParent();    BasicBlock *DestBB = TI->getSuccessor(SuccNum); -  // Splitting the critical edge to a landing pad block is non-trivial. Don't do +  // Splitting the critical edge to a pad block is non-trivial. Don't do    // it in this generic function. -  if (DestBB->isLandingPad()) return nullptr; +  if (DestBB->isEHPad()) return nullptr;    // Create a new basic block, linking it into the CFG.    BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), @@ -157,7 +156,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,    // Insert the block into the function... right after the block TI lives in.    
Function &F = *TIBB->getParent(); -  Function::iterator FBBI = TIBB; +  Function::iterator FBBI = TIBB->getIterator();    F.getBasicBlockList().insert(++FBBI, NewBB);    // If there are any PHI nodes in DestBB, we need to update them so that they @@ -197,7 +196,6 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,    }    // If we have nothing to update, just return. -  auto *AA = Options.AA;    auto *DT = Options.DT;    auto *LI = Options.LI;    if (!DT && !LI) @@ -319,10 +317,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,            LoopPreds.push_back(P);          }          if (!LoopPreds.empty()) { -          assert(!DestBB->isLandingPad() && -                 "We don't split edges to landing pads!"); +          assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");            BasicBlock *NewExitBB = SplitBlockPredecessors( -              DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); +              DestBB, LoopPreds, "split", DT, LI, Options.PreserveLCSSA);            if (Options.PreserveLCSSA)              createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);          } diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 8aa7b2a65ba9..64b44a6b7919 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -13,6 +13,7 @@  #include "llvm/Transforms/Utils/BuildLibCalls.h"  #include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Function.h" @@ -21,7 +22,6 @@  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/Type.h" -#include "llvm/Analysis/TargetLibraryInfo.h"  using namespace llvm; @@ -55,32 +55,6 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,    return CI;  } -/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the -/// specified pointer.  Ptr is required to be some pointer type, MaxLen must -/// be of size_t type, and the return value has 'intptr_t' type. -Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, -                         const DataLayout &DL, const TargetLibraryInfo *TLI) { -  if (!TLI->has(LibFunc::strnlen)) -    return nullptr; - -  Module *M = B.GetInsertBlock()->getParent()->getParent(); -  AttributeSet AS[2]; -  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); -  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; -  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); - -  LLVMContext &Context = B.GetInsertBlock()->getContext(); -  Constant *StrNLen = -      M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS), -                             DL.getIntPtrType(Context), B.getInt8PtrTy(), -                             DL.getIntPtrType(Context), nullptr); -  CallInst *CI = B.CreateCall(StrNLen, {CastToCStr(Ptr, B), MaxLen}, "strnlen"); -  if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) -    CI->setCallingConv(F->getCallingConv()); - -  return CI; -} -  /// EmitStrChr - Emit a call to the strchr function to the builder, for the  /// specified pointer and character.  Ptr is required to be some pointer type,  /// and the return value has 'i8*' type. 
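The EmitStrNLen body deleted just above is representative of how the surviving emitters in BuildLibCalls.cpp are written: check availability through TargetLibraryInfo, build the prototype and attributes, then issue the call through the IRBuilder. As a point of reference only (not part of this patch), a minimal emitter following that same 3.8-era pattern might look like the sketch below. The helper name is hypothetical, the attribute and prototype choices simply mirror the removed hunk, and the code is written as if it lived in BuildLibCalls.cpp, so it reuses that file's includes and its local CastToCStr helper.

// Hypothetical emitter sketch, modeled on the EmitStrNLen body removed above:
// emit a call to a size_t(const char*)-style library routine named `Name`.
static Value *emitUnaryStringCall(StringRef Name, LibFunc::Func TheLibFunc,
                                  Value *Ptr, IRBuilder<> &B,
                                  const DataLayout &DL,
                                  const TargetLibraryInfo *TLI) {
  if (!TLI->has(TheLibFunc))
    return nullptr;                       // Target does not provide the routine.

  Module *M = B.GetInsertBlock()->getParent()->getParent();
  LLVMContext &Context = B.GetInsertBlock()->getContext();

  // Pointer argument does not escape; the call is readonly and nounwind.
  AttributeSet AS[2];
  AS[0] = AttributeSet::get(Context, 1, Attribute::NoCapture);
  Attribute::AttrKind AVs[2] = {Attribute::ReadOnly, Attribute::NoUnwind};
  AS[1] = AttributeSet::get(Context, AttributeSet::FunctionIndex, AVs);

  // size_t(i8*) prototype; the trailing nullptr ends the C-style type list.
  Constant *Callee = M->getOrInsertFunction(
      Name, AttributeSet::get(Context, AS), DL.getIntPtrType(Context),
      B.getInt8PtrTy(), nullptr);

  CallInst *CI = B.CreateCall(Callee, {CastToCStr(Ptr, B)}, Name);
  if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
    CI->setCallingConv(F->getCallingConv());
  return CI;
}

Later LLVM releases replace AttributeSet with AttributeList and the variadic, nullptr-terminated getOrInsertFunction with an overload returning FunctionCallee, but the overall flow (TLI availability check, prototype lookup, call creation, calling-convention fixup) stays the same.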
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index f2d5e0745035..0914699a2e38 100644 --- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -82,7 +82,7 @@ static bool insertFastDiv(Function &F,                            bool UseSignedOp,                            DivCacheTy &PerBBDivCache) {    // Get instruction operands -  Instruction *Instr = J; +  Instruction *Instr = &*J;    Value *Dividend = Instr->getOperand(0);    Value *Divisor = Instr->getOperand(1); @@ -94,7 +94,7 @@ static bool insertFastDiv(Function &F,    }    // Basic Block is split before divide -  BasicBlock *MainBB = I; +  BasicBlock *MainBB = &*I;    BasicBlock *SuccessorBB = I->splitBasicBlock(J);    ++I; //advance iterator I to successorBB @@ -190,7 +190,7 @@ static bool reuseOrInsertFastDiv(Function &F,                                   bool UseSignedOp,                                   DivCacheTy &PerBBDivCache) {    // Get instruction operands -  Instruction *Instr = J; +  Instruction *Instr = &*J;    DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));    DivCacheTy::iterator CacheI = PerBBDivCache.find(Key); diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index cc4d6c6fb192..854a3b855f54 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -52,8 +52,8 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,      if (II->hasName())        NewInst->setName(II->getName()+NameSuffix);      NewBB->getInstList().push_back(NewInst); -    VMap[II] = NewInst;                // Add instruction map to value. -     +    VMap[&*II] = NewInst; // Add instruction map to value. +      hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));      if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {        if (isa<ConstantInt>(AI->getArraySize())) @@ -85,9 +85,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,    assert(NameSuffix && "NameSuffix cannot be null!");  #ifndef NDEBUG -  for (Function::const_arg_iterator I = OldFunc->arg_begin(),  -       E = OldFunc->arg_end(); I != E; ++I) -    assert(VMap.count(I) && "No mapping from source argument specified!"); +  for (const Argument &I : OldFunc->args()) +    assert(VMap.count(&I) && "No mapping from source argument specified!");  #endif    // Copy all attributes other than those stored in the AttributeSet.  We need @@ -96,6 +95,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,    NewFunc->copyAttributesFrom(OldFunc);    NewFunc->setAttributes(NewAttrs); +  // Fix up the personality function that got copied over. +  if (OldFunc->hasPersonalityFn()) +    NewFunc->setPersonalityFn( +        MapValue(OldFunc->getPersonalityFn(), VMap, +                 ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, +                 TypeMapper, Materializer)); +    AttributeSet OldAttrs = OldFunc->getAttributes();    // Clone any argument attributes that are present in the VMap.    
for (const Argument &OldArg : OldFunc->args()) @@ -136,7 +142,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,      if (BB.hasAddressTaken()) {        Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),                                                const_cast<BasicBlock*>(&BB)); -      VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);                                          +      VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);      }      // Note return instructions for the caller. @@ -146,11 +152,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,    // Loop over all of the instructions in the function, fixing up operand    // references as we go.  This uses VMap to do all the hard work. -  for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), -         BE = NewFunc->end(); BB != BE; ++BB) +  for (Function::iterator BB = +           cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(), +                          BE = NewFunc->end(); +       BB != BE; ++BB)      // Loop over all instructions, fixing each one as we find it... -    for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) -      RemapInstruction(II, VMap, +    for (Instruction &II : *BB) +      RemapInstruction(&II, VMap,                         ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,                         TypeMapper, Materializer);  } @@ -187,11 +195,9 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc,    const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder);    if (!OldSubprogramMDNode) return; -  // Ensure that OldFunc appears in the map. -  // (if it's already there it must point to NewFunc anyway) -  VMap[OldFunc] = NewFunc;    auto *NewSubprogram =        cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); +  NewFunc->setSubprogram(NewSubprogram);    for (auto *CU : Finder.compile_units()) {      auto Subprograms = CU->getSubprograms(); @@ -222,10 +228,9 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,    // The user might be deleting arguments to the function by specifying them in    // the VMap.  If so, we need to not add the arguments to the arg ty vector    // -  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); -       I != E; ++I) -    if (VMap.count(I) == 0)  // Haven't mapped the argument to anything yet? -      ArgTypes.push_back(I->getType()); +  for (const Argument &I : F->args()) +    if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet? +      ArgTypes.push_back(I.getType());    // Create a new function type...    FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), @@ -236,11 +241,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,    // Loop over the arguments, copying the names of the mapped arguments over...    Function::arg_iterator DestI = NewF->arg_begin(); -  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); -       I != E; ++I) -    if (VMap.count(I) == 0) {   // Is this argument preserved? -      DestI->setName(I->getName()); // Copy the name over... -      VMap[I] = DestI++;        // Add mapping to VMap +  for (const Argument & I : F->args()) +    if (VMap.count(&I) == 0) {     // Is this argument preserved? +      DestI->setName(I.getName()); // Copy the name over... 
+      VMap[&I] = &*DestI++;        // Add mapping to VMap      }    if (ModuleLevelChanges) @@ -330,8 +334,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,         II != IE; ++II) {      // If the "Director" remaps the instruction, don't clone it.      if (Director) { -      CloningDirector::CloningAction Action  -                              = Director->handleInstruction(VMap, II, NewBB); +      CloningDirector::CloningAction Action = +          Director->handleInstruction(VMap, &*II, NewBB);        // If the cloning director says stop, we want to stop everything, not        // just break out of the loop (which would cause the terminator to be        // cloned).  The cloning director is responsible for inserting a proper @@ -365,7 +369,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,          if (Value *MappedV = VMap.lookup(V))            V = MappedV; -        VMap[II] = V; +        VMap[&*II] = V;          delete NewInst;          continue;        } @@ -373,9 +377,15 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,      if (II->hasName())        NewInst->setName(II->getName()+NameSuffix); -    VMap[II] = NewInst;                // Add instruction map to value. +    VMap[&*II] = NewInst; // Add instruction map to value.      NewBB->getInstList().push_back(NewInst);      hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + +    if (CodeInfo) +      if (auto CS = ImmutableCallSite(&*II)) +        if (CS.hasOperandBundles()) +          CodeInfo->OperandBundleCallSites.push_back(NewInst); +      if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {        if (isa<ConstantInt>(AI->getArraySize()))          hasStaticAllocas = true; @@ -400,8 +410,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,        // If the director says to skip with a terminate instruction, we still        // need to clone this block's successors.        const TerminatorInst *TI = NewBB->getTerminator(); -      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) -        ToClone.push_back(TI->getSuccessor(i)); +      for (const BasicBlock *Succ : TI->successors()) +        ToClone.push_back(Succ);        return;      }      assert(Action != CloningDirector::SkipInstruction &&  @@ -447,11 +457,16 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,        NewInst->setName(OldTI->getName()+NameSuffix);      NewBB->getInstList().push_back(NewInst);      VMap[OldTI] = NewInst;             // Add instruction map to value. -     + +    if (CodeInfo) +      if (auto CS = ImmutableCallSite(OldTI)) +        if (CS.hasOperandBundles()) +          CodeInfo->OperandBundleCallSites.push_back(NewInst); +      // Recursively clone any reachable successor blocks.      const TerminatorInst *TI = BB->getTerminator(); -    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) -      ToClone.push_back(TI->getSuccessor(i)); +    for (const BasicBlock *Succ : TI->successors()) +      ToClone.push_back(Succ);    }    if (CodeInfo) { @@ -484,12 +499,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,    }  #ifndef NDEBUG -  // If the cloning starts at the begining of the function, verify that +  // If the cloning starts at the beginning of the function, verify that    // the function arguments are mapped.    
if (!StartingInst) -    for (Function::const_arg_iterator II = OldFunc->arg_begin(), -         E = OldFunc->arg_end(); II != E; ++II) -      assert(VMap.count(II) && "No mapping from source argument specified!"); +    for (const Argument &II : OldFunc->args()) +      assert(VMap.count(&II) && "No mapping from source argument specified!");  #endif    PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, @@ -499,12 +513,12 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      StartingBB = StartingInst->getParent();    else {      StartingBB = &OldFunc->getEntryBlock(); -    StartingInst = StartingBB->begin(); +    StartingInst = &StartingBB->front();    }    // Clone the entry block, and anything recursively reachable from it.    std::vector<const BasicBlock*> CloneWorklist; -  PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist); +  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);    while (!CloneWorklist.empty()) {      const BasicBlock *BB = CloneWorklist.back();      CloneWorklist.pop_back(); @@ -517,9 +531,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,    //    // Defer PHI resolution until rest of function is resolved.    SmallVector<const PHINode*, 16> PHIToResolve; -  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); -       BI != BE; ++BI) { -    Value *V = VMap[BI]; +  for (const BasicBlock &BI : *OldFunc) { +    Value *V = VMap[&BI];      BasicBlock *NewBB = cast_or_null<BasicBlock>(V);      if (!NewBB) continue;  // Dead block. @@ -528,7 +541,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      // Handle PHI nodes specially, as we have to remove references to dead      // blocks. -    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) { +    for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) {        // PHI nodes may have been remapped to non-PHI nodes by the caller or        // during the cloning process.        if (const PHINode *PN = dyn_cast<PHINode>(I)) { @@ -621,8 +634,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,        while ((PN = dyn_cast<PHINode>(I++))) {          Value *NV = UndefValue::get(PN->getType());          PN->replaceAllUsesWith(NV); -        assert(VMap[OldI] == PN && "VMap mismatch"); -        VMap[OldI] = NV; +        assert(VMap[&*OldI] == PN && "VMap mismatch"); +        VMap[&*OldI] = NV;          PN->eraseFromParent();          ++OldI;        } @@ -644,15 +657,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,    // and zap unconditional fall-through branches. This happens all the time when    // specializing code: code specialization turns conditional branches into    // uncond branches, and this code folds them. -  Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); +  Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();    Function::iterator I = Begin;    while (I != NewFunc->end()) {      // Check if this block has become dead during inlining or other      // simplifications. Note that the first block will appear dead, as it has      // not yet been wired up properly. 
-    if (I != Begin && (pred_begin(I) == pred_end(I) || -                       I->getSinglePredecessor() == I)) { -      BasicBlock *DeadBB = I++; +    if (I != Begin && (pred_begin(&*I) == pred_end(&*I) || +                       I->getSinglePredecessor() == &*I)) { +      BasicBlock *DeadBB = &*I++;        DeleteDeadBlock(DeadBB);        continue;      } @@ -662,7 +675,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      // simplification required looking through PHI nodes, those are only      // available after forming the full basic block. That may leave some here,      // and we still want to prune the dead code as early as possible. -    ConstantFoldTerminator(I); +    ConstantFoldTerminator(&*I);      BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());      if (!BI || BI->isConditional()) { ++I; continue; } @@ -681,7 +694,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      BI->eraseFromParent();      // Make all PHI nodes that referred to Dest now refer to I as their source. -    Dest->replaceAllUsesWith(I); +    Dest->replaceAllUsesWith(&*I);      // Move all the instructions in the succ to the pred.      I->getInstList().splice(I->end(), Dest->getInstList()); @@ -695,7 +708,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,    // Make a final pass over the basic blocks from the old function to gather    // any return instructions which survived folding. We have to do this here    // because we can iteratively remove and merge returns above. -  for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), +  for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),                            E = NewFunc->end();         I != E; ++I)      if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) @@ -717,7 +730,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,                                       const char *NameSuffix,                                        ClonedCodeInfo *CodeInfo,                                       Instruction *TheCall) { -  CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap, +  CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,                              ModuleLevelChanges, Returns, NameSuffix, CodeInfo,                              nullptr);  } @@ -780,9 +793,10 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,    }    // Move them physically from the end of the block list. -  F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); -  F->getBasicBlockList().splice(Before, F->getBasicBlockList(), -                                NewLoop->getHeader(), F->end()); +  F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), +                                NewPH); +  F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), +                                NewLoop->getHeader()->getIterator(), F->end());    return NewLoop;  } diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index 61f1811e7b4a..ab083353ece6 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -20,21 +20,28 @@  #include "llvm-c/Core.h"  using namespace llvm; -/// CloneModule - Return an exact copy of the specified module.  
This is not as -/// easy as it might seem because we have to worry about making copies of global -/// variables and functions, and making their (initializers and references, -/// respectively) refer to the right globals. +/// This is not as easy as it might seem because we have to worry about making +/// copies of global variables and functions, and making their (initializers and +/// references, respectively) refer to the right globals.  /// -Module *llvm::CloneModule(const Module *M) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M) {    // Create the value map that maps things from the old module over to the new    // module.    ValueToValueMapTy VMap;    return CloneModule(M, VMap);  } -Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M, +                                          ValueToValueMapTy &VMap) { +  return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; }); +} + +std::unique_ptr<Module> llvm::CloneModule( +    const Module *M, ValueToValueMapTy &VMap, +    std::function<bool(const GlobalValue *)> ShouldCloneDefinition) {    // First off, we need to create the new module. -  Module *New = new Module(M->getModuleIdentifier(), M->getContext()); +  std::unique_ptr<Module> New = +      llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());    New->setDataLayout(M->getDataLayout());    New->setTargetTriple(M->getTargetTriple());    New->setModuleInlineAsm(M->getModuleInlineAsm()); @@ -52,26 +59,48 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {                                              (GlobalVariable*) nullptr,                                              I->getThreadLocalMode(),                                              I->getType()->getAddressSpace()); -    GV->copyAttributesFrom(I); -    VMap[I] = GV; +    GV->copyAttributesFrom(&*I); +    VMap[&*I] = GV;    }    // Loop over the functions in the module, making external functions as before    for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {      Function *NF = -      Function::Create(cast<FunctionType>(I->getType()->getElementType()), -                       I->getLinkage(), I->getName(), New); -    NF->copyAttributesFrom(I); -    VMap[I] = NF; +        Function::Create(cast<FunctionType>(I->getType()->getElementType()), +                         I->getLinkage(), I->getName(), New.get()); +    NF->copyAttributesFrom(&*I); +    VMap[&*I] = NF;    }    // Loop over the aliases in the module    for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();         I != E; ++I) { -    auto *PTy = cast<PointerType>(I->getType()); -    auto *GA = GlobalAlias::create(PTy, I->getLinkage(), I->getName(), New); -    GA->copyAttributesFrom(I); -    VMap[I] = GA; +    if (!ShouldCloneDefinition(&*I)) { +      // An alias cannot act as an external reference, so we need to create +      // either a function or a global variable depending on the value type. +      // FIXME: Once pointee types are gone we can probably pick one or the +      // other. 
+      GlobalValue *GV; +      if (I->getValueType()->isFunctionTy()) +        GV = Function::Create(cast<FunctionType>(I->getValueType()), +                              GlobalValue::ExternalLinkage, I->getName(), +                              New.get()); +      else +        GV = new GlobalVariable( +            *New, I->getValueType(), false, GlobalValue::ExternalLinkage, +            (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, +            I->getThreadLocalMode(), I->getType()->getAddressSpace()); +      VMap[&*I] = GV; +      // We do not copy attributes (mainly because copying between different +      // kinds of globals is forbidden), but this is generally not required for +      // correctness. +      continue; +    } +    auto *GA = GlobalAlias::create(I->getValueType(), +                                   I->getType()->getPointerAddressSpace(), +                                   I->getLinkage(), I->getName(), New.get()); +    GA->copyAttributesFrom(&*I); +    VMap[&*I] = GA;    }    // Now that all of the things that global variable initializer can refer to @@ -80,7 +109,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {    //    for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();         I != E; ++I) { -    GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); +    GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]); +    if (!ShouldCloneDefinition(&*I)) { +      // Skip after setting the correct linkage for an external reference. +      GV->setLinkage(GlobalValue::ExternalLinkage); +      continue; +    }      if (I->hasInitializer())        GV->setInitializer(MapValue(I->getInitializer(), VMap));    } @@ -88,18 +122,22 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {    // Similarly, copy over function bodies now...    //    for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { -    Function *F = cast<Function>(VMap[I]); +    Function *F = cast<Function>(VMap[&*I]); +    if (!ShouldCloneDefinition(&*I)) { +      // Skip after setting the correct linkage for an external reference. +      F->setLinkage(GlobalValue::ExternalLinkage); +      continue; +    }      if (!I->isDeclaration()) {        Function::arg_iterator DestI = F->arg_begin();        for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();             ++J) {          DestI->setName(J->getName()); -        VMap[J] = DestI++; +        VMap[&*J] = &*DestI++;        }        SmallVector<ReturnInst*, 8> Returns;  // Ignore returns cloned. -      CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); - +      CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns);      }      if (I->hasPersonalityFn()) @@ -109,7 +147,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {    // And aliases    for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();         I != E; ++I) { -    GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); +    // We already dealt with undefined aliases above. 
+    if (!ShouldCloneDefinition(&*I)) +      continue; +    GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);      if (const Constant *C = I->getAliasee())        GA->setAliasee(MapValue(C, VMap));    } @@ -129,7 +170,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {  extern "C" {  LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { -  return wrap(CloneModule(unwrap(M))); +  return wrap(CloneModule(unwrap(M)).release());  }  } diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index ab89b41f6788..823696d88e65 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -51,7 +51,7 @@ AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,  /// \brief Test whether a block is valid for extraction.  static bool isBlockValidForExtraction(const BasicBlock &BB) {    // Landing pads must be in the function where they were inserted for cleanup. -  if (BB.isLandingPad()) +  if (BB.isEHPad())      return false;    // Don't hoist code containing allocas, invokes, or vastarts. @@ -175,7 +175,7 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs,        for (User *U : II->users())          if (!definedInRegion(Blocks, U)) { -          Outputs.insert(II); +          Outputs.insert(&*II);            break;          }      } @@ -211,7 +211,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {    // containing PHI nodes merging values from outside of the region, and a    // second that contains all of the code for the block and merges back any    // incoming values from inside of the region. -  BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI(); +  BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator();    BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,                                                Header->getName()+".ce"); @@ -246,7 +246,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {        // Create a new PHI node in the new region, which has an incoming value        // from OldPred of PN.        PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion, -                                       PN->getName()+".ce", NewBB->begin()); +                                       PN->getName() + ".ce", &NewBB->front());        NewPN->addIncoming(PN, OldPred);        // Loop over all of the incoming value in PN, moving them to NewPN if they @@ -266,7 +266,8 @@ void CodeExtractor::splitReturnBlocks() {    for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();         I != E; ++I)      if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { -      BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); +      BasicBlock *New = +          (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret");        if (DT) {          // Old dominates New. New node dominates all other nodes dominated          // by Old. 
@@ -365,10 +366,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,        Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);        TerminatorInst *TI = newFunction->begin()->getTerminator();        GetElementPtrInst *GEP = GetElementPtrInst::Create( -          StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI); +          StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);        RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);      } else -      RewriteVal = AI++; +      RewriteVal = &*AI++;      std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end());      for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); @@ -440,8 +441,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,        StructValues.push_back(*i);      } else {        AllocaInst *alloca = -        new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc", -                       codeReplacer->getParent()->begin()->begin()); +          new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc", +                         &codeReplacer->getParent()->front().front());        ReloadOutputs.push_back(alloca);        params.push_back(alloca);      } @@ -457,9 +458,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,      // Allocate a struct at the beginning of this function      StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); -    Struct = -      new AllocaInst(StructArgTy, nullptr, "structArg", -                     codeReplacer->getParent()->begin()->begin()); +    Struct = new AllocaInst(StructArgTy, nullptr, "structArg", +                            &codeReplacer->getParent()->front().front());      params.push_back(Struct);      for (unsigned i = 0, e = inputs.size(); i != e; ++i) { @@ -566,8 +566,12 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,              bool DominatesDef = true; -            if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) { -              DefBlock = Invoke->getNormalDest(); +            BasicBlock *NormalDest = nullptr; +            if (auto *Invoke = dyn_cast<InvokeInst>(outputs[out])) +              NormalDest = Invoke->getNormalDest(); + +            if (NormalDest) { +              DefBlock = NormalDest;                // Make sure we are looking at the original successor block, not                // at a newly inserted exit block, which won't be in the dominator @@ -606,11 +610,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,                  Idx[1] = ConstantInt::get(Type::getInt32Ty(Context),                                            FirstOut+out);                  GetElementPtrInst *GEP = GetElementPtrInst::Create( -                    StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(), +                    StructArgTy, &*OAI, Idx, "gep_" + outputs[out]->getName(),                      NTRet);                  new StoreInst(outputs[out], GEP, NTRet);                } else { -                new StoreInst(outputs[out], OAI, NTRet); +                new StoreInst(outputs[out], &*OAI, NTRet);                }              }              // Advance output iterator even if we don't emit a store diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp index dc95089cd2ca..b56ff684e8a8 100644 --- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ 
b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -50,7 +50,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {    GlobalVariable *NGV =        new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),                           CA, "", GCL->getThreadLocalMode()); -  GCL->getParent()->getGlobalList().insert(GCL, NGV); +  GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);    NGV->takeName(GCL);    // Nuke the old list, replacing any uses with the new one. diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 003da58ee798..75a1dde57c4c 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -35,8 +35,8 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,                            I.getName()+".reg2mem", AllocaPoint);    } else {      Function *F = I.getParent()->getParent(); -    Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", -                          F->getEntryBlock().begin()); +    Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem", +                          &F->getEntryBlock().front());    }    // We cannot demote invoke instructions to the stack if their normal edge @@ -89,16 +89,15 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,    // AFTER the terminator instruction.    BasicBlock::iterator InsertPt;    if (!isa<TerminatorInst>(I)) { -    InsertPt = &I; -    ++InsertPt; -    for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) +    InsertPt = ++I.getIterator(); +    for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)        /* empty */;   // Don't insert before PHI nodes or landingpad instrs.    } else {      InvokeInst &II = cast<InvokeInst>(I);      InsertPt = II.getNormalDest()->getFirstInsertionPt();    } -  new StoreInst(&I, Slot, InsertPt); +  new StoreInst(&I, Slot, &*InsertPt);    return Slot;  } @@ -118,8 +117,8 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {                            P->getName()+".reg2mem", AllocaPoint);    } else {      Function *F = P->getParent()->getParent(); -    Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", -                          F->getEntryBlock().begin()); +    Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem", +                          &F->getEntryBlock().front());    }    // Iterate over each operand inserting a store in each predecessor. @@ -133,12 +132,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {    }    // Insert a load in place of the PHI and replace all uses. -  BasicBlock::iterator InsertPt = P; +  BasicBlock::iterator InsertPt = P->getIterator(); -  for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) +  for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)      /* empty */;   // Don't insert before PHI nodes or landingpad instrs. -  Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt); +  Value *V = new LoadInst(Slot, P->getName() + ".reload", &*InsertPt);    P->replaceAllUsesWith(V);    // Delete PHI. 
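DemoteRegToStack and DemotePHIToStack, touched above, implement the classic reg2mem recipe: materialize an alloca, store the value at its definition point (or in every predecessor, for a PHI), and reload it wherever it was used. A minimal driver in the spirit of the Reg2Mem pass might call DemotePHIToStack as in the sketch below; it assumes these helpers are declared in llvm/Transforms/Utils/Local.h, as in this tree, and is purely illustrative rather than part of the patch.

#include <vector>
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Demote every PHI node in F to a stack slot.  Pointers are collected first
// because DemotePHIToStack erases the PHI it rewrites, which would otherwise
// invalidate the instruction iterator we are walking.
static void demoteAllPHIs(Function &F) {
  std::vector<PHINode *> PHIs;
  for (BasicBlock &BB : F)
    for (Instruction &I : BB) {
      if (PHINode *PN = dyn_cast<PHINode>(&I))
        PHIs.push_back(PN);
      else
        break;              // PHI nodes are always grouped at the top of a block.
    }
  for (PHINode *PN : PHIs)
    DemotePHIToStack(PN);   // alloca + a store per predecessor + one reload
}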
diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 4eb3e3dd17d2..492ae9f69a65 100644 --- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -28,12 +28,11 @@ class FlattenCFGOpt {    AliasAnalysis *AA;    /// \brief Use parallel-and or parallel-or to generate conditions for    /// conditional branches. -  bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, -                            Pass *P = nullptr); +  bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);    /// \brief If \param BB is the merge block of an if-region, attempt to merge    /// the if-region with an adjacent if-region upstream if two if-regions    /// contain identical instructions. -  bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); +  bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);    /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which    /// are from two if-regions whose entry blocks are \p Head1 and \p    /// Head2.  \returns true if \p Block1 and \p Block2 contain identical @@ -122,8 +121,7 @@ public:  ///  its predecessor.  In Case 2, \param BB (BB3) only has conditional branches  ///  as its predecessors.  /// -bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, -                                         Pass *P) { +bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {    PHINode *PHI = dyn_cast<PHINode>(BB->begin());    if (PHI)      return false; // For simplicity, avoid cases containing PHI nodes. @@ -177,8 +175,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,        // Instructions in the internal condition blocks should be safe        // to hoist up. -      for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { -        Instruction *CI = BI++; +      for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator(); +           BI != BE;) { +        Instruction *CI = &*BI++;          if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))            return false;        } @@ -315,7 +314,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,                                           BasicBlock *Block1,                                           BasicBlock *Block2) {    TerminatorInst *PTI2 = Head2->getTerminator(); -  Instruction *PBI2 = Head2->begin(); +  Instruction *PBI2 = &Head2->front();    bool eq1 = (Block1 == Head1);    bool eq2 = (Block2 == Head2); @@ -327,9 +326,9 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,    // Check whether instructions in Block1 and Block2 are identical    // and do not alias with instructions in Head2.    
BasicBlock::iterator iter1 = Block1->begin(); -  BasicBlock::iterator end1 = Block1->getTerminator(); +  BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();    BasicBlock::iterator iter2 = Block2->begin(); -  BasicBlock::iterator end2 = Block2->getTerminator(); +  BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();    while (1) {      if (iter1 == end1) { @@ -338,7 +337,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,        break;      } -    if (!iter1->isIdenticalTo(iter2)) +    if (!iter1->isIdenticalTo(&*iter2))        return false;      // Illegal to remove instructions with side effects except @@ -356,10 +355,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,        return false;      if (iter1->mayWriteToMemory()) { -      for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { +      for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {          if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {            // Check alias with Head2. -          if (!AA || AA->alias(iter1, BI)) +          if (!AA || AA->alias(&*iter1, &*BI))              return false;          }        } @@ -386,8 +385,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,  /// if (a || b)  ///   statement;  /// -bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, -                                  Pass *P) { +bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {    BasicBlock *IfTrue2, *IfFalse2;    Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);    Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2); @@ -413,7 +411,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder,      return false;    TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); -  Instruction *PBI2 = SecondEntryBlock->begin(); +  Instruction *PBI2 = &SecondEntryBlock->front();    if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,                              IfTrue2)) @@ -425,8 +423,8 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder,    // Check whether \param SecondEntryBlock has side-effect and is safe to    // speculate. 
-  for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { -    Instruction *CI = BI; +  for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { +    Instruction *CI = &*BI;      if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||          !isSafeToSpeculativelyExecute(CI))        return false; diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 44b7d25d519a..3893a752503b 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -49,6 +49,10 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) {  static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,                               SmallPtrSetImpl<const PHINode *> &PhiUsers) { +  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) +    if (GV->isExternallyInitialized()) +      GS.StoredType = GlobalStatus::StoredOnce; +    for (const Use &U : V->uses()) {      const User *UR = U.getUser();      if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index d2d60d7cd9f6..14574119b9a8 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -13,14 +13,15 @@  //===----------------------------------------------------------------------===//  #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/StringExtras.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CallGraph.h"  #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Attributes.h" @@ -41,6 +42,7 @@  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/Support/CommandLine.h"  #include <algorithm> +  using namespace llvm;  static cl::opt<bool> @@ -54,17 +56,17 @@ PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",    cl::desc("Convert align attributes to assumptions during inlining."));  bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, -                          bool InsertLifetime) { -  return InlineFunction(CallSite(CI), IFI, InsertLifetime); +                          AAResults *CalleeAAR, bool InsertLifetime) { +  return InlineFunction(CallSite(CI), IFI, CalleeAAR, InsertLifetime);  }  bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, -                          bool InsertLifetime) { -  return InlineFunction(CallSite(II), IFI, InsertLifetime); +                          AAResults *CalleeAAR, bool InsertLifetime) { +  return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime);  }  namespace { -  /// A class for recording information about inlining through an invoke. -  class InvokeInliningInfo { +  /// A class for recording information about inlining a landing pad. +  class LandingPadInliningInfo {      BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind.      BasicBlock *InnerResumeDest; ///< Destination for the callee's resume.      LandingPadInst *CallerLPad;  ///< LandingPadInst associated with the invoke. 
@@ -72,7 +74,7 @@ namespace {      SmallVector<Value*, 8> UnwindDestPHIValues;    public: -    InvokeInliningInfo(InvokeInst *II) +    LandingPadInliningInfo(InvokeInst *II)        : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr),          CallerLPad(nullptr), InnerEHValuesPHI(nullptr) {        // If there are PHI nodes in the unwind destination block, we need to keep @@ -121,14 +123,14 @@ namespace {        }      }    }; -} +} // anonymous namespace  /// Get or create a target for the branch from ResumeInsts. -BasicBlock *InvokeInliningInfo::getInnerResumeDest() { +BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {    if (InnerResumeDest) return InnerResumeDest;    // Split the landing pad. -  BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint; +  BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();    InnerResumeDest =      OuterResumeDest->splitBasicBlock(SplitPoint,                                       OuterResumeDest->getName() + ".body"); @@ -137,7 +139,7 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() {    const unsigned PHICapacity = 2;    // Create corresponding new PHIs for all the PHIs in the outer landing pad. -  BasicBlock::iterator InsertPoint = InnerResumeDest->begin(); +  Instruction *InsertPoint = &InnerResumeDest->front();    BasicBlock::iterator I = OuterResumeDest->begin();    for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {      PHINode *OuterPHI = cast<PHINode>(I); @@ -162,8 +164,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() {  /// When the landing pad block has only one predecessor, this is a simple  /// branch. When there is more than one predecessor, we need to split the  /// landing pad block after the landingpad instruction and jump to there. -void InvokeInliningInfo::forwardResume(ResumeInst *RI, -                               SmallPtrSetImpl<LandingPadInst*> &InlinedLPads) { +void LandingPadInliningInfo::forwardResume( +    ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {    BasicBlock *Dest = getInnerResumeDest();    BasicBlock *Src = RI->getParent(); @@ -182,33 +184,39 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI,  /// This function analyze BB to see if there are any calls, and if so,  /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI  /// nodes in that block with the values specified in InvokeDestPHIValues. -static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, -                                                   InvokeInliningInfo &Invoke) { +static BasicBlock * +HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { -    Instruction *I = BBI++; +    Instruction *I = &*BBI++;      // We only need to check for function calls: inlined invoke      // instructions require no special handling.      CallInst *CI = dyn_cast<CallInst>(I); -    // If this call cannot unwind, don't convert it to an invoke. -    // Inline asm calls cannot throw.      if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))        continue;      // Convert this function call into an invoke instruction.  First, split the      // basic block. 
-    BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); +    BasicBlock *Split = +        BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");      // Delete the unconditional branch inserted by splitBasicBlock      BB->getInstList().pop_back();      // Create the new invoke instruction. -    ImmutableCallSite CS(CI); -    SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); -    InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, -                                        Invoke.getOuterResumeDest(), -                                        InvokeArgs, CI->getName(), BB); +    SmallVector<Value*, 8> InvokeArgs(CI->arg_begin(), CI->arg_end()); +    SmallVector<OperandBundleDef, 1> OpBundles; + +    CI->getOperandBundlesAsDefs(OpBundles); + +    // Note: we're round tripping operand bundles through memory here, and that +    // can potentially be avoided with a cleverer API design that we do not have +    // as of this time. + +    InvokeInst *II = +        InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs, +                           OpBundles, CI->getName(), BB);      II->setDebugLoc(CI->getDebugLoc());      II->setCallingConv(CI->getCallingConv());      II->setAttributes(CI->getAttributes()); @@ -219,12 +227,9 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,      // Delete the original call      Split->getInstList().pop_front(); - -    // Update any PHI nodes in the exceptional block to indicate that there is -    // now a new entry in them. -    Invoke.addIncomingPHIValuesFor(BB); -    return; +    return BB;    } +  return nullptr;  }  /// If we inlined an invoke site, we need to convert calls @@ -233,8 +238,8 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,  /// II is the invoke instruction being inlined.  FirstNewBlock is the first  /// block of the inlined code (the last block is the end of the function),  /// and InlineCodeInfo is information about the code that got inlined. -static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, -                                ClonedCodeInfo &InlinedCodeInfo) { +static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, +                                    ClonedCodeInfo &InlinedCodeInfo) {    BasicBlock *InvokeDest = II->getUnwindDest();    Function *Caller = FirstNewBlock->getParent(); @@ -242,11 +247,12 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,    // The inlined code is currently at the end of the function, scan from the    // start of the inlined code to its end, checking for stuff we need to    // rewrite. -  InvokeInliningInfo Invoke(II); +  LandingPadInliningInfo Invoke(II);    // Get all of the inlined landing pad instructions.    
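
The hunk above rewrites HandleCallsInBlockInlinedThroughInvoke so operand bundles survive the CallInst-to-InvokeInst conversion. A condensed sketch of that conversion, using only calls visible in the hunk; the helper name and includes are assumptions:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static InvokeInst *convertToInvoke(CallInst *CI, BasicBlock *UnwindEdge) {
  BasicBlock *BB = CI->getParent();
  // Split so the call starts its own block, then drop the branch that
  // splitBasicBlock appended to BB.
  BasicBlock *Split =
      BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
  BB->getInstList().pop_back();

  // Round-trip the call's arguments and operand bundles.
  SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
  SmallVector<OperandBundleDef, 1> Bundles;
  CI->getOperandBundlesAsDefs(Bundles);

  InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge,
                                      Args, Bundles, CI->getName(), BB);
  II->setDebugLoc(CI->getDebugLoc());
  II->setCallingConv(CI->getCallingConv());
  II->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(II);

  // The original call is now the first instruction of Split; remove it.
  Split->getInstList().pop_front();
  return II;
}
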
SmallPtrSet<LandingPadInst*, 16> InlinedLPads; -  for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I) +  for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); +       I != E; ++I)      if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))        InlinedLPads.insert(II->getLandingPadInst()); @@ -262,9 +268,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,        InlinedLPad->setCleanup(true);    } -  for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ +  for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); +       BB != E; ++BB) {      if (InlinedCodeInfo.ContainsCalls) -      HandleCallsInBlockInlinedThroughInvoke(BB, Invoke); +      if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( +              &*BB, Invoke.getOuterResumeDest())) +        // Update any PHI nodes in the exceptional block to indicate that there +        // is now a new entry in them. +        Invoke.addIncomingPHIValuesFor(NewBB);      // Forward any resumes that are remaining here.      if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) @@ -278,6 +289,99 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,    InvokeDest->removePredecessor(II->getParent());  } +/// If we inlined an invoke site, we need to convert calls +/// in the body of the inlined function into invokes. +/// +/// II is the invoke instruction being inlined.  FirstNewBlock is the first +/// block of the inlined code (the last block is the end of the function), +/// and InlineCodeInfo is information about the code that got inlined. +static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, +                               ClonedCodeInfo &InlinedCodeInfo) { +  BasicBlock *UnwindDest = II->getUnwindDest(); +  Function *Caller = FirstNewBlock->getParent(); + +  assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + +  // If there are PHI nodes in the unwind destination block, we need to keep +  // track of which values came into them from the invoke before removing the +  // edge from this block. +  SmallVector<Value *, 8> UnwindDestPHIValues; +  llvm::BasicBlock *InvokeBB = II->getParent(); +  for (Instruction &I : *UnwindDest) { +    // Save the value to use for this edge. +    PHINode *PHI = dyn_cast<PHINode>(&I); +    if (!PHI) +      break; +    UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); +  } + +  // Add incoming-PHI values to the unwind destination block for the given basic +  // block, using the values for the original invoke's source block. +  auto UpdatePHINodes = [&](BasicBlock *Src) { +    BasicBlock::iterator I = UnwindDest->begin(); +    for (Value *V : UnwindDestPHIValues) { +      PHINode *PHI = cast<PHINode>(I); +      PHI->addIncoming(V, Src); +      ++I; +    } +  }; + +  // This connects all the instructions which 'unwind to caller' to the invoke +  // destination. 
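
Both HandleInlinedLandingPad and the new HandleInlinedEHPad rely on the same PHI bookkeeping: remember what the original invoke fed into the unwind destination's PHIs, then re-add those values for every new predecessor. A standalone sketch of that idea; the helper name and includes are assumptions:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void reAddInvokePHIValues(BasicBlock *UnwindDest, BasicBlock *InvokeBB,
                                 BasicBlock *NewPred) {
  // PHIs are grouped at the top of the block; collect the value each one
  // received along the original invoke edge.
  SmallVector<Value *, 8> Vals;
  for (Instruction &I : *UnwindDest) {
    PHINode *PHI = dyn_cast<PHINode>(&I);
    if (!PHI)
      break;
    Vals.push_back(PHI->getIncomingValueForBlock(InvokeBB));
  }
  // Re-add the same values for the new predecessor, in PHI order.
  BasicBlock::iterator It = UnwindDest->begin();
  for (Value *V : Vals) {
    cast<PHINode>(&*It)->addIncoming(V, NewPred);
    ++It;
  }
}
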
+  for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); +       BB != E; ++BB) { +    if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) { +      if (CRI->unwindsToCaller()) { +        CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); +        CRI->eraseFromParent(); +        UpdatePHINodes(&*BB); +      } +    } + +    Instruction *I = BB->getFirstNonPHI(); +    if (!I->isEHPad()) +      continue; + +    Instruction *Replacement = nullptr; +    if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { +      if (CatchSwitch->unwindsToCaller()) { +        auto *NewCatchSwitch = CatchSwitchInst::Create( +            CatchSwitch->getParentPad(), UnwindDest, +            CatchSwitch->getNumHandlers(), CatchSwitch->getName(), +            CatchSwitch); +        for (BasicBlock *PadBB : CatchSwitch->handlers()) +          NewCatchSwitch->addHandler(PadBB); +        Replacement = NewCatchSwitch; +      } +    } else if (!isa<FuncletPadInst>(I)) { +      llvm_unreachable("unexpected EHPad!"); +    } + +    if (Replacement) { +      Replacement->takeName(I); +      I->replaceAllUsesWith(Replacement); +      I->eraseFromParent(); +      UpdatePHINodes(&*BB); +    } +  } + +  if (InlinedCodeInfo.ContainsCalls) +    for (Function::iterator BB = FirstNewBlock->getIterator(), +                            E = Caller->end(); +         BB != E; ++BB) +      if (BasicBlock *NewBB = +              HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) +        // Update any PHI nodes in the exceptional block to indicate that there +        // is now a new entry in them. +        UpdatePHINodes(NewBB); + +  // Now that everything is happy, we have one final detail.  The PHI nodes in +  // the exception destination block still have entries due to the original +  // invoke instruction. Eliminate these entries (which might even delete the +  // PHI node) now. +  UnwindDest->removePredecessor(InvokeBB); +} +  /// When inlining a function that contains noalias scope metadata,  /// this metadata needs to be cloned so that the inlined blocks  /// have different "unqiue scopes" at every call site. Were this not done, then @@ -395,17 +499,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {  /// parameters with noalias metadata specifying the new scope, and tag all  /// non-derived loads, stores and memory intrinsics with the new alias scopes.  
static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, -                                  const DataLayout &DL, AliasAnalysis *AA) { +                                  const DataLayout &DL, AAResults *CalleeAAR) {    if (!EnableNoAliasConversion)      return;    const Function *CalledFunc = CS.getCalledFunction();    SmallVector<const Argument *, 4> NoAliasArgs; -  for (Function::const_arg_iterator I = CalledFunc->arg_begin(), -       E = CalledFunc->arg_end(); I != E; ++I) { -    if (I->hasNoAliasAttr() && !I->hasNUses(0)) -      NoAliasArgs.push_back(I); +  for (const Argument &I : CalledFunc->args()) { +    if (I.hasNoAliasAttr() && !I.hasNUses(0)) +      NoAliasArgs.push_back(&I);    }    if (NoAliasArgs.empty()) @@ -480,10 +583,10 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,            continue;          IsFuncCall = true; -        if (AA) { -          AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(ICS); -          if (MRB == AliasAnalysis::OnlyAccessesArgumentPointees || -              MRB == AliasAnalysis::OnlyReadsArgumentPointees) +        if (CalleeAAR) { +          FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS); +          if (MRB == FMRB_OnlyAccessesArgumentPointees || +              MRB == FMRB_OnlyReadsArgumentPointees)              IsArgMemOnlyCall = true;          } @@ -518,7 +621,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,        for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) {          SmallVector<Value *, 4> Objects;          GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]), -                             Objects, DL, /* MaxLookup = */ 0); +                             Objects, DL, /* LI = */ nullptr);          for (Value *O : Objects)            ObjSet.insert(O); @@ -646,7 +749,7 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {        // caller, then don't bother inserting the assumption.        Value *Arg = CS.getArgument(I->getArgNo());        if (getKnownAlignment(Arg, DL, CS.getInstruction(), -                            &IFI.ACT->getAssumptionCache(*CalledFunc), +                            &IFI.ACT->getAssumptionCache(*CS.getCaller()),                              &DT) >= Align)          continue; @@ -731,7 +834,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,                                      BasicBlock *InsertBlock,                                      InlineFunctionInfo &IFI) {    Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); -  IRBuilder<> Builder(InsertBlock->begin()); +  IRBuilder<> Builder(InsertBlock, InsertBlock->begin());    Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); @@ -851,9 +954,8 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx,    // Starting from the top, rebuild the nodes to point to the new inlined-at    // location (then rebuilding the rest of the chain behind it) and update the    // map of already-constructed inlined-at nodes. 
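
The loop that follows at the top of the next hunk rebuilds the inlined-at chain from the outermost location inward. The same idea as a standalone helper; the name, the ArrayRef parameter, and the includes are assumptions:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

static DILocation *rebuildInlinedAtChain(ArrayRef<const DILocation *> Chain,
                                         DILocation *InlinedAtNode,
                                         LLVMContext &Ctx) {
  // Walk the chain outermost-first, re-creating each node so it points at the
  // inlined-at parent built in the previous step.
  DILocation *Last = InlinedAtNode;
  for (const DILocation *MD : make_range(Chain.rbegin(), Chain.rend()))
    Last = DILocation::getDistinct(Ctx, MD->getLine(), MD->getColumn(),
                                   MD->getScope(), Last);
  return Last;
}
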
-  for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); -       I != E; ++I) { -    const DILocation *MD = *I; +  for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), +                                         InlinedAtLocations.rend())) {      Last = IANodes[MD] = DILocation::getDistinct(          Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);    } @@ -917,7 +1019,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,  /// exists in the instruction stream.  Similarly this will inline a recursive  /// function by one level.  bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, -                          bool InsertLifetime) { +                          AAResults *CalleeAAR, bool InsertLifetime) {    Instruction *TheCall = CS.getInstruction();    assert(TheCall->getParent() && TheCall->getParent()->getParent() &&           "Instruction not in function!"); @@ -930,6 +1032,22 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,        CalledFunc->isDeclaration() || // call, or call to a vararg function!        CalledFunc->getFunctionType()->isVarArg()) return false; +  // The inliner does not know how to inline through calls with operand bundles +  // in general ... +  if (CS.hasOperandBundles()) { +    for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { +      uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); +      // ... but it knows how to inline through "deopt" operand bundles ... +      if (Tag == LLVMContext::OB_deopt) +        continue; +      // ... and "funclet" operand bundles. +      if (Tag == LLVMContext::OB_funclet) +        continue; + +      return false; +    } +  } +    // If the call to the callee cannot throw, set the 'nounwind' flag on any    // calls that we inline.    bool MarkNoUnwind = CS.doesNotThrow(); @@ -950,13 +1068,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    // Get the personality function from the callee if it contains a landing pad.    Constant *CalledPersonality = -      CalledFunc->hasPersonalityFn() ? CalledFunc->getPersonalityFn() : nullptr; +      CalledFunc->hasPersonalityFn() +          ? CalledFunc->getPersonalityFn()->stripPointerCasts() +          : nullptr;    // Find the personality function used by the landing pads of the caller. If it    // exists, then check to see that it matches the personality function used in    // the callee.    Constant *CallerPersonality = -      Caller->hasPersonalityFn() ? Caller->getPersonalityFn() : nullptr; +      Caller->hasPersonalityFn() +          ? Caller->getPersonalityFn()->stripPointerCasts() +          : nullptr;    if (CalledPersonality) {      if (!CallerPersonality)        Caller->setPersonalityFn(CalledPersonality); @@ -968,9 +1090,46 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,        return false;    } +  // We need to figure out which funclet the callsite was in so that we may +  // properly nest the callee. +  Instruction *CallSiteEHPad = nullptr; +  if (CallerPersonality) { +    EHPersonality Personality = classifyEHPersonality(CallerPersonality); +    if (isFuncletEHPersonality(Personality)) { +      Optional<OperandBundleUse> ParentFunclet = +          CS.getOperandBundle(LLVMContext::OB_funclet); +      if (ParentFunclet) +        CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + +      // OK, the inlining site is legal.  What about the target function? 
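
A compact restatement of the operand-bundle gate added above: inlining bails out unless every bundle on the call site is one of the two kinds the inliner understands. The helper name and includes are assumptions:

#include <cstdint>
#include "llvm/IR/CallSite.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static bool bundlesAreInlinable(CallSite CS) {
  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
    uint32_t Tag = CS.getOperandBundleAt(i).getTagID();
    // "deopt" bundles get merged with the caller's, "funclet" bundles record
    // EH nesting; anything else blocks inlining.
    if (Tag != LLVMContext::OB_deopt && Tag != LLVMContext::OB_funclet)
      return false;
  }
  return true;
}
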
+ +      if (CallSiteEHPad) { +        if (Personality == EHPersonality::MSVC_CXX) { +          // The MSVC personality cannot tolerate catches getting inlined into +          // cleanup funclets. +          if (isa<CleanupPadInst>(CallSiteEHPad)) { +            // Ok, the call site is within a cleanuppad.  Let's check the callee +            // for catchpads. +            for (const BasicBlock &CalledBB : *CalledFunc) { +              if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI())) +                return false; +            } +          } +        } else if (isAsynchronousEHPersonality(Personality)) { +          // SEH is even less tolerant, there may not be any sort of exceptional +          // funclet in the callee. +          for (const BasicBlock &CalledBB : *CalledFunc) { +            if (CalledBB.isEHPad()) +              return false; +          } +        } +      } +    } +  } +    // Get an iterator to the last basic block in the function, which will have    // the new function inlined after it. -  Function::iterator LastBlock = &Caller->back(); +  Function::iterator LastBlock = --Caller->end();    // Make sure to capture all of the return instructions from the cloned    // function. @@ -1007,7 +1166,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,            ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));        } -      VMap[I] = ActualArg; +      VMap[&*I] = ActualArg;      }      // Add alignment assumptions if necessary. We do this before the inlined @@ -1029,7 +1188,61 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      // Inject byval arguments initialization.      for (std::pair<Value*, Value*> &Init : ByValInit)        HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), -                              FirstNewBlock, IFI); +                              &*FirstNewBlock, IFI); + +    Optional<OperandBundleUse> ParentDeopt = +        CS.getOperandBundle(LLVMContext::OB_deopt); +    if (ParentDeopt) { +      SmallVector<OperandBundleDef, 2> OpDefs; + +      for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { +        Instruction *I = dyn_cast_or_null<Instruction>(VH); +        if (!I) continue;  // instruction was DCE'd or RAUW'ed to undef + +        OpDefs.clear(); + +        CallSite ICS(I); +        OpDefs.reserve(ICS.getNumOperandBundles()); + +        for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { +          auto ChildOB = ICS.getOperandBundleAt(i); +          if (ChildOB.getTagID() != LLVMContext::OB_deopt) { +            // If the inlined call has other operand bundles, let them be +            OpDefs.emplace_back(ChildOB); +            continue; +          } + +          // It may be useful to separate this logic (of handling operand +          // bundles) out to a separate "policy" component if this gets crowded. +          // Prepend the parent's deoptimization continuation to the newly +          // inlined call's deoptimization continuation. 
+          std::vector<Value *> MergedDeoptArgs; +          MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() + +                                  ChildOB.Inputs.size()); + +          MergedDeoptArgs.insert(MergedDeoptArgs.end(), +                                 ParentDeopt->Inputs.begin(), +                                 ParentDeopt->Inputs.end()); +          MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), +                                 ChildOB.Inputs.end()); + +          OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); +        } + +        Instruction *NewI = nullptr; +        if (isa<CallInst>(I)) +          NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I); +        else +          NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I); + +        // Note: the RAUW does the appropriate fixup in VMap, so we need to do +        // this even if the call returns void. +        I->replaceAllUsesWith(NewI); + +        VH = nullptr; +        I->eraseFromParent(); +      } +    }      // Update the callgraph if requested.      if (IFI.CG) @@ -1042,7 +1255,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      CloneAliasScopeMetadata(CS, VMap);      // Add noalias metadata if necessary. -    AddAliasScopeMetadata(CS, VMap, DL, IFI.AA); +    AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR);      // FIXME: We could register any cloned assumptions instead of clearing the      // whole function's cache. @@ -1085,9 +1298,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,        // Transfer all of the allocas over in a block.  Using splice means        // that the instructions aren't removed from the symbol table, then        // reinserted. -      Caller->getEntryBlock().getInstList().splice(InsertPoint, -                                                   FirstNewBlock->getInstList(), -                                                   AI, I); +      Caller->getEntryBlock().getInstList().splice( +          InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);      }      // Move any dbg.declares describing the allocas into the entry basic block.      DIBuilder DIB(*Caller->getParent()); @@ -1137,7 +1349,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    // Leave lifetime markers for the static alloca's, scoping them to the    // function we just inlined.    if (InsertLifetime && !IFI.StaticAllocas.empty()) { -    IRBuilder<> builder(FirstNewBlock->begin()); +    IRBuilder<> builder(&FirstNewBlock->front());      for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {        AllocaInst *AI = IFI.StaticAllocas[ai]; @@ -1189,7 +1401,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);      // Insert the llvm.stacksave. -    CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) +    CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())                               .CreateCall(StackSave, {}, "savedstack");      // Insert a call to llvm.stackrestore before any return instructions in the @@ -1203,10 +1415,74 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      }    } +  // Update the lexical scopes of the new funclets and callsites. +  // Anything that had 'none' as its parent is now nested inside the callsite's +  // EHPad. 
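
The deopt-bundle handling above prepends the parent call site's deoptimization state to each inlined call's own "deopt" bundle. The core of that merge as a sketch; the helper name and includes are assumptions:

#include <vector>
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static OperandBundleDef mergeDeoptStates(const OperandBundleUse &ParentDeopt,
                                         const OperandBundleUse &ChildDeopt) {
  // Parent (caller) state first, then the callee's own state.
  std::vector<Value *> Merged;
  Merged.reserve(ParentDeopt.Inputs.size() + ChildDeopt.Inputs.size());
  Merged.insert(Merged.end(), ParentDeopt.Inputs.begin(),
                ParentDeopt.Inputs.end());
  Merged.insert(Merged.end(), ChildDeopt.Inputs.begin(),
                ChildDeopt.Inputs.end());
  return OperandBundleDef("deopt", std::move(Merged));
}
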
+ +  if (CallSiteEHPad) { +    for (Function::iterator BB = FirstNewBlock->getIterator(), +                            E = Caller->end(); +         BB != E; ++BB) { +      // Add bundle operands to any top-level call sites. +      SmallVector<OperandBundleDef, 1> OpBundles; +      for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { +        Instruction *I = &*BBI++; +        CallSite CS(I); +        if (!CS) +          continue; + +        // Skip call sites which are nounwind intrinsics. +        auto *CalledFn = +            dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); +        if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) +          continue; + +        // Skip call sites which already have a "funclet" bundle. +        if (CS.getOperandBundle(LLVMContext::OB_funclet)) +          continue; + +        CS.getOperandBundlesAsDefs(OpBundles); +        OpBundles.emplace_back("funclet", CallSiteEHPad); + +        Instruction *NewInst; +        if (CS.isCall()) +          NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); +        else +          NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); +        NewInst->setDebugLoc(I->getDebugLoc()); +        NewInst->takeName(I); +        I->replaceAllUsesWith(NewInst); +        I->eraseFromParent(); + +        OpBundles.clear(); +      } + +      Instruction *I = BB->getFirstNonPHI(); +      if (!I->isEHPad()) +        continue; + +      if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { +        if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) +          CatchSwitch->setParentPad(CallSiteEHPad); +      } else { +        auto *FPI = cast<FuncletPadInst>(I); +        if (isa<ConstantTokenNone>(FPI->getParentPad())) +          FPI->setParentPad(CallSiteEHPad); +      } +    } +  } +    // If we are inlining for an invoke instruction, we must make sure to rewrite    // any call instructions into invoke instructions. -  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) -    HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); +  if (auto *II = dyn_cast<InvokeInst>(TheCall)) { +    BasicBlock *UnwindDest = II->getUnwindDest(); +    Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); +    if (isa<LandingPadInst>(FirstNonPHI)) { +      HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); +    } else { +      HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); +    } +  }    // Handle any inlined musttail call sites.  In order for a new call site to be    // musttail, the source of the clone and the inlined call site must have been @@ -1250,7 +1526,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    // the calling basic block.    if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {      // Move all of the instructions right before the call. -    OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(), +    OrigBB->getInstList().splice(TheCall->getIterator(), +                                 FirstNewBlock->getInstList(),                                   FirstNewBlock->begin(), FirstNewBlock->end());      // Remove the cloned basic block.      Caller->getBasicBlockList().pop_back(); @@ -1297,15 +1574,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      // Split the basic block.  This guarantees that no PHI nodes will have to be      // updated due to new incoming edges, and make the invoke case more      // symmetric to the call case. 
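
The lexical-scope fixup above re-creates each top-level call inside the inlined region with an added "funclet" bundle naming the EH pad it is now nested under. One call site's worth of that rewrite as a sketch; the helper name and includes are assumptions:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static void nestCallSiteInFunclet(Instruction *I, Instruction *CallSiteEHPad) {
  CallSite CS(I);
  // Leave non-calls and call sites that already carry a "funclet" bundle alone.
  if (!CS || CS.getOperandBundle(LLVMContext::OB_funclet))
    return;

  SmallVector<OperandBundleDef, 1> Bundles;
  CS.getOperandBundlesAsDefs(Bundles);
  Bundles.emplace_back("funclet", CallSiteEHPad);

  // Bundles cannot be added in place, so clone the call or invoke with the
  // extra bundle and replace the original.
  Instruction *NewInst;
  if (CS.isCall())
    NewInst = CallInst::Create(cast<CallInst>(I), Bundles, I);
  else
    NewInst = InvokeInst::Create(cast<InvokeInst>(I), Bundles, I);
  NewInst->setDebugLoc(I->getDebugLoc());
  NewInst->takeName(I);
  I->replaceAllUsesWith(NewInst);
  I->eraseFromParent();
}
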
-    AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest, -                                          CalledFunc->getName()+".exit"); +    AfterCallBB = +        OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), +                                CalledFunc->getName() + ".exit");    } else {  // It's a call      // If this is a call instruction, we need to split the basic block that      // the call lives in.      // -    AfterCallBB = OrigBB->splitBasicBlock(TheCall, -                                          CalledFunc->getName()+".exit"); +    AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), +                                          CalledFunc->getName() + ".exit");    }    // Change the branch that used to go to AfterCallBB to branch to the first @@ -1314,14 +1592,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    TerminatorInst *Br = OrigBB->getTerminator();    assert(Br && Br->getOpcode() == Instruction::Br &&           "splitBasicBlock broken!"); -  Br->setOperand(0, FirstNewBlock); - +  Br->setOperand(0, &*FirstNewBlock);    // Now that the function is correct, make it a little bit nicer.  In    // particular, move the basic blocks inserted from the end of the function    // into the space made by splitting the source basic block. -  Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(), -                                     FirstNewBlock, Caller->end()); +  Caller->getBasicBlockList().splice(AfterCallBB->getIterator(), +                                     Caller->getBasicBlockList(), FirstNewBlock, +                                     Caller->end());    // Handle all of the return instructions that we just cloned in, and eliminate    // any users of the original call/invoke instruction. @@ -1333,7 +1611,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      // possible incoming values.      if (!TheCall->use_empty()) {        PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), -                            AfterCallBB->begin()); +                            &AfterCallBB->front());        // Anything that used the result of the function call should now use the        // PHI node as their operand.        TheCall->replaceAllUsesWith(PHI); @@ -1350,7 +1628,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,        }      } -      // Add a branch to the merge points and remove return instructions.      DebugLoc Loc;      for (unsigned i = 0, e = Returns.size(); i != e; ++i) { @@ -1413,7 +1690,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    // Splice the code entry block into calling block, right before the    // unconditional branch.    CalleeEntry->replaceAllUsesWith(OrigBB);  // Update PHI nodes -  OrigBB->getInstList().splice(Br, CalleeEntry->getInstList()); +  OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());    // Remove the unconditional branch.    
OrigBB->getInstList().erase(Br); diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 30edf3b7aae4..5687afa61e2a 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -380,14 +380,10 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {    IRBuilder<> Builder(Rem); -  Type *RemTy = Rem->getType(); -  if (RemTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); - -  unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - -  if (RemTyBitWidth != 32 && RemTyBitWidth != 64) -    llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); +  assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported"); +  assert((Rem->getType()->getIntegerBitWidth() == 32 || +          Rem->getType()->getIntegerBitWidth() == 64) && +         "Div of bitwidth other than 32 or 64 not supported");    // First prepare the sign if it's a signed remainder    if (Rem->getOpcode() == Instruction::SRem) { @@ -401,7 +397,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {      // If we didn't actually generate an urem instruction, we're done      // This happens for example if the input were constant. In this case the      // Builder insertion point was unchanged -    if (Rem == Builder.GetInsertPoint()) +    if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked())        return true;      BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -440,14 +436,10 @@ bool llvm::expandDivision(BinaryOperator *Div) {    IRBuilder<> Builder(Div); -  Type *DivTy = Div->getType(); -  if (DivTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); - -  unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - -  if (DivTyBitWidth != 32 && DivTyBitWidth != 64) -    llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); +  assert(!Div->getType()->isVectorTy() && "Div over vectors not supported"); +  assert((Div->getType()->getIntegerBitWidth() == 32 || +          Div->getType()->getIntegerBitWidth() == 64) && +         "Div of bitwidth other than 32 or 64 not supported");    // First prepare the sign if it's a signed division    if (Div->getOpcode() == Instruction::SDiv) { @@ -461,7 +453,7 @@ bool llvm::expandDivision(BinaryOperator *Div) {      // If we didn't actually generate an udiv instruction, we're done      // This happens for example if the input were constant. 
In this case the      // Builder insertion point was unchanged -    if (Div == Builder.GetInsertPoint()) +    if (Div == Builder.GetInsertPoint().getNodePtrUnchecked())        return true;      BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -492,15 +484,14 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {            "Trying to expand remainder from a non-remainder function");    Type *RemTy = Rem->getType(); -  if (RemTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); +  assert(!RemTy->isVectorTy() && "Div over vectors not supported");    unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); -  if (RemTyBitWidth > 32)  -    llvm_unreachable("Div of bitwidth greater than 32 not supported"); +  assert(RemTyBitWidth <= 32 && +         "Div of bitwidth greater than 32 not supported"); -  if (RemTyBitWidth == 32)  +  if (RemTyBitWidth == 32)      return expandRemainder(Rem);    // If bitwidth smaller than 32 extend inputs, extend output and proceed @@ -542,15 +533,13 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {            "Trying to expand remainder from a non-remainder function");    Type *RemTy = Rem->getType(); -  if (RemTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); +  assert(!RemTy->isVectorTy() && "Div over vectors not supported");    unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); -  if (RemTyBitWidth > 64)  -    llvm_unreachable("Div of bitwidth greater than 64 not supported"); +  assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported"); -  if (RemTyBitWidth == 64)  +  if (RemTyBitWidth == 64)      return expandRemainder(Rem);    // If bitwidth smaller than 64 extend inputs, extend output and proceed @@ -593,13 +582,11 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {            "Trying to expand division from a non-division function");    Type *DivTy = Div->getType(); -  if (DivTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); +  assert(!DivTy->isVectorTy() && "Div over vectors not supported");    unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); -  if (DivTyBitWidth > 32) -    llvm_unreachable("Div of bitwidth greater than 32 not supported"); +  assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");    if (DivTyBitWidth == 32)      return expandDivision(Div); @@ -643,13 +630,12 @@ bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {            "Trying to expand division from a non-division function");    Type *DivTy = Div->getType(); -  if (DivTy->isVectorTy()) -    llvm_unreachable("Div over vectors not supported"); +  assert(!DivTy->isVectorTy() && "Div over vectors not supported");    unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); -  if (DivTyBitWidth > 64) -    llvm_unreachable("Div of bitwidth greater than 64 not supported"); +  assert(DivTyBitWidth <= 64 && +         "Div of bitwidth greater than 64 not supported");    if (DivTyBitWidth == 64)      return expandDivision(Div); diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9d40b6989d6e..b4b2e148dfbb 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -31,8 +31,10 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h" 
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h" @@ -64,6 +66,13 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,                                 PredIteratorCache &PredCache, LoopInfo *LI) {    SmallVector<Use *, 16> UsesToRewrite; +  // Tokens cannot be used in PHI nodes, so we skip over them. +  // We can run into tokens which are live out of a loop with catchswitch +  // instructions in Windows EH if the catchswitch has one catchpad which +  // is inside the loop and another which is not. +  if (Inst.getType()->isTokenTy()) +    return false; +    BasicBlock *InstBB = Inst.getParent();    for (Use &U : Inst.uses()) { @@ -84,9 +93,8 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,    // Invoke instructions are special in that their result value is not available    // along their unwind edge. The code below tests to see whether DomBB -  // dominates -  // the value, so adjust DomBB to the normal destination block, which is -  // effectively where the value is first usable. +  // dominates the value, so adjust DomBB to the normal destination block, +  // which is effectively where the value is first usable.    BasicBlock *DomBB = Inst.getParent();    if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst))      DomBB = Inv->getNormalDest(); @@ -101,10 +109,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,    // Insert the LCSSA phi's into all of the exit blocks dominated by the    // value, and add them to the Phi's map. -  for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(), -                                                     BBE = ExitBlocks.end(); -       BBI != BBE; ++BBI) { -    BasicBlock *ExitBB = *BBI; +  for (BasicBlock *ExitBB : ExitBlocks) {      if (!DT.dominates(DomNode, DT.getNode(ExitBB)))        continue; @@ -113,7 +118,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,        continue;      PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB), -                                  Inst.getName() + ".lcssa", ExitBB->begin()); +                                  Inst.getName() + ".lcssa", &ExitBB->front());      // Add inputs from inside the loop for this PHI.      for (BasicBlock *Pred : PredCache.get(ExitBB)) { @@ -148,26 +153,26 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,    // Rewrite all uses outside the loop in terms of the new PHIs we just    // inserted. -  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) { +  for (Use *UseToRewrite : UsesToRewrite) {      // If this use is in an exit block, rewrite to use the newly inserted PHI.      // This is required for correctness because SSAUpdate doesn't handle uses in      // the same block.  It assumes the PHI we inserted is at the end of the      // block. -    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser()); +    Instruction *User = cast<Instruction>(UseToRewrite->getUser());      BasicBlock *UserBB = User->getParent();      if (PHINode *PN = dyn_cast<PHINode>(User)) -      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]); +      UserBB = PN->getIncomingBlock(*UseToRewrite);      if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {        // Tell the VHs that the uses changed. This updates SCEV's caches. 
-      if (UsesToRewrite[i]->get()->hasValueHandle()) -        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); -      UsesToRewrite[i]->set(UserBB->begin()); +      if (UseToRewrite->get()->hasValueHandle()) +        ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); +      UseToRewrite->set(&UserBB->front());        continue;      }      // Otherwise, do full PHI insertion. -    SSAUpdate.RewriteUse(*UsesToRewrite[i]); +    SSAUpdate.RewriteUse(*UseToRewrite);    }    // Post process PHI instructions that were inserted into another disjoint loop @@ -190,10 +195,9 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,    }    // Remove PHI nodes that did not have any uses rewritten. -  for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) { -    if (AddedPHIs[i]->use_empty()) -      AddedPHIs[i]->eraseFromParent(); -  } +  for (PHINode *PN : AddedPHIs) +    if (PN->use_empty()) +      PN->eraseFromParent();    return true;  } @@ -205,8 +209,8 @@ blockDominatesAnExit(BasicBlock *BB,                       DominatorTree &DT,                       const SmallVectorImpl<BasicBlock *> &ExitBlocks) {    DomTreeNode *DomNode = DT.getNode(BB); -  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) -    if (DT.dominates(DomNode, DT.getNode(ExitBlocks[i]))) +  for (BasicBlock *ExitBB : ExitBlocks) +    if (DT.dominates(DomNode, DT.getNode(ExitBB)))        return true;    return false; @@ -227,25 +231,22 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,    // Look at all the instructions in the loop, checking to see if they have uses    // outside the loop.  If so, rewrite those uses. -  for (Loop::block_iterator BBI = L.block_begin(), BBE = L.block_end(); -       BBI != BBE; ++BBI) { -    BasicBlock *BB = *BBI; - +  for (BasicBlock *BB : L.blocks()) {      // For large loops, avoid use-scanning by using dominance information:  In      // particular, if a block does not dominate any of the loop exits, then none      // of the values defined in the block could be used outside the loop.      if (!blockDominatesAnExit(BB, DT, ExitBlocks))        continue; -    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { +    for (Instruction &I : *BB) {        // Reject two common cases fast: instructions with no uses (like stores)        // and instructions with one use that is in the same block as this. -      if (I->use_empty() || -          (I->hasOneUse() && I->user_back()->getParent() == BB && -           !isa<PHINode>(I->user_back()))) +      if (I.use_empty() || +          (I.hasOneUse() && I.user_back()->getParent() == BB && +           !isa<PHINode>(I.user_back())))          continue; -      Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI); +      Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI);      }    } @@ -266,8 +267,8 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,    bool Changed = false;    // Recurse depth-first through inner loops. 
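
processInstruction above is the heart of LCSSA formation: a loop-defined value with users outside the loop gets a PHI in every exit block it dominates, and the outside users are rewritten to go through that PHI. The PHI-insertion step in isolation; the helper name and includes are assumptions:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PredIteratorCache.h"
using namespace llvm;

static PHINode *insertLCSSAPhi(Instruction &Inst, BasicBlock *ExitBB,
                               PredIteratorCache &PredCache) {
  // One incoming slot per predecessor of the exit block; every slot carries
  // the same loop-defined value, which is what makes the form "loop-closed".
  PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
                                Inst.getName() + ".lcssa", &ExitBB->front());
  for (BasicBlock *Pred : PredCache.get(ExitBB))
    PN->addIncoming(&Inst, Pred);
  return PN;
}
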
-  for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I) -    Changed |= formLCSSARecursively(**I, DT, LI, SE); +  for (Loop *SubLoop : L.getSubLoops()) +    Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);    Changed |= formLCSSA(L, DT, LI, SE);    return Changed; @@ -296,8 +297,10 @@ struct LCSSA : public FunctionPass {      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>();      AU.addPreservedID(LoopSimplifyID); -    AU.addPreserved<AliasAnalysis>(); -    AU.addPreserved<ScalarEvolution>(); +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>(); +    AU.addPreserved<ScalarEvolutionWrapperPass>(); +    AU.addPreserved<SCEVAAWrapperPass>();    }  };  } @@ -306,6 +309,8 @@ char LCSSA::ID = 0;  INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)  INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)  Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -317,7 +322,8 @@ bool LCSSA::runOnFunction(Function &F) {    bool Changed = false;    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  SE = getAnalysisIfAvailable<ScalarEvolution>(); +  auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); +  SE = SEWP ? &SEWP->getSE() : nullptr;    // Simplify each loop nest in the function.    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index ba8af47b54e1..e75163f323df 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -17,10 +17,11 @@  #include "llvm/ADT/DenseSet.h"  #include "llvm/ADT/Hashing.h"  #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h"  #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h"  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/CFG.h" @@ -188,9 +189,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,        BasicBlock *BB = SI->getParent();        // Remove entries from PHI nodes which we no longer branch to... -      for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { +      for (BasicBlock *Succ : SI->successors()) {          // Found case matching a constant operand? -        BasicBlock *Succ = SI->getSuccessor(i);          if (Succ == TheOnlyDest)            TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest          else @@ -230,6 +230,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,                                              SIDef->getValue().getZExtValue()));        } +      // Update make.implicit metadata to the newly-created conditional branch. +      MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit); +      if (MakeImplicitMD) +        NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD); +        // Delete the old switch.        
SI->eraseFromParent();        return true; @@ -283,8 +288,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,                                        const TargetLibraryInfo *TLI) {    if (!I->use_empty() || isa<TerminatorInst>(I)) return false; -  // We don't want the landingpad instruction removed by anything this general. -  if (isa<LandingPadInst>(I)) +  // We don't want the landingpad-like instructions removed by anything this +  // general. +  if (I->isEHPad())      return false;    // We don't want debug info removed by anything this general, unless @@ -414,6 +420,49 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,    return false;  } +static bool +simplifyAndDCEInstruction(Instruction *I, +                          SmallSetVector<Instruction *, 16> &WorkList, +                          const DataLayout &DL, +                          const TargetLibraryInfo *TLI) { +  if (isInstructionTriviallyDead(I, TLI)) { +    // Null out all of the instruction's operands to see if any operand becomes +    // dead as we go. +    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { +      Value *OpV = I->getOperand(i); +      I->setOperand(i, nullptr); + +      if (!OpV->use_empty() || I == OpV) +        continue; + +      // If the operand is an instruction that became dead as we nulled out the +      // operand, and if it is 'trivially' dead, delete it in a future loop +      // iteration. +      if (Instruction *OpI = dyn_cast<Instruction>(OpV)) +        if (isInstructionTriviallyDead(OpI, TLI)) +          WorkList.insert(OpI); +    } + +    I->eraseFromParent(); + +    return true; +  } + +  if (Value *SimpleV = SimplifyInstruction(I, DL)) { +    // Add the users to the worklist. CAREFUL: an instruction can use itself, +    // in the case of a phi node. +    for (User *U : I->users()) +      if (U != I) +        WorkList.insert(cast<Instruction>(U)); + +    // Replace the instruction with its simplified value. +    I->replaceAllUsesWith(SimpleV); +    I->eraseFromParent(); +    return true; +  } +  return false; +} +  /// SimplifyInstructionsInBlock - Scan the specified basic block and try to  /// simplify any instructions in it and recursively delete dead instructions.  /// @@ -422,30 +471,34 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,  bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,                                         const TargetLibraryInfo *TLI) {    bool MadeChange = false; +  const DataLayout &DL = BB->getModule()->getDataLayout();  #ifndef NDEBUG    // In debug builds, ensure that the terminator of the block is never replaced    // or deleted by these simplifications. The idea of simplification is that it    // cannot introduce new instructions, and there is no way to replace the    // terminator of a block without introducing a new instruction. -  AssertingVH<Instruction> TerminatorVH(--BB->end()); +  AssertingVH<Instruction> TerminatorVH(&BB->back());  #endif -  for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { +  SmallSetVector<Instruction *, 16> WorkList; +  // Iterate over the original function, only adding insts to the worklist +  // if they actually need to be revisited. This avoids having to pre-init +  // the worklist with the entire function's worth of instructions. 
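
The new simplifyAndDCEInstruction above replaces the old restart-the-block scan with a worklist. Its deletion half — null out the operands and queue any that become trivially dead — as a standalone sketch; the helper name and includes are assumptions:

#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void eraseAndQueueDeadOperands(Instruction *I,
                                      SmallSetVector<Instruction *, 16> &WorkList,
                                      const TargetLibraryInfo *TLI) {
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    Value *OpV = I->getOperand(i);
    I->setOperand(i, nullptr);
    if (!OpV->use_empty() || OpV == I)
      continue;
    // The operand just lost its last use; if it is trivially dead, revisit it
    // on a later worklist iteration instead of recursing here.
    if (Instruction *OpI = dyn_cast<Instruction>(OpV))
      if (isInstructionTriviallyDead(OpI, TLI))
        WorkList.insert(OpI);
  }
  I->eraseFromParent();
}
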
+  for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) {      assert(!BI->isTerminator()); -    Instruction *Inst = BI++; +    Instruction *I = &*BI; +    ++BI; -    WeakVH BIHandle(BI); -    if (recursivelySimplifyInstruction(Inst, TLI)) { -      MadeChange = true; -      if (BIHandle != BI) -        BI = BB->begin(); -      continue; -    } +    // We're visiting this instruction now, so make sure it's not in the +    // worklist from an earlier visit. +    if (!WorkList.count(I)) +      MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); +  } -    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); -    if (BIHandle != BI) -      BI = BB->begin(); +  while (!WorkList.empty()) { +    Instruction *I = WorkList.pop_back_val(); +    MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);    }    return MadeChange;  } @@ -808,7 +861,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {      // Copy over any phi, debug or lifetime instruction.      BB->getTerminator()->eraseFromParent(); -    Succ->getInstList().splice(Succ->getFirstNonPHI(), BB->getInstList()); +    Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(), +                               BB->getInstList());    } else {      while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {        // We explicitly check for such uses in CanPropagatePredecessorsForPHIs. @@ -1017,8 +1071,13 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,    if (LdStHasDebugValue(DIVar, LI))      return true; -  Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, -                                  DDI->getDebugLoc(), LI); +  // We are now tracking the loaded value instead of the address. In the +  // future if multi-location support is added to the IR, it might be +  // preferable to keep tracking both the loaded value and the original +  // address in case the alloca can not be elided. +  Instruction *DbgValue = Builder.insertDbgValueIntrinsic( +      LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr); +  DbgValue->insertAfter(LI);    return true;  } @@ -1034,8 +1093,8 @@ bool llvm::LowerDbgDeclare(Function &F) {    DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);    SmallVector<DbgDeclareInst *, 4> Dbgs;    for (auto &FI : F) -    for (BasicBlock::iterator BI : FI) -      if (auto DDI = dyn_cast<DbgDeclareInst>(BI)) +    for (Instruction &BI : FI) +      if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))          Dbgs.push_back(DDI);    if (Dbgs.empty()) @@ -1060,9 +1119,13 @@ bool llvm::LowerDbgDeclare(Function &F) {            // This is a call by-value or some other instruction that            // takes a pointer to the variable. Insert a *value*            // intrinsic that describes the alloca. 
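
ConvertDebugDeclareToDebugValue above now describes the loaded SSA value rather than the alloca's address, and places the dbg.value right after the load so the new location is live from that point on. As a sketch; the helper name, parameter types, and includes are assumptions:

#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void trackLoadedValue(LoadInst *LI, DILocalVariable *DIVar,
                             DIExpression *DIExpr, const DebugLoc &Loc,
                             DIBuilder &DIB) {
  // Build the intrinsic with no insertion point, then place it immediately
  // after the load it describes.
  Instruction *DbgValue = DIB.insertDbgValueIntrinsic(
      LI, 0, DIVar, DIExpr, Loc, static_cast<Instruction *>(nullptr));
  DbgValue->insertAfter(LI);
}
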
+          SmallVector<uint64_t, 1> NewDIExpr; +          auto *DIExpr = DDI->getExpression(); +          NewDIExpr.push_back(dwarf::DW_OP_deref); +          NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());            DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), -                                      DDI->getExpression(), DDI->getDebugLoc(), -                                      CI); +                                      DIB.createExpression(NewDIExpr), +                                      DDI->getDebugLoc(), CI);          }        DDI->eraseFromParent();      } @@ -1082,9 +1145,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {    return nullptr;  } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, -                                      DIBuilder &Builder, bool Deref) { -  DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, +                             Instruction *InsertBefore, DIBuilder &Builder, +                             bool Deref, int Offset) { +  DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address);    if (!DDI)      return false;    DebugLoc Loc = DDI->getDebugLoc(); @@ -1092,29 +1156,40 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,    auto *DIExpr = DDI->getExpression();    assert(DIVar && "Missing variable"); -  if (Deref) { +  if (Deref || Offset) {      // Create a copy of the original DIDescriptor for user variable, prepending      // "deref" operation to a list of address elements, as new llvm.dbg.declare      // will take a value storing address of the memory for variable, not      // alloca itself.      SmallVector<uint64_t, 4> NewDIExpr; -    NewDIExpr.push_back(dwarf::DW_OP_deref); +    if (Deref) +      NewDIExpr.push_back(dwarf::DW_OP_deref); +    if (Offset > 0) { +      NewDIExpr.push_back(dwarf::DW_OP_plus); +      NewDIExpr.push_back(Offset); +    } else if (Offset < 0) { +      NewDIExpr.push_back(dwarf::DW_OP_minus); +      NewDIExpr.push_back(-Offset); +    }      if (DIExpr)        NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());      DIExpr = Builder.createExpression(NewDIExpr);    } -  // Insert llvm.dbg.declare in the same basic block as the original alloca, -  // and remove old llvm.dbg.declare. -  BasicBlock *BB = AI->getParent(); -  Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, BB); +  // Insert llvm.dbg.declare immediately after the original alloca, and remove +  // old llvm.dbg.declare. +  Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);    DDI->eraseFromParent();    return true;  } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, +                                      DIBuilder &Builder, bool Deref, int Offset) { +  return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, +                           Deref, Offset); +} + +void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) {    BasicBlock *BB = I->getParent();    // Loop over all of the successors, removing BB's entry from any PHI    // nodes. 
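
replaceDbgDeclare above (and LowerDbgDeclare's by-value case) build the new DIExpression by prepending a deref and/or a constant offset to the old one. The expression-building step as a sketch; the helper name and includes are assumptions:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/Support/Dwarf.h"
using namespace llvm;

static DIExpression *prependDerefAndOffset(DIBuilder &DIB, DIExpression *Old,
                                           bool Deref, int Offset) {
  SmallVector<uint64_t, 4> Ops;
  if (Deref)
    Ops.push_back(dwarf::DW_OP_deref);
  if (Offset > 0) {
    Ops.push_back(dwarf::DW_OP_plus);
    Ops.push_back(Offset);
  } else if (Offset < 0) {
    Ops.push_back(dwarf::DW_OP_minus);
    Ops.push_back(-Offset);
  }
  // Keep whatever the original expression already computed.
  if (Old)
    Ops.append(Old->elements_begin(), Old->elements_end());
  return DIB.createExpression(Ops);
}
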
@@ -1132,7 +1207,7 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {    new UnreachableInst(I->getContext(), I);    // All instructions after this are dead. -  BasicBlock::iterator BBI = I, BBE = BB->end(); +  BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();    while (BBI != BBE) {      if (!BBI->use_empty())        BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); @@ -1142,8 +1217,11 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {  /// changeToCall - Convert the specified invoke into a normal call.  static void changeToCall(InvokeInst *II) { -  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); -  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); +  SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); +  SmallVector<OperandBundleDef, 1> OpBundles; +  II->getOperandBundlesAsDefs(OpBundles); +  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, +                                       "", II);    NewCall->takeName(II);    NewCall->setCallingConv(II->getCallingConv());    NewCall->setAttributes(II->getAttributes()); @@ -1162,7 +1240,7 @@ static bool markAliveBlocks(Function &F,                              SmallPtrSetImpl<BasicBlock*> &Reachable) {    SmallVector<BasicBlock*, 128> Worklist; -  BasicBlock *BB = F.begin(); +  BasicBlock *BB = &F.front();    Worklist.push_back(BB);    Reachable.insert(BB);    bool Changed = false; @@ -1187,7 +1265,7 @@ static bool markAliveBlocks(Function &F,            if (MakeUnreachable) {              // Don't insert a call to llvm.trap right before the unreachable. -            changeToUnreachable(BBI, false); +            changeToUnreachable(&*BBI, false);              Changed = true;              break;            } @@ -1201,7 +1279,7 @@ static bool markAliveBlocks(Function &F,            ++BBI;            if (!isa<UnreachableInst>(BBI)) {              // Don't insert a call to llvm.trap right before the unreachable. -            changeToUnreachable(BBI, false); +            changeToUnreachable(&*BBI, false);              Changed = true;            }            break; @@ -1253,6 +1331,40 @@ static bool markAliveBlocks(Function &F,    return Changed;  } +void llvm::removeUnwindEdge(BasicBlock *BB) { +  TerminatorInst *TI = BB->getTerminator(); + +  if (auto *II = dyn_cast<InvokeInst>(TI)) { +    changeToCall(II); +    return; +  } + +  TerminatorInst *NewTI; +  BasicBlock *UnwindDest; + +  if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { +    NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI); +    UnwindDest = CRI->getUnwindDest(); +  } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { +    auto *NewCatchSwitch = CatchSwitchInst::Create( +        CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(), +        CatchSwitch->getName(), CatchSwitch); +    for (BasicBlock *PadBB : CatchSwitch->handlers()) +      NewCatchSwitch->addHandler(PadBB); + +    NewTI = NewCatchSwitch; +    UnwindDest = CatchSwitch->getUnwindDest(); +  } else { +    llvm_unreachable("Could not find unwind successor"); +  } + +  NewTI->takeName(TI); +  NewTI->setDebugLoc(TI->getDebugLoc()); +  UnwindDest->removePredecessor(BB); +  TI->replaceAllUsesWith(NewTI); +  TI->eraseFromParent(); +} +  /// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even  /// if they are in a dead cycle.  Return true if a change was made, false  /// otherwise. 
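
markAliveBlocks above is a plain worklist walk from the entry block; removeUnreachableBlocks then drops everything the walk never reached. A small self-contained sketch of that reachability pass over a toy CFG (indices stand in for basic blocks; markReachable is a hypothetical helper, not LLVM code):

#include <cstddef>
#include <vector>

std::vector<bool> markReachable(const std::vector<std::vector<size_t>> &Succs,
                                size_t Entry) {
  std::vector<bool> Reachable(Succs.size(), false);
  std::vector<size_t> Worklist{Entry};
  Reachable[Entry] = true;
  while (!Worklist.empty()) {
    size_t BB = Worklist.back();
    Worklist.pop_back();
    for (size_t S : Succs[BB])
      if (!Reachable[S]) {       // visit each successor once
        Reachable[S] = true;
        Worklist.push_back(S);
      }
  }
  return Reachable;              // blocks still 'false' can be erased
}
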
@@ -1270,17 +1382,18 @@ bool llvm::removeUnreachableBlocks(Function &F) {    // Loop over all of the basic blocks that are not reachable, dropping all of    // their internal references...    for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { -    if (Reachable.count(BB)) +    if (Reachable.count(&*BB))        continue; -    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) +    for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; +         ++SI)        if (Reachable.count(*SI)) -        (*SI)->removePredecessor(BB); +        (*SI)->removePredecessor(&*BB);      BB->dropAllReferences();    }    for (Function::iterator I = ++F.begin(); I != F.end();) -    if (!Reachable.count(I)) +    if (!Reachable.count(&*I))        I = F.getBasicBlockList().erase(I);      else        ++I; @@ -1288,9 +1401,10 @@ bool llvm::removeUnreachableBlocks(Function &F) {    return true;  } -void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs) { +void llvm::combineMetadata(Instruction *K, const Instruction *J, +                           ArrayRef<unsigned> KnownIDs) {    SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; -  K->dropUnknownMetadata(KnownIDs); +  K->dropUnknownNonDebugMetadata(KnownIDs);    K->getAllMetadataOtherThanDebugLoc(Metadata);    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {      unsigned Kind = Metadata[i].first; @@ -1326,8 +1440,29 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign          // Only set the !nonnull if it is present in both instructions.          K->setMetadata(Kind, JMD);          break; +      case LLVMContext::MD_invariant_group: +        // Preserve !invariant.group in K. +        break; +      case LLVMContext::MD_align: +        K->setMetadata(Kind,  +          MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); +        break; +      case LLVMContext::MD_dereferenceable: +      case LLVMContext::MD_dereferenceable_or_null: +        K->setMetadata(Kind,  +          MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); +        break;      }    } +  // Set !invariant.group from J if J has it. If both instructions have it +  // then we will just pick it from J - even when they are different. +  // Also make sure that K is load or store - f.e. combining bitcast with load +  // could produce bitcast with invariant.group metadata, which is invalid. +  // FIXME: we should try to preserve both invariant.group md if they are +  // different, but right now instruction can only have one invariant.group. 
+  if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group)) +    if (isa<LoadInst>(K) || isa<StoreInst>(K)) +      K->setMetadata(LLVMContext::MD_invariant_group, JMD);  }  unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, @@ -1349,3 +1484,40 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,    }    return Count;  } + +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, +                                        DominatorTree &DT, +                                        const BasicBlock *BB) { +  assert(From->getType() == To->getType()); + +  unsigned Count = 0; +  for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); +       UI != UE;) { +    Use &U = *UI++; +    auto *I = cast<Instruction>(U.getUser()); +    if (DT.dominates(BB, I->getParent())) { +      U.set(To); +      DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " +                   << *To << " in " << *U << "\n"); +      ++Count; +    } +  } +  return Count; +} + +bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { +  if (isa<IntrinsicInst>(CS.getInstruction())) +    // Most LLVM intrinsics are things which can never take a safepoint. +    // As a result, we don't need to have the stack parsable at the +    // callsite.  This is a highly useful optimization since intrinsic +    // calls are fairly prevalent, particularly in debug builds. +    return true; + +  // Check if the function is specifically marked as a gc leaf function. +  // +  // TODO: we should be checking the attributes on the call site as well. +  if (const Function *F = CS.getCalledFunction()) +    return F->hasFnAttribute("gc-leaf-function"); + +  return false; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 5c98043e4632..1fa469595d16 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -44,11 +44,14 @@  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" @@ -78,7 +81,7 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB,                                       SmallVectorImpl<BasicBlock *> &SplitPreds,                                       Loop *L) {    // Check to see if NewBB is already well placed. -  Function::iterator BBI = NewBB; --BBI; +  Function::iterator BBI = --NewBB->getIterator();    for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {      if (&*BBI == SplitPreds[i])        return; @@ -92,9 +95,8 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB,    // block that neighbors a BB actually in the loop.    
BasicBlock *FoundBB = nullptr;    for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { -    Function::iterator BBI = SplitPreds[i]; -    if (++BBI != NewBB->getParent()->end() && -        L->contains(BBI)) { +    Function::iterator BBI = SplitPreds[i]->getIterator(); +    if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {        FoundBB = SplitPreds[i];        break;      } @@ -112,17 +114,10 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB,  /// preheader, this method is called to insert one.  This method has two phases:  /// preheader insertion and analysis updating.  /// -BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, +                                         LoopInfo *LI, bool PreserveLCSSA) {    BasicBlock *Header = L->getHeader(); -  // Get analyses that we try to update. -  auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); -  auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); -  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; -  auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); -  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; -  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); -    // Compute the set of predecessors of the loop that are not in the loop.    SmallVector<BasicBlock*, 8> OutsideBlocks;    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -141,8 +136,10 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {    // Split out the loop pre-header.    BasicBlock *PreheaderBB; -  PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", -                                       AA, DT, LI, PreserveLCSSA); +  PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT, +                                       LI, PreserveLCSSA); +  if (!PreheaderBB) +    return nullptr;    DEBUG(dbgs() << "LoopSimplify: Creating pre-header "                 << PreheaderBB->getName() << "\n"); @@ -159,8 +156,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {  /// This method is used to split exit blocks that have predecessors outside of  /// the loop.  
static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, -                                        AliasAnalysis *AA, DominatorTree *DT, -                                        LoopInfo *LI, Pass *PP) { +                                        DominatorTree *DT, LoopInfo *LI, +                                        bool PreserveLCSSA) {    SmallVector<BasicBlock*, 8> LoopBlocks;    for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {      BasicBlock *P = *I; @@ -175,10 +172,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit,    assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");    BasicBlock *NewExitBB = nullptr; -  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - -  NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, -                                     LI, PreserveLCSSA); +  NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, +                                     PreserveLCSSA); +  if (!NewExitBB) +    return nullptr;    DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "                 << NewExitBB->getName() << "\n"); @@ -206,8 +203,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,  /// \brief The first part of loop-nestification is to find a PHI node that tells  /// us how to partition the loops. -static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, -                                        DominatorTree *DT, +static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,                                          AssumptionCache *AC) {    const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();    for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { @@ -216,7 +212,6 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,      if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {        // This is a degenerate PHI already, don't modify it!        PN->replaceAllUsesWith(V); -      if (AA) AA->deleteValue(PN);        PN->eraseFromParent();        continue;      } @@ -251,18 +246,18 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,  /// created.  ///  static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, -                                AliasAnalysis *AA, DominatorTree *DT, -                                LoopInfo *LI, ScalarEvolution *SE, Pass *PP, +                                DominatorTree *DT, LoopInfo *LI, +                                ScalarEvolution *SE, bool PreserveLCSSA,                                  AssumptionCache *AC) {    // Don't try to separate loops without a preheader.    if (!Preheader)      return nullptr;    // The header is not a landing pad; preheader insertion should ensure this. -  assert(!L->getHeader()->isLandingPad() && -         "Can't insert backedge to landing pad"); +  BasicBlock *Header = L->getHeader(); +  assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); -  PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC); +  PHINode *PN = findPHIToPartitionLoops(L, DT, AC);    if (!PN) return nullptr;  // No known way to partition.    // Pull out all predecessors that have varying values in the loop.  
This @@ -286,11 +281,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,    if (SE)      SE->forgetLoop(L); -  bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - -  BasicBlock *Header = L->getHeader();    BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", -                                             AA, DT, LI, PreserveLCSSA); +                                             DT, LI, PreserveLCSSA);    // Make sure that NewBB is put someplace intelligent, which doesn't mess up    // code layout too horribly. @@ -357,7 +349,6 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,  /// and have that block branch to the loop header.  This ensures that loops  /// have exactly one backedge.  static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, -                                             AliasAnalysis *AA,                                               DominatorTree *DT, LoopInfo *LI) {    assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); @@ -369,8 +360,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,    if (!Preheader)      return nullptr; -  // The header is not a landing pad; preheader insertion should ensure this. -  assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); +  // The header is not an EH pad; preheader insertion should ensure this. +  assert(!Header->isEHPad() && "Can't insert backedge to EH pad");    // Figure out which basic blocks contain back-edges to the loop header.    std::vector<BasicBlock*> BackedgeBlocks; @@ -394,7 +385,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,                 << BEBlock->getName() << "\n");    // Move the new backedge block to right after the last backedge block. -  Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos; +  Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();    F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);    // Now that the block has been inserted into the function, create PHI nodes in @@ -443,7 +434,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,      // eliminate the PHI Node.      if (HasUniqueIncomingValue) {        NewPN->replaceAllUsesWith(UniqueValue); -      if (AA) AA->deleteValue(NewPN);        BEBlock->getInstList().erase(NewPN);      }    } @@ -470,15 +460,10 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,  }  /// \brief Simplify one loop and queue further loops for simplification. -/// -/// FIXME: Currently this accepts both lots of analyses that it uses and a raw -/// Pass pointer. The Pass pointer is used by numerous utilities to update -/// specific analyses. Rather than a pass it would be much cleaner and more -/// explicit if they accepted the analysis directly and then updated it.  static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, -                            AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, -                            ScalarEvolution *SE, Pass *PP, -                            AssumptionCache *AC) { +                            DominatorTree *DT, LoopInfo *LI, +                            ScalarEvolution *SE, AssumptionCache *AC, +                            bool PreserveLCSSA) {    bool Changed = false;  ReprocessLoop: @@ -544,7 +529,7 @@ ReprocessLoop:    // Does the loop already have a preheader?  If so, don't insert one.    
BasicBlock *Preheader = L->getLoopPreheader();    if (!Preheader) { -    Preheader = InsertPreheaderForLoop(L, PP); +    Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);      if (Preheader) {        ++NumInserted;        Changed = true; @@ -568,7 +553,7 @@ ReprocessLoop:        // Must be exactly this loop: no subloops, parent loops, or non-loop preds        // allowed.        if (!L->contains(*PI)) { -        if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { +        if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) {            ++NumInserted;            Changed = true;          } @@ -585,7 +570,7 @@ ReprocessLoop:      // common backedge instead.      if (L->getNumBackEdges() < 8) {        if (Loop *OuterL = -              separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) { +              separateNestedLoop(L, Preheader, DT, LI, SE, PreserveLCSSA, AC)) {          ++NumNested;          // Enqueue the outer loop as it should be processed next in our          // depth-first nest walk. @@ -602,7 +587,7 @@ ReprocessLoop:      // If we either couldn't, or didn't want to, identify nesting of the loops,      // insert a new block that all backedges target, then make it jump to the      // loop header. -    LoopLatch = insertUniqueBackedgeBlock(L, Preheader, AA, DT, LI); +    LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI);      if (LoopLatch) {        ++NumInserted;        Changed = true; @@ -618,7 +603,6 @@ ReprocessLoop:    for (BasicBlock::iterator I = L->getHeader()->begin();         (PN = dyn_cast<PHINode>(I++)); )      if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { -      if (AA) AA->deleteValue(PN);        if (SE) SE->forgetValue(PN);        PN->replaceAllUsesWith(V);        PN->eraseFromParent(); @@ -654,7 +638,7 @@ ReprocessLoop:        bool AllInvariant = true;        bool AnyInvariant = false;        for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { -        Instruction *Inst = I++; +        Instruction *Inst = &*I++;          // Skip debug info intrinsics.          if (isa<DbgInfoIntrinsic>(Inst))            continue; @@ -716,9 +700,9 @@ ReprocessLoop:    return Changed;  } -bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, -                        AliasAnalysis *AA, ScalarEvolution *SE, -                        AssumptionCache *AC) { +bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, +                        ScalarEvolution *SE, AssumptionCache *AC, +                        bool PreserveLCSSA) {    bool Changed = false;    // Worklist maintains our depth-first queue of loops in this nest to process. @@ -734,8 +718,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,    }    while (!Worklist.empty()) -    Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, -                               SE, PP, AC); +    Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE, +                               AC, PreserveLCSSA);    return Changed;  } @@ -747,9 +731,6 @@ namespace {        initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());      } -    // AA - If we have an alias analysis object to update, this is it, otherwise -    // this is null. 
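
simplifyLoop above flattens the loop nest into a worklist (parents appended before their subloops) and then pops from the back, so every loop is processed only after all of the loops it contains. The sketch below mirrors that ordering with stand-in types and ignores the re-enqueueing that separateNestedLoop can trigger; ToyLoop and SimplifyOne are illustrative names only:

#include <cstddef>
#include <functional>
#include <vector>

struct ToyLoop {
  std::vector<ToyLoop *> SubLoops;
};

bool simplifyNest(ToyLoop *Root,
                  const std::function<bool(ToyLoop *)> &SimplifyOne) {
  bool Changed = false;
  std::vector<ToyLoop *> Worklist{Root};
  for (size_t Idx = 0; Idx != Worklist.size(); ++Idx)  // flatten the nest
    for (ToyLoop *Sub : Worklist[Idx]->SubLoops)
      Worklist.push_back(Sub);
  while (!Worklist.empty()) {                          // subloops before parents
    ToyLoop *L = Worklist.back();
    Worklist.pop_back();
    Changed |= SimplifyOne(L);
  }
  return Changed;
}
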
-    AliasAnalysis *AA;      DominatorTree *DT;      LoopInfo *LI;      ScalarEvolution *SE; @@ -767,8 +748,11 @@ namespace {        AU.addRequired<LoopInfoWrapperPass>();        AU.addPreserved<LoopInfoWrapperPass>(); -      AU.addPreserved<AliasAnalysis>(); -      AU.addPreserved<ScalarEvolution>(); +      AU.addPreserved<BasicAAWrapperPass>(); +      AU.addPreserved<AAResultsWrapperPass>(); +      AU.addPreserved<GlobalsAAWrapperPass>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>(); +      AU.addPreserved<SCEVAAWrapperPass>();        AU.addPreserved<DependenceAnalysis>();        AU.addPreservedID(BreakCriticalEdgesID);  // No critical edges added.      } @@ -784,6 +768,9 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)  INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",                  "Canonicalize natural loops", false, false) @@ -796,15 +783,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }  ///  bool LoopSimplify::runOnFunction(Function &F) {    bool Changed = false; -  AA = getAnalysisIfAvailable<AliasAnalysis>();    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -  SE = getAnalysisIfAvailable<ScalarEvolution>(); +  auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); +  SE = SEWP ? &SEWP->getSE() : nullptr;    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); +  bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);    // Simplify each loop nest in the function.    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) -    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC); +    Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA);    return Changed;  } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1dbce4746835..2499b88741fe 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -73,7 +73,7 @@ static inline void RemapInstruction(Instruction *I,  /// of loops that have already been forgotten to prevent redundant, expensive  /// calls to ScalarEvolution::forgetLoop.  Returns the new combined block.  static BasicBlock * -FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, +FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,                           SmallPtrSetImpl<Loop *> &ForgottenLoops) {    // Merge basic blocks into their predecessor if there is only one distinct    // pred, and if there is only one distinct successor of the predecessor, and @@ -109,12 +109,10 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM,    // Erase basic block from the function...    // ScalarEvolution holds references to loop exit blocks. 
-  if (LPM) { -    if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { -      if (Loop *L = LI->getLoopFor(BB)) { -        if (ForgottenLoops.insert(L).second) -          SE->forgetLoop(L); -      } +  if (SE) { +    if (Loop *L = LI->getLoopFor(BB)) { +      if (ForgottenLoops.insert(L).second) +        SE->forgetLoop(L);      }    }    LI->removeBlock(BB); @@ -155,15 +153,13 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM,  ///  /// The LoopInfo Analysis that is passed will be kept consistent.  /// -/// If a LoopPassManager is passed in, and the loop is fully removed, it will be -/// removed from the LoopPassManager as well. LPM can also be NULL. -/// -/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are -/// available from the Pass it must also preserve those analyses. +/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and +/// DominatorTree if they are non-null.  bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,                        bool AllowRuntime, bool AllowExpensiveTripCount, -                      unsigned TripMultiple, LoopInfo *LI, Pass *PP, -                      LPPassManager *LPM, AssumptionCache *AC) { +                      unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, +                      DominatorTree *DT, AssumptionCache *AC, +                      bool PreserveLCSSA) {    BasicBlock *Preheader = L->getLoopPreheader();    if (!Preheader) {      DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n"); @@ -220,6 +216,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,    // Are we eliminating the loop control altogether?    bool CompletelyUnroll = Count == TripCount; +  SmallVector<BasicBlock *, 4> ExitBlocks; +  L->getExitBlocks(ExitBlocks); +  Loop *ParentL = L->getParentLoop(); +  bool AllExitsAreInsideParentLoop = !ParentL || +      std::all_of(ExitBlocks.begin(), ExitBlocks.end(), +                  [&](BasicBlock *BB) { return ParentL->contains(BB); });    // We assume a run-time trip count if the compiler cannot    // figure out the loop trip count and the unroll-runtime @@ -227,13 +229,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,    bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);    if (RuntimeTripCount && -      !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, LPM)) +      !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, +                               PreserveLCSSA))      return false;    // Notify ScalarEvolution that the loop will be substantially changed,    // if not outright eliminated. -  ScalarEvolution *SE = -      PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr;    if (SE)      SE->forgetLoop(L); @@ -392,7 +393,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,      for (unsigned i = 0; i < NewBlocks.size(); ++i)        for (BasicBlock::iterator I = NewBlocks[i]->begin(),             E = NewBlocks[i]->end(); I != E; ++I) -        ::RemapInstruction(I, LastValueMap); +        ::RemapInstruction(&*I, LastValueMap);    }    // Loop over the PHI nodes in the original block, setting incoming values. @@ -432,8 +433,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,      // For a complete unroll, make the last iteration end with a branch      // to the exit block. 
-    if (CompletelyUnroll && j == 0) { -      Dest = LoopExit; +    if (CompletelyUnroll) { +      if (j == 0) +        Dest = LoopExit;        NeedConditional = false;      } @@ -473,7 +475,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,      BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());      if (Term->isUnconditional()) {        BasicBlock *Dest = Term->getSuccessor(0); -      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, +      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE,                                                        ForgottenLoops))          std::replace(Latches.begin(), Latches.end(), Dest, Fold);      } @@ -483,29 +485,24 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,    // whole function's cache.    AC->clear(); -  DominatorTree *DT = nullptr; -  if (PP) { -    // FIXME: Reconstruct dom info, because it is not preserved properly. -    // Incrementally updating domtree after loop unrolling would be easy. -    if (DominatorTreeWrapperPass *DTWP = -            PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { -      DT = &DTWP->getDomTree(); -      DT->recalculate(*L->getHeader()->getParent()); -    } - -    // Simplify any new induction variables in the partially unrolled loop. -    if (SE && !CompletelyUnroll) { -      SmallVector<WeakVH, 16> DeadInsts; -      simplifyLoopIVs(L, SE, LPM, DeadInsts); - -      // Aggressively clean up dead instructions that simplifyLoopIVs already -      // identified. Any remaining should be cleaned up below. -      while (!DeadInsts.empty()) -        if (Instruction *Inst = -            dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) -          RecursivelyDeleteTriviallyDeadInstructions(Inst); -    } +  // FIXME: Reconstruct dom info, because it is not preserved properly. +  // Incrementally updating domtree after loop unrolling would be easy. +  if (DT) +    DT->recalculate(*L->getHeader()->getParent()); + +  // Simplify any new induction variables in the partially unrolled loop. +  if (SE && !CompletelyUnroll) { +    SmallVector<WeakVH, 16> DeadInsts; +    simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + +    // Aggressively clean up dead instructions that simplifyLoopIVs already +    // identified. Any remaining should be cleaned up below. +    while (!DeadInsts.empty()) +      if (Instruction *Inst = +              dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) +        RecursivelyDeleteTriviallyDeadInstructions(Inst);    } +    // At this point, the code is well formed.  We now do a quick sweep over the    // inserted code, doing constant propagation and dead code elimination as we    // go. @@ -514,7 +511,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,    for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),         BBE = NewLoopBlocks.end(); BB != BBE; ++BB)      for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { -      Instruction *Inst = I++; +      Instruction *Inst = &*I++;        if (isInstructionTriviallyDead(Inst))          (*BB)->getInstList().erase(Inst); @@ -529,29 +526,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,    ++NumUnrolled;    Loop *OuterL = L->getParentLoop(); -  // Remove the loop from the LoopPassManager if it's completely removed. -  if (CompletelyUnroll && LPM != nullptr) -    LPM->deleteLoopFromQueue(L); +  // Update LoopInfo if the loop is completely removed. 
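
The complete-unroll hunk above drops the conditional branch from every unrolled copy, not just the copy that feeds the exit. The source-level picture, for an assumed trip count of 4, is that the loop control vanishes and only straight-line copies of the body remain (illustrative C++ only, not the generated IR):

static long Sum = 0;
static void body(int I) { Sum += I; }  // stand-in for the loop body

void completelyUnrolled() {
  // for (int I = 0; I < 4; ++I) body(I);  becomes straight-line code:
  body(0);
  body(1);
  body(2);
  body(3);   // no back-branch remains after a complete unroll
}
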
+  if (CompletelyUnroll) +    LI->updateUnloop(L);;    // If we have a pass and a DominatorTree we should re-simplify impacted loops    // to ensure subsequent analyses can rely on this form. We want to simplify    // at least one layer outside of the loop that was unrolled so that any    // changes to the parent loop exposed by the unrolling are considered. -  if (PP && DT) { +  if (DT) {      if (!OuterL && !CompletelyUnroll)        OuterL = L;      if (OuterL) { -      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC); +      bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);        // LCSSA must be performed on the outermost affected loop. The unrolled        // loop's last loop latch is guaranteed to be in the outermost loop after -      // deleteLoopFromQueue updates LoopInfo. +      // LoopInfo's been updated by updateUnloop.        Loop *LatchLoop = LI->getLoopFor(Latches.back());        if (!OuterL->contains(LatchLoop))          while (OuterL->getParentLoop() != LatchLoop)            OuterL = OuterL->getParentLoop(); -      formLCSSARecursively(*OuterL, *DT, LI, SE); +      if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified)) +        formLCSSARecursively(*OuterL, *DT, LI, SE); +      else +        assert(OuterL->isLCSSAForm(*DT) && +               "Loops should be in LCSSA form after loop-unroll.");      }    } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index add5432aa276..0d68f18ad0e5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -62,8 +62,8 @@ STATISTIC(NumRuntimeUnrolled,  static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,                            BasicBlock *LastPrologBB, BasicBlock *PrologEnd,                            BasicBlock *OrigPH, BasicBlock *NewPH, -                          ValueToValueMapTy &VMap, AliasAnalysis *AA, -                          DominatorTree *DT, LoopInfo *LI, Pass *P) { +                          ValueToValueMapTy &VMap, DominatorTree *DT, +                          LoopInfo *LI, bool PreserveLCSSA) {    BasicBlock *Latch = L->getLoopLatch();    assert(Latch && "Loop must have a latch"); @@ -127,8 +127,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,    assert(Exit && "Loop must have a single exit block only");    // Split the exit to maintain loop canonicalization guarantees    SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); -  SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, -                         P->mustPreserveAnalysisID(LCSSAID)); +  SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, +                         PreserveLCSSA);    // Add the branch to the exit block (around the unrolled loop)    B.CreateCondBr(BrLoopExit, Exit, NewPH);    InsertPt->eraseFromParent(); @@ -150,7 +150,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,    Function *F = Header->getParent();    LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();    LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); -  Loop *NewLoop = 0; +  Loop *NewLoop = nullptr;    Loop *ParentLoop = L->getParentLoop();    if (!UnrollProlog) {      NewLoop = new Loop(); @@ -206,9 +206,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,    // Change the incoming values to the ones defined in the preheader or    // 
cloned loop.    for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { -    PHINode *NewPHI = cast<PHINode>(VMap[I]); +    PHINode *NewPHI = cast<PHINode>(VMap[&*I]);      if (UnrollProlog) { -      VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); +      VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);        cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);      } else {        unsigned idx = NewPHI->getBasicBlockIndex(Preheader); @@ -279,7 +279,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,  ///  bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,                                     bool AllowExpensiveTripCount, LoopInfo *LI, -                                   LPPassManager *LPM) { +                                   ScalarEvolution *SE, DominatorTree *DT, +                                   bool PreserveLCSSA) {    // for now, only unroll loops that contain a single exit    if (!L->getExitingBlock())      return false; @@ -291,9 +292,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,    // Use Scalar Evolution to compute the trip count.  This allows more    // loops to be unrolled than relying on induction var simplification -  if (!LPM) -    return false; -  ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();    if (!SE)      return false; @@ -308,7 +306,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,    // Add 1 since the backedge count doesn't include the first loop iteration    const SCEV *TripCountSC = -    SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); +      SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));    if (isa<SCEVCouldNotCompute>(TripCountSC))      return false; @@ -333,10 +331,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,    if (Loop *ParentLoop = L->getParentLoop())      SE->forgetLoop(ParentLoop); -  // Grab analyses that we preserve. -  auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); -  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; -    BasicBlock *PH = L->getLoopPreheader();    BasicBlock *Latch = L->getLoopLatch();    // It helps to splits the original preheader twice, one for the end of the @@ -397,8 +391,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,                    VMap, LI);    // Insert the cloned blocks into function just before the original loop -  F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], -                                F->end()); +  F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), +                                NewBlocks[0]->getIterator(), F->end());    // Rewrite the cloned instruction operands to use the values    // created when the clone is created. @@ -406,7 +400,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,      for (BasicBlock::iterator I = NewBlocks[i]->begin(),                                E = NewBlocks[i]->end();           I != E; ++I) { -      RemapInstruction(I, VMap, +      RemapInstruction(&*I, VMap,                         RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);      }    } @@ -414,8 +408,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,    // Connect the prolog code to the original loop and update the    // PHI functions.    
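
UnrollRuntimeLoopProlog / ConnectProlog above arrange for a prolog copy of the loop to run the TripCount % Count leftover iterations first, so the main loop can then be unrolled by Count with no exit tests between the copies. A plain C++ sketch of that shape, assuming an unroll count of 4 (work and runtimeUnrolledBy4 are illustrative names, not LLVM output):

static long Total = 0;
static void work(int I) { Total += I; }   // stand-in for the loop body

void runtimeUnrolledBy4(int TripCount) {
  int I = 0;
  int Prolog = TripCount % 4;      // leftover iterations run first
  for (; I < Prolog; ++I)
    work(I);
  for (; I < TripCount; I += 4) {  // remaining count is a multiple of 4
    work(I);
    work(I + 1);
    work(I + 2);
    work(I + 3);
  }
}
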
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); -  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, -                /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); +  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, +                PreserveLCSSA);    NumRuntimeUnrolled++;    return true;  } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp index 5cbde94a98ed..e03880526bfa 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -12,13 +12,13 @@  //===----------------------------------------------------------------------===//  #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h"  #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h"  #include "llvm/IR/PatternMatch.h"  #include "llvm/IR/ValueHandle.h"  #include "llvm/Support/Debug.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  using namespace llvm; @@ -34,6 +34,124 @@ bool RecurrenceDescriptor::areAllUsesIn(Instruction *I,    return true;  } +bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) { +  switch (Kind) { +  default: +    break; +  case RK_IntegerAdd: +  case RK_IntegerMult: +  case RK_IntegerOr: +  case RK_IntegerAnd: +  case RK_IntegerXor: +  case RK_IntegerMinMax: +    return true; +  } +  return false; +} + +bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) { +  return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind); +} + +bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { +  switch (Kind) { +  default: +    break; +  case RK_IntegerAdd: +  case RK_IntegerMult: +  case RK_FloatAdd: +  case RK_FloatMult: +    return true; +  } +  return false; +} + +Instruction * +RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, +                                     SmallPtrSetImpl<Instruction *> &Visited, +                                     SmallPtrSetImpl<Instruction *> &CI) { +  if (!Phi->hasOneUse()) +    return Phi; + +  const APInt *M = nullptr; +  Instruction *I, *J = cast<Instruction>(Phi->use_begin()->getUser()); + +  // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT +  // with a new integer type of the corresponding bit width. +  if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), +                           m_And(m_APInt(M), m_Instruction(I))))) { +    int32_t Bits = (*M + 1).exactLogBase2(); +    if (Bits > 0) { +      RT = IntegerType::get(Phi->getContext(), Bits); +      Visited.insert(Phi); +      CI.insert(J); +      return J; +    } +  } +  return Phi; +} + +bool RecurrenceDescriptor::getSourceExtensionKind( +    Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, +    SmallPtrSetImpl<Instruction *> &Visited, +    SmallPtrSetImpl<Instruction *> &CI) { + +  SmallVector<Instruction *, 8> Worklist; +  bool FoundOneOperand = false; +  unsigned DstSize = RT->getPrimitiveSizeInBits(); +  Worklist.push_back(Exit); + +  // Traverse the instructions in the reduction expression, beginning with the +  // exit value. 
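
lookThroughAnd above treats a single-use 'and' with a 2^k - 1 mask as evidence that the recurrence was type-promoted, using (*M + 1).exactLogBase2() to recover k. A standalone 64-bit equivalent of that mask test (hypothetical helper, not an APInt method): it returns k when Mask == 2^k - 1 with k > 0, and 0 otherwise.

#include <cstdint>

int maskedBitWidth(uint64_t Mask) {
  uint64_t P = Mask + 1;              // 2^k if Mask == 2^k - 1
  if (P == 0 || (P & (P - 1)) != 0)   // overflow, or not a power of two
    return 0;
  int K = 0;
  while ((uint64_t{1} << K) != P)
    ++K;
  return K;                           // inferred narrow bit width
}
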
+  while (!Worklist.empty()) { +    Instruction *I = Worklist.pop_back_val(); +    for (Use &U : I->operands()) { + +      // Terminate the traversal if the operand is not an instruction, or we +      // reach the starting value. +      Instruction *J = dyn_cast<Instruction>(U.get()); +      if (!J || J == Start) +        continue; + +      // Otherwise, investigate the operation if it is also in the expression. +      if (Visited.count(J)) { +        Worklist.push_back(J); +        continue; +      } + +      // If the operand is not in Visited, it is not a reduction operation, but +      // it does feed into one. Make sure it is either a single-use sign- or +      // zero-extend instruction. +      CastInst *Cast = dyn_cast<CastInst>(J); +      bool IsSExtInst = isa<SExtInst>(J); +      if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst)) +        return false; + +      // Ensure the source type of the extend is no larger than the reduction +      // type. It is not necessary for the types to be identical. +      unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); +      if (SrcSize > DstSize) +        return false; + +      // Furthermore, ensure that all such extends are of the same kind. +      if (FoundOneOperand) { +        if (IsSigned != IsSExtInst) +          return false; +      } else { +        FoundOneOperand = true; +        IsSigned = IsSExtInst; +      } + +      // Lastly, if the source type of the extend matches the reduction type, +      // add the extend to CI so that we can avoid accounting for it in the +      // cost model. +      if (SrcSize == DstSize) +        CI.insert(Cast); +    } +  } +  return true; +} +  bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,                                             Loop *TheLoop, bool HasFunNoNaNAttr,                                             RecurrenceDescriptor &RedDes) { @@ -68,10 +186,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,    unsigned NumCmpSelectPatternInst = 0;    InstDesc ReduxDesc(false, nullptr); +  // Data used for determining if the recurrence has been type-promoted. +  Type *RecurrenceType = Phi->getType(); +  SmallPtrSet<Instruction *, 4> CastInsts; +  Instruction *Start = Phi; +  bool IsSigned = false; +    SmallPtrSet<Instruction *, 8> VisitedInsts;    SmallVector<Instruction *, 8> Worklist; -  Worklist.push_back(Phi); -  VisitedInsts.insert(Phi); + +  // Return early if the recurrence kind does not match the type of Phi. If the +  // recurrence kind is arithmetic, we attempt to look through AND operations +  // resulting from the type promotion performed by InstCombine.  Vector +  // operations are not limited to the legal integer widths, so we may be able +  // to evaluate the reduction in the narrower width. +  if (RecurrenceType->isFloatingPointTy()) { +    if (!isFloatingPointRecurrenceKind(Kind)) +      return false; +  } else { +    if (!isIntegerRecurrenceKind(Kind)) +      return false; +    if (isArithmeticRecurrenceKind(Kind)) +      Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts); +  } + +  Worklist.push_back(Start); +  VisitedInsts.insert(Start);    // A value in the reduction can be used:    //  - By the reduction: @@ -110,10 +250,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,          !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))        return false; -    // Any reduction instruction must be of one of the allowed kinds. 
-    ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); -    if (!ReduxDesc.isRecurrence()) -      return false; +    // Any reduction instruction must be of one of the allowed kinds. We ignore +    // the starting value (the Phi or an AND instruction if the Phi has been +    // type-promoted). +    if (Cur != Start) { +      ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); +      if (!ReduxDesc.isRecurrence()) +        return false; +    }      // A reduction operation must only have one use of the reduction value.      if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && @@ -131,7 +275,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,        ++NumCmpSelectPatternInst;      // Check  whether we found a reduction operator. -    FoundReduxOp |= !IsAPhi; +    FoundReduxOp |= !IsAPhi && Cur != Start;      // Process users of current instruction. Push non-PHI nodes after PHI nodes      // onto the stack. This way we are going to have seen all inputs to PHI @@ -193,6 +337,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,    if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)      return false; +  // If we think Phi may have been type-promoted, we also need to ensure that +  // all source operands of the reduction are either SExtInsts or ZEstInsts. If +  // so, we will be able to evaluate the reduction in the narrower bit width. +  if (Start != Phi) +    if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, +                                IsSigned, VisitedInsts, CastInsts)) +      return false; +    // We found a reduction var if we have reached the original phi node and we    // only have a single instruction with out-of-loop users. @@ -200,9 +352,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,    // is saved as part of the RecurrenceDescriptor.    // Save the description of this reduction variable. -  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, -                          ReduxDesc.getMinMaxKind()); - +  RecurrenceDescriptor RD( +      RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(), +      ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts);    RedDes = RD;    return true; @@ -263,14 +415,14 @@ RecurrenceDescriptor::InstDesc  RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,                                          InstDesc &Prev, bool HasFunNoNaNAttr) {    bool FP = I->getType()->isFloatingPointTy(); -  bool FastMath = FP && I->hasUnsafeAlgebra(); +  Instruction *UAI = Prev.getUnsafeAlgebraInst(); +  if (!UAI && FP && !I->hasUnsafeAlgebra()) +    UAI = I; // Found an unsafe (unvectorizable) algebra instruction. 
+    switch (I->getOpcode()) {    default:      return InstDesc(false, I);    case Instruction::PHI: -    if (FP && -        (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) -      return InstDesc(false, I);      return InstDesc(I, Prev.getMinMaxKind());    case Instruction::Sub:    case Instruction::Add: @@ -284,10 +436,10 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,    case Instruction::Xor:      return InstDesc(Kind == RK_IntegerXor, I);    case Instruction::FMul: -    return InstDesc(Kind == RK_FloatMult && FastMath, I); +    return InstDesc(Kind == RK_FloatMult, I, UAI);    case Instruction::FSub:    case Instruction::FAdd: -    return InstDesc(Kind == RK_FloatAdd && FastMath, I); +    return InstDesc(Kind == RK_FloatAdd, I, UAI);    case Instruction::FCmp:    case Instruction::ICmp:    case Instruction::Select: @@ -442,6 +594,13 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,      break;    } +  // We only match FP sequences with unsafe algebra, so we can unconditionally +  // set it on any generated instructions. +  IRBuilder<>::FastMathFlagGuard FMFG(Builder); +  FastMathFlags FMF; +  FMF.setUnsafeAlgebra(); +  Builder.SetFastMathFlags(FMF); +    Value *Cmp;    if (RK == MRK_FloatMin || RK == MRK_FloatMax)      Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); @@ -452,8 +611,54 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,    return Select;  } -bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, -                          ConstantInt *&StepValue) { +InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, +                                         ConstantInt *Step) +  : StartValue(Start), IK(K), StepValue(Step) { +  assert(IK != IK_NoInduction && "Not an induction"); +  assert(StartValue && "StartValue is null"); +  assert(StepValue && !StepValue->isZero() && "StepValue is zero"); +  assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && +         "StartValue is not a pointer for pointer induction"); +  assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && +         "StartValue is not an integer for integer induction"); +  assert(StepValue->getType()->isIntegerTy() && +         "StepValue is not an integer"); +} + +int InductionDescriptor::getConsecutiveDirection() const { +  if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) +    return StepValue->getSExtValue(); +  return 0; +} + +Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const { +  switch (IK) { +  case IK_IntInduction: +    assert(Index->getType() == StartValue->getType() && +           "Index type does not match StartValue type"); +    if (StepValue->isMinusOne()) +      return B.CreateSub(StartValue, Index); +    if (!StepValue->isOne()) +      Index = B.CreateMul(Index, StepValue); +    return B.CreateAdd(StartValue, Index); + +  case IK_PtrInduction: +    assert(Index->getType() == StepValue->getType() && +           "Index type does not match StepValue type"); +    if (StepValue->isMinusOne()) +      Index = B.CreateNeg(Index); +    else if (!StepValue->isOne()) +      Index = B.CreateMul(Index, StepValue); +    return B.CreateGEP(nullptr, StartValue, Index); + +  case IK_NoInduction: +    return nullptr; +  } +  llvm_unreachable("invalid enum"); +} + +bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, +                                         InductionDescriptor &D) {    Type *PhiTy = 
Phi->getType();    // We only handle integer and pointer inductions variables.    if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -467,6 +672,10 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,      return false;    } +  assert(AR->getLoop()->getHeader() == Phi->getParent() && +         "PHI is an AddRec for a different loop?!"); +  Value *StartValue = +    Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());    const SCEV *Step = AR->getStepRecurrence(*SE);    // Calculate the pointer stride and check if it is consecutive.    const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -475,7 +684,7 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,    ConstantInt *CV = C->getValue();    if (PhiTy->isIntegerTy()) { -    StepValue = CV; +    D = InductionDescriptor(StartValue, IK_IntInduction, CV);      return true;    } @@ -494,6 +703,27 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,    int64_t CVSize = CV->getSExtValue();    if (CVSize % Size)      return false; -  StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); +  auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + +  D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue);    return true;  } + +/// \brief Returns the instructions that use values defined in the loop. +SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { +  SmallVector<Instruction *, 8> UsedOutside; + +  for (auto *Block : L->getBlocks()) +    // FIXME: I believe that this could use copy_if if the Inst reference could +    // be adapted into a pointer. +    for (auto &Inst : *Block) { +      auto Users = Inst.users(); +      if (std::any_of(Users.begin(), Users.end(), [&](User *U) { +            auto *Use = cast<Instruction>(U); +            return !L->contains(Use->getParent()); +          })) +        UsedOutside.push_back(&Inst); +    } + +  return UsedOutside; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 832079d2cf63..9a2a06cf6891 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -13,43 +13,81 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopVersioning.h"  #include "llvm/Analysis/LoopAccessAnalysis.h"  #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h"  #include "llvm/IR/Dominators.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopVersioning.h"  using namespace llvm;  LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, -                               DominatorTree *DT, -                               const SmallVector<int, 8> *PtrToPartition) -    : VersionedLoop(L), NonVersionedLoop(nullptr), -      PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) { +                               DominatorTree *DT, ScalarEvolution *SE, +                               bool UseLAIChecks) +    : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), +      SE(SE) {    assert(L->getExitBlock() && "No single exit block");    assert(L->getLoopPreheader() && "No preheader"); +  if (UseLAIChecks) { +    setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); +    setSCEVChecks(LAI.PSE.getUnionPredicate()); +  }  } -bool 
LoopVersioning::needsRuntimeChecks() const { -  return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition); +void LoopVersioning::setAliasChecks( +    const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) { +  AliasChecks = std::move(Checks);  } -void LoopVersioning::versionLoop(Pass *P) { +void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) { +  Preds = std::move(Check); +} + +void LoopVersioning::versionLoop( +    const SmallVectorImpl<Instruction *> &DefsUsedOutside) {    Instruction *FirstCheckInst;    Instruction *MemRuntimeCheck; +  Value *SCEVRuntimeCheck; +  Value *RuntimeCheck = nullptr; +    // Add the memcheck in the original preheader (this is empty initially). -  BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); +  BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();    std::tie(FirstCheckInst, MemRuntimeCheck) = -      LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); +      LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);    assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); +  const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); +  SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), +                   "scev.check"); +  SCEVRuntimeCheck = +      Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator()); +  auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); + +  // Discard the SCEV runtime check if it is always true. +  if (CI && CI->isZero()) +    SCEVRuntimeCheck = nullptr; + +  if (MemRuntimeCheck && SCEVRuntimeCheck) { +    RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, +                                          SCEVRuntimeCheck, "ldist.safe"); +    if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) +      I->insertBefore(RuntimeCheckBB->getTerminator()); +  } else +    RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; + +  assert(RuntimeCheck && "called even though we don't need " +                         "any runtime checks"); +    // Rename the block to make the IR more readable. -  MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck"); +  RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() + +                          ".lver.check");    // Create empty preheader for the loop (and after cloning for the    // non-versioned loop). -  BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); +  BasicBlock *PH = +      SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI);    PH->setName(VersionedLoop->getHeader()->getName() + ".ph");    // Clone the loop including the preheader. @@ -58,20 +96,23 @@ void LoopVersioning::versionLoop(Pass *P) {    // block is a join between the two loops.    SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;    NonVersionedLoop = -      cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig", -                             LI, DT, NonVersionedLoopBlocks); +      cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap, +                             ".lver.orig", LI, DT, NonVersionedLoopBlocks);    remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);    // Insert the conditional branch based on the result of the memchecks. 
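
versionLoop above emits one combined runtime check (the memory checks OR'ed with the SCEV predicate check) and branches to either the versioned loop or an untouched clone. The control-flow shape, sketched at source level with hypothetical helpers rather than cloned IR blocks:

#include <cstddef>
#include <cstdint>

// Where the checks hold, e.g. a vectorizable copy; illustrative only.
void copyFast(int *Dst, const int *Src, size_t N) {
  for (size_t I = 0; I < N; ++I) Dst[I] = Src[I];
}
// The original, conservative loop kept as the fallback version.
void copyFallback(int *Dst, const int *Src, size_t N) {
  for (size_t I = 0; I < N; ++I) Dst[I] = Src[I];
}

void copyVersioned(int *Dst, const int *Src, size_t N) {
  std::uintptr_t D = reinterpret_cast<std::uintptr_t>(Dst);
  std::uintptr_t S = reinterpret_cast<std::uintptr_t>(Src);
  // Runtime check: do the two ranges overlap?
  bool NoOverlap = D + N * sizeof(int) <= S || S + N * sizeof(int) <= D;
  if (NoOverlap)
    copyFast(Dst, Src, N);       // versioned loop, checks known to hold
  else
    copyFallback(Dst, Src, N);   // non-versioned loop
}
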
-  Instruction *OrigTerm = MemCheckBB->getTerminator(); +  Instruction *OrigTerm = RuntimeCheckBB->getTerminator();    BranchInst::Create(NonVersionedLoop->getLoopPreheader(), -                     VersionedLoop->getLoopPreheader(), MemRuntimeCheck, -                     OrigTerm); +                     VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);    OrigTerm->eraseFromParent();    // The loops merge in the original exit block.  This is now dominated by the    // memchecking block. -  DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); +  DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB); + +  // Adds the necessary PHI nodes for the versioned loops based on the +  // loop-defined values used outside of the loop. +  addPHINodes(DefsUsedOutside);  }  void LoopVersioning::addPHINodes( @@ -94,7 +135,7 @@ void LoopVersioning::addPHINodes(      // If not create it.      if (!PN) {        PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", -                           PHIBlock->begin()); +                           &PHIBlock->front());        for (auto *User : Inst->users())          if (!VersionedLoop->contains(cast<Instruction>(User)->getParent()))            User->replaceUsesOfWith(Inst, PN); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 66d57b069fe7..b0ad4d5e84a1 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -69,7 +69,7 @@ bool LowerInvoke::runOnFunction(Function &F) {        BranchInst::Create(II->getNormalDest(), II);        // Remove any PHI node entries from the exception destination. -      II->getUnwindDest()->removePredecessor(BB); +      II->getUnwindDest()->removePredecessor(&*BB);        // Remove the invoke instruction now.        BB->getInstList().erase(II); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 4acd988691d2..52beb1542497 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -49,8 +49,7 @@ namespace {      return I != Ranges.end() && I->Low <= R.Low;    } -  /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch -  /// instructions. +  /// Replace all SwitchInst instructions with chained branch instructions.    class LowerSwitch : public FunctionPass {    public:      static char ID; // Pass identification, replacement for typeid @@ -78,7 +77,7 @@ namespace {      typedef std::vector<CaseRange> CaseVector;      typedef std::vector<CaseRange>::iterator CaseItr;    private: -    void processSwitchInst(SwitchInst *SI); +    void processSwitchInst(SwitchInst *SI, SmallPtrSetImpl<BasicBlock*> &DeleteList);      BasicBlock *switchConvert(CaseItr Begin, CaseItr End,                                ConstantInt *LowerBound, ConstantInt *UpperBound, @@ -116,21 +115,30 @@ FunctionPass *llvm::createLowerSwitchPass() {  bool LowerSwitch::runOnFunction(Function &F) {    bool Changed = false; +  SmallPtrSet<BasicBlock*, 8> DeleteList;    for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { -    BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks +    BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks + +    // If the block is a dead Default block that will be deleted later, don't +    // waste time processing it. 
+    if (DeleteList.count(Cur)) +      continue;      if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {        Changed = true; -      processSwitchInst(SI); +      processSwitchInst(SI, DeleteList);      }    } +  for (BasicBlock* BB: DeleteList) { +    DeleteDeadBlock(BB); +  } +    return Changed;  } -// operator<< - Used for debugging purposes. -// +/// Used for debugging purposes.  static raw_ostream& operator<<(raw_ostream &O,                                 const LowerSwitch::CaseVector &C)      LLVM_ATTRIBUTE_USED; @@ -147,23 +155,24 @@ static raw_ostream& operator<<(raw_ostream &O,    return O << "]";  } -// \brief Update the first occurrence of the "switch statement" BB in the PHI -// node with the "new" BB. The other occurrences will: -// -// 1) Be updated by subsequent calls to this function.  Switch statements may -// have more than one outcoming edge into the same BB if they all have the same -// value. When the switch statement is converted these incoming edges are now -// coming from multiple BBs. -// 2) Removed if subsequent incoming values now share the same case, i.e., -// multiple outcome edges are condensed into one. This is necessary to keep the -// number of phi values equal to the number of branches to SuccBB. +/// \brief Update the first occurrence of the "switch statement" BB in the PHI +/// node with the "new" BB. The other occurrences will: +/// +/// 1) Be updated by subsequent calls to this function.  Switch statements may +/// have more than one outcoming edge into the same BB if they all have the same +/// value. When the switch statement is converted these incoming edges are now +/// coming from multiple BBs. +/// 2) Removed if subsequent incoming values now share the same case, i.e., +/// multiple outcome edges are condensed into one. This is necessary to keep the +/// number of phi values equal to the number of branches to SuccBB.  static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,                      unsigned NumMergedCases) { -  for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI(); +  for (BasicBlock::iterator I = SuccBB->begin(), +                            IE = SuccBB->getFirstNonPHI()->getIterator();         I != IE; ++I) {      PHINode *PN = cast<PHINode>(I); -    // Only update the first occurence. +    // Only update the first occurrence.      unsigned Idx = 0, E = PN->getNumIncomingValues();      unsigned LocalNumMergedCases = NumMergedCases;      for (; Idx != E; ++Idx) { @@ -173,7 +182,7 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,        }      } -    // Remove additional occurences coming from condensed cases and keep the +    // Remove additional occurrences coming from condensed cases and keep the      // number of incoming values equal to the number of branches to SuccBB.      SmallVector<unsigned, 8> Indices;      for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) @@ -188,11 +197,11 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,    }  } -// switchConvert - Convert the switch statement into a binary lookup of -// the case values. The function recursively builds this tree. -// LowerBound and UpperBound are used to keep track of the bounds for Val -// that have already been checked by a block emitted by one of the previous -// calls to switchConvert in the call stack. +/// Convert the switch statement into a binary lookup of the case values. +/// The function recursively builds this tree. 
LowerBound and UpperBound are +/// used to keep track of the bounds for Val that have already been checked by +/// a block emitted by one of the previous calls to switchConvert in the call +/// stack.  BasicBlock *  LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,                             ConstantInt *UpperBound, Value *Val, @@ -278,28 +287,24 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,                                        UpperBound, Val, NewNode, OrigBlock,                                        Default, UnreachableRanges); -  Function::iterator FI = OrigBlock; -  F->getBasicBlockList().insert(++FI, NewNode); +  F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);    NewNode->getInstList().push_back(Comp);    BranchInst::Create(LBranch, RBranch, Comp, NewNode);    return NewNode;  } -// newLeafBlock - Create a new leaf block for the binary lookup tree. It -// checks if the switch's value == the case's value. If not, then it -// jumps to the default branch. At this point in the tree, the value -// can't be another valid case value, so the jump to the "default" branch -// is warranted. -// +/// Create a new leaf block for the binary lookup tree. It checks if the +/// switch's value == the case's value. If not, then it jumps to the default +/// branch. At this point in the tree, the value can't be another valid case +/// value, so the jump to the "default" branch is warranted.  BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,                                        BasicBlock* OrigBlock,                                        BasicBlock* Default)  {    Function* F = OrigBlock->getParent();    BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); -  Function::iterator FI = OrigBlock; -  F->getBasicBlockList().insert(++FI, NewLeaf); +  F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf);    // Emit comparison    ICmpInst* Comp = nullptr; @@ -352,7 +357,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,    return NewLeaf;  } -// Clusterify - Transform simple list of Cases into list of CaseRange's +/// Transform simple list of Cases into list of CaseRange's.  unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {    unsigned numCmps = 0; @@ -394,10 +399,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {    return numCmps;  } -// processSwitchInst - Replace the specified switch instruction with a sequence -// of chained if-then insts in a balanced binary search. -// -void LowerSwitch::processSwitchInst(SwitchInst *SI) { +/// Replace the specified switch instruction with a sequence of chained if-then +/// insts in a balanced binary search. +void LowerSwitch::processSwitchInst(SwitchInst *SI, +                                    SmallPtrSetImpl<BasicBlock*> &DeleteList) {    BasicBlock *CurBlock = SI->getParent();    BasicBlock *OrigBlock = CurBlock;    Function *F = CurBlock->getParent(); @@ -424,7 +429,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {    std::vector<IntRange> UnreachableRanges;    if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { -    // Make the bounds tightly fitted around the case value range, becase we +    // Make the bounds tightly fitted around the case value range, because we      // know that the value passed to the switch must be exactly one of the case      // values.      
assert(!Cases.empty()); @@ -495,7 +500,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {    // Create a new, empty default block so that the new hierarchy of    // if-then statements go to this and the PHI nodes are happy.    BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); -  F->getBasicBlockList().insert(Default, NewDefault); +  F->getBasicBlockList().insert(Default->getIterator(), NewDefault);    BranchInst::Create(Default, NewDefault);    // If there is an entry in any PHI nodes for the default edge, make sure @@ -518,7 +523,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {    BasicBlock *OldDefault = SI->getDefaultDest();    CurBlock->getInstList().erase(SI); -  // If the Default block has no more predecessors just remove it. +  // If the Default block has no more predecessors just add it to DeleteList.    if (pred_begin(OldDefault) == pred_end(OldDefault)) -    DeleteDeadBlock(OldDefault); +    DeleteList.insert(OldDefault);  } diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 00cf4e6c01c8..aa1e35ddba02 100644 --- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -63,6 +63,9 @@ bool PromotePass::runOnFunction(Function &F) {    BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function +  if (F.hasFnAttribute(Attribute::OptimizeNone)) +    return false; +    bool Changed  = false;    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 395a46bad97b..c999bd008fef 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -42,6 +42,24 @@ namespace {      }    }; +  static const char *const metaNames[] = { +    // See http://en.wikipedia.org/wiki/Metasyntactic_variable +    "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", +    "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" +  }; + +  struct Renamer { +    Renamer(unsigned int seed) { +      prng.srand(seed); +    } + +    const char *newName() { +      return metaNames[prng.rand() % array_lengthof(metaNames)]; +    } + +    PRNG prng; +  }; +      struct MetaRenamer : public ModulePass {      static char ID; // Pass identification, replacement for typeid      MetaRenamer() : ModulePass(ID) { @@ -53,36 +71,26 @@ namespace {      }      bool runOnModule(Module &M) override { -      static const char *const metaNames[] = { -        // See http://en.wikipedia.org/wiki/Metasyntactic_variable -        "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", -        "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" -      }; -        // Seed our PRNG with simple additive sum of ModuleID. We're looking to        // simply avoid always having the same function names, and we need to        // remain deterministic.        
unsigned int randSeed = 0; -      for (std::string::const_iterator I = M.getModuleIdentifier().begin(), -           E = M.getModuleIdentifier().end(); I != E; ++I) -        randSeed += *I; +      for (auto C : M.getModuleIdentifier()) +        randSeed += C; -      PRNG prng; -      prng.srand(randSeed); +      Renamer renamer(randSeed);        // Rename all aliases -      for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); -           AI != AE; ++AI) { +      for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) {          StringRef Name = AI->getName();          if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))            continue;          AI->setName("alias");        } -       +        // Rename all global variables -      for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); -           GI != GE; ++GI) { +      for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {          StringRef Name = GI->getName();          if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))            continue; @@ -93,40 +101,37 @@ namespace {        // Rename all struct types        TypeFinder StructTypes;        StructTypes.run(M, true); -      for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { -        StructType *STy = StructTypes[i]; +      for (StructType *STy : StructTypes) {          if (STy->isLiteral() || STy->getName().empty()) continue;          SmallString<128> NameStorage; -        STy->setName((Twine("struct.") + metaNames[prng.rand() % -                     array_lengthof(metaNames)]).toStringRef(NameStorage)); +        STy->setName((Twine("struct.") + +          renamer.newName()).toStringRef(NameStorage));        }        // Rename all functions -      for (Module::iterator FI = M.begin(), FE = M.end(); -           FI != FE; ++FI) { -        StringRef Name = FI->getName(); +      for (auto &F : M) { +        StringRef Name = F.getName();          if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))            continue; -        FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); -        runOnFunction(*FI); +        F.setName(renamer.newName()); +        runOnFunction(F);        }        return true;      }      bool runOnFunction(Function &F) { -      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); -           AI != AE; ++AI) +      for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI)          if (!AI->getType()->isVoidTy())            AI->setName("arg"); -      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { -        BB->setName("bb"); +      for (auto &BB : F) { +        BB.setName("bb"); -        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) -          if (!I->getType()->isVoidTy()) -            I->setName("tmp"); +        for (auto &I : BB) +          if (!I.getType()->isVoidTy()) +            I.setName("tmp");        }        return true;      } diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d69a81ec4741..9ec28a3f3d47 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array,      }      GVCtor->eraseFromParent();    } else { -    // Use a simple two-field struct if there isn't one already. +    // Use the new three-field struct if there isn't one already.      
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), -                            nullptr); +                            IRB.getInt8PtrTy(), nullptr);    }    // Build a 2 or 3 field global_ctor entry.  We don't take a comdat key. @@ -107,7 +107,8 @@ Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {  std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(      Module &M, StringRef CtorName, StringRef InitName, -    ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs) { +    ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs, +    StringRef VersionCheckName) {    assert(!InitName.empty() && "Expected init function name");    assert(InitArgTypes.size() == InitArgTypes.size() &&           "Sanitizer's init function expects different number of arguments"); @@ -122,6 +123,13 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(            AttributeSet()));    InitFunction->setLinkage(Function::ExternalLinkage);    IRB.CreateCall(InitFunction, InitArgs); +  if (!VersionCheckName.empty()) { +    Function *VersionCheckFunction = +        checkSanitizerInterfaceFunction(M.getOrInsertFunction( +            VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false), +            AttributeSet())); +    IRB.CreateCall(VersionCheckFunction, {}); +  }    return std::make_pair(Ctor, InitFunction);  } diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index a87f8504bfb5..c4f9b9f61407 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -205,10 +205,9 @@ public:      // avoid gratuitus rescans.      const BasicBlock *BB = I->getParent();      unsigned InstNo = 0; -    for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E; -         ++BBI) -      if (isInterestingInstruction(BBI)) -        InstNumbers[BBI] = InstNo++; +    for (const Instruction &BBI : *BB) +      if (isInterestingInstruction(&BBI)) +        InstNumbers[&BBI] = InstNo++;      It = InstNumbers.find(I);      assert(It != InstNumbers.end() && "Didn't insert instruction?"); @@ -402,8 +401,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,    // Record debuginfo for the store and remove the declaration's    // debuginfo.    if (DbgDeclareInst *DDI = Info.DbgDeclare) { -    DIBuilder DIB(*AI->getParent()->getParent()->getParent(), -                  /*AllowUnresolved*/ false); +    DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);      ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);      DDI->eraseFromParent();      LBI.deleteValue(DDI); @@ -425,14 +423,17 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,  /// using the Alloca.  ///  /// If we cannot promote this alloca (because it is read before it is written), -/// return true.  This is necessary in cases where, due to control flow, the -/// alloca is potentially undefined on some control flow paths.  e.g. code like -/// this is potentially correct: -/// -///   for (...) { if (c) { A = undef; undef = B; } } -/// -/// ... so long as A is not used before undef is set. -static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, +/// return false.  This is necessary in cases where, due to control flow, the +/// alloca is undefined only on some control flow paths.  e.g. 
code like +/// this is correct in LLVM IR: +///  // A is an alloca with no stores so far +///  for (...) { +///    int t = *A; +///    if (!first_iteration) +///      use(t); +///    *A = 42; +///  } +static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,                                       LargeBlockInfo &LBI,                                       AliasSetTracker *AST) {    // The trickiest case to handle is when we have large blocks. Because of this, @@ -467,10 +468,15 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,                           std::make_pair(LoadIdx,                                          static_cast<StoreInst *>(nullptr)),                           less_first()); - -    if (I == StoresByIndex.begin()) -      // If there is no store before this load, the load takes the undef value. -      LI->replaceAllUsesWith(UndefValue::get(LI->getType())); +    if (I == StoresByIndex.begin()) { +      if (StoresByIndex.empty()) +        // If there are no stores, the load takes the undef value. +        LI->replaceAllUsesWith(UndefValue::get(LI->getType())); +      else +        // There is no store before this load, bail out (load may be affected +        // by the following stores - see main comment). +        return false; +    }      else        // Otherwise, there was a store before this load, the load takes its value.        LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0)); @@ -486,8 +492,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,      StoreInst *SI = cast<StoreInst>(AI->user_back());      // Record debuginfo for the store before removing it.      if (DbgDeclareInst *DDI = Info.DbgDeclare) { -      DIBuilder DIB(*AI->getParent()->getParent()->getParent(), -                    /*AllowUnresolved*/ false); +      DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);      }      SI->eraseFromParent(); @@ -506,6 +511,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,    }    ++NumLocalPromoted; +  return true;  }  void PromoteMem2Reg::run() { @@ -557,9 +563,8 @@ void PromoteMem2Reg::run() {      // If the alloca is only read and written in one basic block, just perform a      // linear sweep over the block to eliminate it. -    if (Info.OnlyUsedInOneBlock) { -      promoteSingleBlockAlloca(AI, Info, LBI, AST); - +    if (Info.OnlyUsedInOneBlock && +        promoteSingleBlockAlloca(AI, Info, LBI, AST)) {        // The alloca has been processed, move on.        RemoveFromAllocasList(AllocaNum);        continue; @@ -636,7 +641,7 @@ void PromoteMem2Reg::run() {    // and inserting the phi nodes we marked as necessary    //    std::vector<RenamePassData> RenamePassWorkList; -  RenamePassWorkList.emplace_back(F.begin(), nullptr, std::move(Values)); +  RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values));    do {      RenamePassData RPD;      RPD.swap(RenamePassWorkList.back()); @@ -854,7 +859,7 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,    // BasicBlock.    PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),                         Allocas[AllocaNo]->getName() + "." 
+ Twine(Version++), -                       BB->begin()); +                       &BB->front());    ++NumPHIInsert;    PhiToAllocaMap[PN] = AllocaNo; @@ -919,7 +924,7 @@ NextIteration:      return;    for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) { -    Instruction *I = II++; // get the instruction, increment iterator +    Instruction *I = &*II++; // get the instruction, increment iterator      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {        AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand()); diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 36781c1189cd..d0932f834cf5 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -14,6 +14,7 @@  #include "llvm/Transforms/Utils/Local.h"  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/SmallVector.h" @@ -43,7 +44,6 @@  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/ValueMapper.h"  #include <algorithm>  #include <map> @@ -73,6 +73,22 @@ static cl::opt<bool> HoistCondStores(      "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),      cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt<bool> MergeCondStores( +    "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true), +    cl::desc("Hoist conditional stores even if an unconditional store does not " +             "precede - hoist multiple conditional stores into a single " +             "predicated store")); + +static cl::opt<bool> MergeCondStoresAggressively( +    "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false), +    cl::desc("When merging conditional stores, do so even if the resultant " +             "basic blocks are unlikely to be if-converted as a result")); + +static cl::opt<bool> SpeculateOneExpensiveInst( +    "speculate-one-expensive-inst", cl::Hidden, cl::init(true), +    cl::desc("Allow exactly one expensive instruction to be speculatively " +             "executed")); +  STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");  STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");  STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -83,13 +99,13 @@ STATISTIC(NumSpeculations, "Number of speculative executed instructions");  namespace {    // The first field contains the value that the switch produces when a certain -  // case group is selected, and the second field is a vector containing the cases -  // composing the case group. +  // case group is selected, and the second field is a vector containing the +  // cases composing the case group.    typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>      SwitchCaseResultVectorTy;    // The first field contains the phi node that generates a result of the switch -  // and the second field contains the value generated for a certain case in the switch -  // for that PHI. +  // and the second field contains the value generated for a certain case in the +  // switch for that PHI.    
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy;    /// ValueEqualityComparisonCase - Represents a case of a switch. @@ -124,6 +140,7 @@ class SimplifyCFGOpt {    bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);    bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); +  bool SimplifyCleanupReturn(CleanupReturnInst *RI);    bool SimplifyUnreachable(UnreachableInst *UI);    bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);    bool SimplifyIndirectBr(IndirectBrInst *IBI); @@ -226,6 +243,7 @@ static unsigned ComputeSpeculationCost(const User *I,           "Instruction is not safe to speculatively execute!");    return TTI.getUserCost(I);  } +  /// If we have a merge point of an "if condition" as accepted above,  /// return true if the specified value dominates the block.  We  /// don't handle the true generality of domination here, just a special case @@ -246,7 +264,8 @@ static unsigned ComputeSpeculationCost(const User *I,  static bool DominatesMergePoint(Value *V, BasicBlock *BB,                                  SmallPtrSetImpl<Instruction*> *AggressiveInsts,                                  unsigned &CostRemaining, -                                const TargetTransformInfo &TTI) { +                                const TargetTransformInfo &TTI, +                                unsigned Depth = 0) {    Instruction *I = dyn_cast<Instruction>(V);    if (!I) {      // Non-instructions all dominate instructions, but not all constantexprs @@ -284,15 +303,24 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,    unsigned Cost = ComputeSpeculationCost(I, TTI); -  if (Cost > CostRemaining) +  // Allow exactly one instruction to be speculated regardless of its cost +  // (as long as it is safe to do so). +  // This is intended to flatten the CFG even if the instruction is a division +  // or other expensive operation. The speculation of an expensive instruction +  // is expected to be undone in CodeGenPrepare if the speculation has not +  // enabled further IR optimizations. +  if (Cost > CostRemaining && +      (!SpeculateOneExpensiveInst || !AggressiveInsts->empty() || Depth > 0))      return false; -  CostRemaining -= Cost; +  // Avoid unsigned wrap. +  CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost;    // Okay, we can only really hoist these out if their operands do    // not take us over the cost threshold.    for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) -    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI)) +    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, +                             Depth + 1))        return false;    // Okay, it's safe to do this!  Remember this instruction.    AggressiveInsts->insert(I); @@ -970,8 +998,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,        // Okay, at this point, we know which new successor Pred will get.  Make        // sure we update the number of entries in the PHI nodes for these        // successors. -      for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i) -        AddPredecessorToBlock(NewSuccessors[i], Pred, BB); +      for (BasicBlock *NewSuccessor : NewSuccessors) +        AddPredecessorToBlock(NewSuccessor, Pred, BB);        Builder.SetInsertPoint(PTI);        // Convert pointer to int before we switch. 
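The new cost handling in DominatesMergePoint can be read as a small budget rule; restated standalone here for clarity. The helper and its parameter names are invented for this note; only the logic follows the patch.

    // Returns true if an instruction of cost Cost may still be speculated.
    // One expensive instruction is allowed through even if it exceeds the
    // budget, but only for the top-level value (Depth == 0) and only if
    // nothing has been admitted yet.
    static bool chargeSpeculationBudget(unsigned Cost, unsigned &BudgetRemaining,
                                        bool NothingAdmittedYet, unsigned Depth,
                                        bool AllowOneExpensiveInst) {
      if (Cost > BudgetRemaining &&
          (!AllowOneExpensiveInst || !NothingAdmittedYet || Depth > 0))
        return false;
      // Clamp rather than wrapping the unsigned budget below zero.
      BudgetRemaining = (Cost > BudgetRemaining) ? 0 : BudgetRemaining - Cost;
      return true;
    }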
@@ -984,8 +1012,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,        SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault,                                                 PredCases.size());        NewSI->setDebugLoc(PTI->getDebugLoc()); -      for (unsigned i = 0, e = PredCases.size(); i != e; ++i) -        NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); +      for (ValueEqualityComparisonCase &V : PredCases) +        NewSI->addCase(V.Value, V.Dest);        if (PredHasWeights || SuccHasWeights) {          // Halve the weights if any of them cannot fit in an uint32_t @@ -1059,15 +1087,15 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,    BasicBlock::iterator BB1_Itr = BB1->begin();    BasicBlock::iterator BB2_Itr = BB2->begin(); -  Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++; +  Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;    // Skip debug info if it is not identical.    DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);    DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);    if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {      while (isa<DbgInfoIntrinsic>(I1)) -      I1 = BB1_Itr++; +      I1 = &*BB1_Itr++;      while (isa<DbgInfoIntrinsic>(I2)) -      I2 = BB2_Itr++; +      I2 = &*BB2_Itr++;    }    if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||        (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) @@ -1088,31 +1116,30 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,      // For a normal instruction, we just move one to right before the branch,      // then replace all uses of the other with the first.  Finally, we remove      // the now redundant second instruction. -    BIParent->getInstList().splice(BI, BB1->getInstList(), I1); +    BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1);      if (!I2->use_empty())        I2->replaceAllUsesWith(I1);      I1->intersectOptionalDataWith(I2);      unsigned KnownIDs[] = { -      LLVMContext::MD_tbaa, -      LLVMContext::MD_range, -      LLVMContext::MD_fpmath, -      LLVMContext::MD_invariant_load, -      LLVMContext::MD_nonnull -    }; +        LLVMContext::MD_tbaa,    LLVMContext::MD_range, +        LLVMContext::MD_fpmath,  LLVMContext::MD_invariant_load, +        LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group, +        LLVMContext::MD_align,   LLVMContext::MD_dereferenceable, +        LLVMContext::MD_dereferenceable_or_null};      combineMetadata(I1, I2, KnownIDs);      I2->eraseFromParent();      Changed = true; -    I1 = BB1_Itr++; -    I2 = BB2_Itr++; +    I1 = &*BB1_Itr++; +    I2 = &*BB2_Itr++;      // Skip debug info if it is not identical.      DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);      DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);      if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {        while (isa<DbgInfoIntrinsic>(I1)) -        I1 = BB1_Itr++; +        I1 = &*BB1_Itr++;        while (isa<DbgInfoIntrinsic>(I2)) -        I2 = BB2_Itr++; +        I2 = &*BB2_Itr++;      }    } while (I1->isIdenticalToWhenDefined(I2)); @@ -1147,7 +1174,7 @@ HoistTerminator:    // Okay, it is safe to hoist the terminator.    
Instruction *NT = I1->clone(); -  BIParent->getInstList().insert(BI, NT); +  BIParent->getInstList().insert(BI->getIterator(), NT);    if (!NT->getType()->isVoidTy()) {      I1->replaceAllUsesWith(NT);      I2->replaceAllUsesWith(NT); @@ -1265,7 +1292,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {      // Cannot move control-flow-involving, volatile loads, vaarg, etc.      if (isa<PHINode>(I1) || isa<PHINode>(I2) ||          isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || -        isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) || +        I1->isEHPad() || I2->isEHPad() ||          isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||          I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||          I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || @@ -1324,7 +1351,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {        if (!NewPN) {          NewPN =              PHINode::Create(DifferentOp1->getType(), 2, -                            DifferentOp1->getName() + ".sink", BBEnd->begin()); +                            DifferentOp1->getName() + ".sink", &BBEnd->front());          NewPN->addIncoming(DifferentOp1, BB1);          NewPN->addIncoming(DifferentOp2, BB2);          DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); @@ -1339,7 +1366,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {      // instruction in the basic block down.      bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());      // Sink the instruction. -    BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1); +    BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(), +                                BB1->getInstList(), I1);      if (!OldPN->use_empty())        OldPN->replaceAllUsesWith(I1);      OldPN->eraseFromParent(); @@ -1355,7 +1383,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {        RE1 = BB1->getInstList().rend();      if (UpdateRE2)        RE2 = BB2->getInstList().rend(); -    FirstNonPhiInBBEnd = I1; +    FirstNonPhiInBBEnd = &*I1;      NumSinkCommons++;      Changed = true;    } @@ -1491,7 +1519,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,    for (BasicBlock::iterator BBI = ThenBB->begin(),                              BBE = std::prev(ThenBB->end());         BBI != BBE; ++BBI) { -    Instruction *I = BBI; +    Instruction *I = &*BBI;      // Skip debug info.      if (isa<DbgInfoIntrinsic>(I))        continue; @@ -1604,9 +1632,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,      SpeculatedStore->setOperand(0, S);    } +  // Metadata can be dependent on the condition we are hoisting above. +  // Conservatively strip all metadata on the instruction. +  for (auto &I: *ThenBB) +    I.dropUnknownNonDebugMetadata(); +    // Hoist the instructions. -  BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), -                           std::prev(ThenBB->end())); +  BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), +                           ThenBB->begin(), std::prev(ThenBB->end()));    // Insert selects and rewrite the PHI operands.    IRBuilder<true, NoFolder> Builder(BI); @@ -1747,13 +1780,13 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {        // Check for trivial simplification.        
if (Value *V = SimplifyInstruction(N, DL)) { -        TranslateMap[BBI] = V; +        TranslateMap[&*BBI] = V;          delete N;   // Instruction folded away, don't need actual inst        } else {          // Insert the new instruction into its new home.          EdgeBB->getInstList().insert(InsertPt, N);          if (!BBI->use_empty()) -          TranslateMap[BBI] = N; +          TranslateMap[&*BBI] = N;        }      } @@ -1850,7 +1883,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,    } else {      DomBlock = *pred_begin(IfBlock1);      for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) -      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { +      if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {          // This is not an aggressive instruction that we can promote.          // Because of this, we won't be able to get rid of the control          // flow, so the xform is not worth it. @@ -1863,7 +1896,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,    } else {      DomBlock = *pred_begin(IfBlock2);      for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) -      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { +      if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) {          // This is not an aggressive instruction that we can promote.          // Because of this, we won't be able to get rid of the control          // flow, so the xform is not worth it. @@ -1882,13 +1915,13 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,    // Move all 'aggressive' instructions, which are defined in the    // conditional parts of the if's up to the dominating block.    if (IfBlock1) -    DomBlock->getInstList().splice(InsertPt, +    DomBlock->getInstList().splice(InsertPt->getIterator(),                                     IfBlock1->getInstList(), IfBlock1->begin(), -                                   IfBlock1->getTerminator()); +                                   IfBlock1->getTerminator()->getIterator());    if (IfBlock2) -    DomBlock->getInstList().splice(InsertPt, +    DomBlock->getInstList().splice(InsertPt->getIterator(),                                     IfBlock2->getInstList(), IfBlock2->begin(), -                                   IfBlock2->getTerminator()); +                                   IfBlock2->getTerminator()->getIterator());    while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {      // Change the PHI node into a select instruction. @@ -2057,7 +2090,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {               BI->getSuccessor(0) == PBI->getSuccessor(1))) {            for (BasicBlock::iterator I = BB->begin(), E = BB->end();                 I != E; ) { -            Instruction *Curr = I++; +            Instruction *Curr = &*I++;              if (isa<CmpInst>(Curr)) {                Cond = Curr;                break; @@ -2077,7 +2110,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {    return false;    // Make sure the instruction after the condition is the cond branch. -  BasicBlock::iterator CondIt = Cond; ++CondIt; +  BasicBlock::iterator CondIt = ++Cond->getIterator();    // Ignore dbg intrinsics.    while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; @@ -2095,7 +2128,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {      // Ignore dbg intrinsics.      
if (isa<DbgInfoIntrinsic>(I))        continue; -    if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) +    if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I))        return false;      // I has only one use and can be executed unconditionally.      Instruction *User = dyn_cast<Instruction>(I->user_back()); @@ -2192,17 +2225,17 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {        Instruction *NewBonusInst = BonusInst->clone();        RemapInstruction(NewBonusInst, VMap,                         RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); -      VMap[BonusInst] = NewBonusInst; +      VMap[&*BonusInst] = NewBonusInst;        // If we moved a load, we cannot any longer claim any knowledge about        // its potential value. The previous information might have been valid        // only given the branch precondition.        // For an analogous reason, we must also drop all the metadata whose        // semantics we don't understand. -      NewBonusInst->dropUnknownMetadata(LLVMContext::MD_dbg); +      NewBonusInst->dropUnknownNonDebugMetadata(); -      PredBlock->getInstList().insert(PBI, NewBonusInst); -      NewBonusInst->takeName(BonusInst); +      PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); +      NewBonusInst->takeName(&*BonusInst);        BonusInst->setName(BonusInst->getName() + ".old");      } @@ -2211,7 +2244,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {      Instruction *New = Cond->clone();      RemapInstruction(New, VMap,                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); -    PredBlock->getInstList().insert(PBI, New); +    PredBlock->getInstList().insert(PBI->getIterator(), New);      New->takeName(Cond);      Cond->setName(New->getName() + ".old"); @@ -2332,11 +2365,297 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {    return false;  } +// If there is only one store in BB1 and BB2, return it, otherwise return +// nullptr. +static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { +  StoreInst *S = nullptr; +  for (auto *BB : {BB1, BB2}) { +    if (!BB) +      continue; +    for (auto &I : *BB) +      if (auto *SI = dyn_cast<StoreInst>(&I)) { +        if (S) +          // Multiple stores seen. +          return nullptr; +        else +          S = SI; +      } +  } +  return S; +} + +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, +                                              Value *AlternativeV = nullptr) { +  // PHI is going to be a PHI node that allows the value V that is defined in +  // BB to be referenced in BB's only successor. +  // +  // If AlternativeV is nullptr, the only value we care about in PHI is V. It +  // doesn't matter to us what the other operand is (it'll never get used). We +  // could just create a new PHI with an undef incoming value, but that could +  // increase register pressure if EarlyCSE/InstCombine can't fold it with some +  // other PHI. So here we directly look for some PHI in BB's successor with V +  // as an incoming operand. If we find one, we use it, else we create a new +  // one. +  // +  // If AlternativeV is not nullptr, we care about both incoming values in PHI. +  // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV] +  // where OtherBB is the single other predecessor of BB's only successor. 
+  PHINode *PHI = nullptr; +  BasicBlock *Succ = BB->getSingleSuccessor(); +   +  for (auto I = Succ->begin(); isa<PHINode>(I); ++I) +    if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) { +      PHI = cast<PHINode>(I); +      if (!AlternativeV) +        break; + +      assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2); +      auto PredI = pred_begin(Succ); +      BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; +      if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) +        break; +      PHI = nullptr; +    } +  if (PHI) +    return PHI; + +  // If V is not an instruction defined in BB, just return it. +  if (!AlternativeV && +      (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB)) +    return V; + +  PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); +  PHI->addIncoming(V, BB); +  for (BasicBlock *PredBB : predecessors(Succ)) +    if (PredBB != BB) +      PHI->addIncoming(AlternativeV ? AlternativeV : UndefValue::get(V->getType()), +                       PredBB); +  return PHI; +} + +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, +                                           BasicBlock *QTB, BasicBlock *QFB, +                                           BasicBlock *PostBB, Value *Address, +                                           bool InvertPCond, bool InvertQCond) { +  auto IsaBitcastOfPointerType = [](const Instruction &I) { +    return Operator::getOpcode(&I) == Instruction::BitCast && +           I.getType()->isPointerTy(); +  }; + +  // If we're not in aggressive mode, we only optimize if we have some +  // confidence that by optimizing we'll allow P and/or Q to be if-converted. +  auto IsWorthwhile = [&](BasicBlock *BB) { +    if (!BB) +      return true; +    // Heuristic: if the block can be if-converted/phi-folded and the +    // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to +    // thread this store. +    unsigned N = 0; +    for (auto &I : *BB) { +      // Cheap instructions viable for folding. +      if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || +          isa<StoreInst>(I)) +        ++N; +      // Free instructions. +      else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || +               IsaBitcastOfPointerType(I)) +        continue; +      else +        return false; +    } +    return N <= PHINodeFoldingThreshold; +  }; + +  if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) || +                                       !IsWorthwhile(PFB) || +                                       !IsWorthwhile(QTB) || +                                       !IsWorthwhile(QFB))) +    return false; + +  // For every pointer, there must be exactly two stores, one coming from +  // PTB or PFB, and the other from QTB or QFB. We don't support more than one +  // store (to any address) in PTB,PFB or QTB,QFB. +  // FIXME: We could relax this restriction with a bit more work and performance +  // testing. +  StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); +  StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); +  if (!PStore || !QStore) +    return false; + +  // Now check the stores are compatible. +  if (!QStore->isUnordered() || !PStore->isUnordered()) +    return false; + +  // Check that sinking the store won't cause program behavior changes. Sinking +  // the store out of the Q blocks won't change any behavior as we're sinking +  // from a block to its unconditional successor. 
But we're moving a store from +  // the P blocks down through the middle block (QBI) and past both QFB and QTB. +  // So we need to check that there are no aliasing loads or stores in +  // QBI, QTB and QFB. We also need to check there are no conflicting memory +  // operations between PStore and the end of its parent block. +  // +  // The ideal way to do this is to query AliasAnalysis, but we don't +  // preserve AA currently so that is dangerous. Be super safe and just +  // check there are no other memory operations at all. +  for (auto &I : *QFB->getSinglePredecessor()) +    if (I.mayReadOrWriteMemory()) +      return false; +  for (auto &I : *QFB) +    if (&I != QStore && I.mayReadOrWriteMemory()) +      return false; +  if (QTB) +    for (auto &I : *QTB) +      if (&I != QStore && I.mayReadOrWriteMemory()) +        return false; +  for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); +       I != E; ++I) +    if (&*I != PStore && I->mayReadOrWriteMemory()) +      return false; + +  // OK, we're going to sink the stores to PostBB. The store has to be +  // conditional though, so first create the predicate. +  Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) +                     ->getCondition(); +  Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) +                     ->getCondition(); + +  Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), +                                                PStore->getParent()); +  Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), +                                                QStore->getParent(), PPHI); + +  IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); + +  Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); +  Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + +  if (InvertPCond) +    PPred = QB.CreateNot(PPred); +  if (InvertQCond) +    QPred = QB.CreateNot(QPred); +  Value *CombinedPred = QB.CreateOr(PPred, QPred); + +  auto *T = +      SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(), false); +  QB.SetInsertPoint(T); +  StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address)); +  AAMDNodes AAMD; +  PStore->getAAMetadata(AAMD, /*Merge=*/false); +  PStore->getAAMetadata(AAMD, /*Merge=*/true); +  SI->setAAMetadata(AAMD); + +  QStore->eraseFromParent(); +  PStore->eraseFromParent(); +   +  return true; +} + +static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { +  // The intention here is to find diamonds or triangles (see below) where each +  // conditional block contains a store to the same address. Both of these +  // stores are conditional, so they can't be unconditionally sunk. But it may +  // be profitable to speculatively sink the stores into one merged store at the +  // end, and predicate the merged store on the union of the two conditions of +  // PBI and QBI. +  // +  // This can reduce the number of stores executed if both of the conditions are +  // true, and can allow the blocks to become small enough to be if-converted. +  // This optimization will also chain, so that ladders of test-and-set +  // sequences can be if-converted away. 
+  // +  // We only deal with simple diamonds or triangles: +  // +  //     PBI       or      PBI        or a combination of the two +  //    /   \               | \ +  //   PTB  PFB             |  PFB +  //    \   /               | / +  //     QBI                QBI +  //    /  \                | \ +  //   QTB  QFB             |  QFB +  //    \  /                | / +  //    PostBB            PostBB +  // +  // We model triangles as a type of diamond with a nullptr "true" block. +  // Triangles are canonicalized so that the fallthrough edge is represented by +  // a true condition, as in the diagram above. +  //   +  BasicBlock *PTB = PBI->getSuccessor(0); +  BasicBlock *PFB = PBI->getSuccessor(1); +  BasicBlock *QTB = QBI->getSuccessor(0); +  BasicBlock *QFB = QBI->getSuccessor(1); +  BasicBlock *PostBB = QFB->getSingleSuccessor(); + +  bool InvertPCond = false, InvertQCond = false; +  // Canonicalize fallthroughs to the true branches. +  if (PFB == QBI->getParent()) { +    std::swap(PFB, PTB); +    InvertPCond = true; +  } +  if (QFB == PostBB) { +    std::swap(QFB, QTB); +    InvertQCond = true; +  } + +  // From this point on we can assume PTB or QTB may be fallthroughs but PFB +  // and QFB may not. Model fallthroughs as a nullptr block. +  if (PTB == QBI->getParent()) +    PTB = nullptr; +  if (QTB == PostBB) +    QTB = nullptr; + +  // Legality bailouts. We must have at least the non-fallthrough blocks and +  // the post-dominating block, and the non-fallthroughs must only have one +  // predecessor. +  auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { +    return BB->getSinglePredecessor() == P && +           BB->getSingleSuccessor() == S; +  }; +  if (!PostBB || +      !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || +      !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) +    return false; +  if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || +      (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB))) +    return false; +  if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2) +    return false; + +  // OK, this is a sequence of two diamonds or triangles. +  // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. +  SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses; +  for (auto *BB : {PTB, PFB}) { +    if (!BB) +      continue; +    for (auto &I : *BB) +      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) +        PStoreAddresses.insert(SI->getPointerOperand()); +  } +  for (auto *BB : {QTB, QFB}) { +    if (!BB) +      continue; +    for (auto &I : *BB) +      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) +        QStoreAddresses.insert(SI->getPointerOperand()); +  } +   +  set_intersect(PStoreAddresses, QStoreAddresses); +  // set_intersect mutates PStoreAddresses in place. Rename it here to make it +  // clear what it contains. +  auto &CommonAddresses = PStoreAddresses; + +  bool Changed = false; +  for (auto *Address : CommonAddresses) +    Changed |= mergeConditionalStoreToAddress( +        PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond); +  return Changed; +} +  /// If we have a conditional branch as a predecessor of another block,  /// this function tries to simplify it.  We know  /// that PBI and BI are both conditional branches, and BI is in one of the  /// successor blocks of PBI - PBI branches to BI. 
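In source terms, the store-merging transform implemented above turns two independently guarded stores to the same address into a single store predicated on the OR of the guards. A hand-written equivalent, with names invented purely for illustration:

    // Before: each store is guarded by its own condition.
    void beforeMerge(bool a, bool b, int x, int y, int *p) {
      if (a) *p = x;
      if (b) *p = y;
    }
    // After: one predicated store; the ternary plays the role of the PHI the
    // transform builds, so the later store still wins when both guards fire.
    void afterMerge(bool a, bool b, int x, int y, int *p) {
      int t = b ? y : x;
      if (a | b) *p = t;
    }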
-static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { +static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, +                                           const DataLayout &DL) {    assert(PBI->isConditional() && BI->isConditional());    BasicBlock *BB = BI->getParent(); @@ -2360,10 +2679,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {      // simplifycfg will thread the block.      if (BlockIsSimpleEnoughToThreadThrough(BB)) {        pred_iterator PB = pred_begin(BB), PE = pred_end(BB); -      PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()), -                                       std::distance(PB, PE), -                                       BI->getCondition()->getName() + ".pr", -                                       BB->begin()); +      PHINode *NewPN = PHINode::Create( +          Type::getInt1Ty(BB->getContext()), std::distance(PB, PE), +          BI->getCondition()->getName() + ".pr", &BB->front());        // Okay, we're going to insert the PHI node.  Since PBI is not the only        // predecessor, compute the PHI'd conditional value for all of the preds.        // Any predecessor where the condition is not computable we keep symbolic. @@ -2386,6 +2704,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {      }    } +  if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) +    if (CE->canTrap()) +      return false; + +  // If BI is reached from the true path of PBI and PBI's condition implies +  // BI's condition, we know the direction of the BI branch. +  if (PBI->getSuccessor(0) == BI->getParent() && +      isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) && +      PBI->getSuccessor(0) != PBI->getSuccessor(1) && +      BB->getSinglePredecessor()) { +    // Turn this into a branch on constant. +    auto *OldCond = BI->getCondition(); +    BI->setCondition(ConstantInt::getTrue(BB->getContext())); +    RecursivelyDeleteTriviallyDeadInstructions(OldCond); +    return true;  // Nuke the branch on constant. +  } + +  // If both branches are conditional and both contain stores to the same +  // address, remove the stores from the conditionals and create a conditional +  // merged store at the end. +  if (MergeCondStores && mergeConditionalStores(PBI, BI)) +    return true; +    // If this is a conditional branch in an empty block, and if any    // predecessors are a conditional branch to one of our destinations,    // fold the conditions into logical ops and one cond br. @@ -2396,11 +2737,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {    if (&*BBI != BI)      return false; - -  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition())) -    if (CE->canTrap()) -      return false; -    int PBIOp, BIOp;    if (PBI->getSuccessor(0) == BI->getSuccessor(0))      PBIOp = BIOp = 0; @@ -2565,15 +2901,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,    BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;    // Then remove the rest. -  for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { -    BasicBlock *Succ = OldTerm->getSuccessor(I); +  for (BasicBlock *Succ : OldTerm->successors()) {      // Make sure only to keep exactly one copy of each edge.      
if (Succ == KeepEdge1)        KeepEdge1 = nullptr;      else if (Succ == KeepEdge2)        KeepEdge2 = nullptr;      else -      Succ->removePredecessor(OldTerm->getParent()); +      Succ->removePredecessor(OldTerm->getParent(), +                              /*DontDeleteUselessPHIs=*/true);    }    IRBuilder<> Builder(OldTerm); @@ -2827,7 +3163,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,    Values.erase(std::unique(Values.begin(), Values.end()), Values.end());    // If Extra was used, we require at least two switch values to do the -  // transformation.  A switch with one value is just an cond branch. +  // transformation.  A switch with one value is just a conditional branch.    if (ExtraCase && Values.size() < 2) return false;    // TODO: Preserve branch weight metadata, similarly to how @@ -2847,7 +3183,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,    // then we evaluate them with an explicit branch first.  Split the block    // right before the condbr to handle it.    if (ExtraCase) { -    BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); +    BasicBlock *NewBB = +        BB->splitBasicBlock(BI->getIterator(), "switch.early.test");      // Remove the uncond branch added to the old block.      TerminatorInst *OldTI = BB->getTerminator();      Builder.SetInsertPoint(OldTI); @@ -2911,34 +3248,15 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {      return false;    // Check that there are no other instructions except for debug intrinsics. -  BasicBlock::iterator I = LPInst, E = RI; +  BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator();    while (++I != E)      if (!isa<DbgInfoIntrinsic>(I))        return false;    // Turn all invokes that unwind here into calls and delete the basic block.    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { -    InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); -    SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); -    // Insert a call instruction before the invoke. -    CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); -    Call->takeName(II); -    Call->setCallingConv(II->getCallingConv()); -    Call->setAttributes(II->getAttributes()); -    Call->setDebugLoc(II->getDebugLoc()); - -    // Anything that used the value produced by the invoke instruction now uses -    // the value produced by the call instruction.  Note that we do this even -    // for void functions and calls with no uses so that the callgraph edge is -    // updated. -    II->replaceAllUsesWith(Call); -    BB->removePredecessor(II->getParent()); - -    // Insert a branch to the normal destination right before the invoke. -    BranchInst::Create(II->getNormalDest(), II); - -    // Finally, delete the invoke instruction! -    II->eraseFromParent(); +    BasicBlock *Pred = *PI++; +    removeUnwindEdge(Pred);    }    // The landingpad is now unreachable.  Zap it. @@ -2946,6 +3264,124 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {    return true;  } +bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { +  // If this is a trivial cleanup pad that executes no instructions, it can be +  // eliminated.  
If the cleanup pad continues to the caller, any predecessor +  // that is an EH pad will be updated to continue to the caller and any +  // predecessor that terminates with an invoke instruction will have its invoke +  // instruction converted to a call instruction.  If the cleanup pad being +  // simplified does not continue to the caller, each predecessor will be +  // updated to continue to the unwind destination of the cleanup pad being +  // simplified. +  BasicBlock *BB = RI->getParent(); +  CleanupPadInst *CPInst = RI->getCleanupPad(); +  if (CPInst->getParent() != BB) +    // This isn't an empty cleanup. +    return false; + +  // Check that there are no other instructions except for debug intrinsics. +  BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); +  while (++I != E) +    if (!isa<DbgInfoIntrinsic>(I)) +      return false; + +  // If the cleanup return we are simplifying unwinds to the caller, this will +  // set UnwindDest to nullptr. +  BasicBlock *UnwindDest = RI->getUnwindDest(); +  Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr; + +  // We're about to remove BB from the control flow.  Before we do, sink any +  // PHINodes into the unwind destination.  Doing this before changing the +  // control flow avoids some potentially slow checks, since we can currently +  // be certain that UnwindDest and BB have no common predecessors (since they +  // are both EH pads). +  if (UnwindDest) { +    // First, go through the PHI nodes in UnwindDest and update any nodes that +    // reference the block we are removing +    for (BasicBlock::iterator I = UnwindDest->begin(), +                              IE = DestEHPad->getIterator(); +         I != IE; ++I) { +      PHINode *DestPN = cast<PHINode>(I); + +      int Idx = DestPN->getBasicBlockIndex(BB); +      // Since BB unwinds to UnwindDest, it has to be in the PHI node. +      assert(Idx != -1); +      // This PHI node has an incoming value that corresponds to a control +      // path through the cleanup pad we are removing.  If the incoming +      // value is in the cleanup pad, it must be a PHINode (because we +      // verified above that the block is otherwise empty).  Otherwise, the +      // value is either a constant or a value that dominates the cleanup +      // pad being removed. +      // +      // Because BB and UnwindDest are both EH pads, all of their +      // predecessors must unwind to these blocks, and since no instruction +      // can have multiple unwind destinations, there will be no overlap in +      // incoming blocks between SrcPN and DestPN. +      Value *SrcVal = DestPN->getIncomingValue(Idx); +      PHINode *SrcPN = dyn_cast<PHINode>(SrcVal); + +      // Remove the entry for the block we are deleting. +      DestPN->removeIncomingValue(Idx, false); + +      if (SrcPN && SrcPN->getParent() == BB) { +        // If the incoming value was a PHI node in the cleanup pad we are +        // removing, we need to merge that PHI node's incoming values into +        // DestPN. +        for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); +              SrcIdx != SrcE; ++SrcIdx) { +          DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), +                              SrcPN->getIncomingBlock(SrcIdx)); +        } +      } else { +        // Otherwise, the incoming value came from above BB and +        // so we can just reuse it.  We must associate all of BB's +        // predecessors with this value. 
+        for (auto *pred : predecessors(BB)) { +          DestPN->addIncoming(SrcVal, pred); +        } +      } +    } + +    // Sink any remaining PHI nodes directly into UnwindDest. +    Instruction *InsertPt = DestEHPad; +    for (BasicBlock::iterator I = BB->begin(), +                              IE = BB->getFirstNonPHI()->getIterator(); +         I != IE;) { +      // The iterator must be incremented here because the instructions are +      // being moved to another block. +      PHINode *PN = cast<PHINode>(I++); +      if (PN->use_empty()) +        // If the PHI node has no uses, just leave it.  It will be erased +        // when we erase BB below. +        continue; + +      // Otherwise, sink this PHI node into UnwindDest. +      // Any predecessors to UnwindDest which are not already represented +      // must be back edges which inherit the value from the path through +      // BB.  In this case, the PHI value must reference itself. +      for (auto *pred : predecessors(UnwindDest)) +        if (pred != BB) +          PN->addIncoming(PN, pred); +      PN->moveBefore(InsertPt); +    } +  } + +  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { +    // The iterator must be updated here because we are removing this pred. +    BasicBlock *PredBB = *PI++; +    if (UnwindDest == nullptr) { +      removeUnwindEdge(PredBB); +    } else { +      TerminatorInst *TI = PredBB->getTerminator(); +      TI->replaceUsesOfWith(BB, UnwindDest); +    } +  } + +  // The cleanup pad is now unreachable.  Zap it. +  BB->eraseFromParent(); +  return true; +} +  bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {    BasicBlock *BB = RI->getParent();    if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; @@ -3003,8 +3439,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {    // If there are any instructions immediately before the unreachable that can    // be removed, do so. -  while (UI != BB->begin()) { -    BasicBlock::iterator BBI = UI; +  while (UI->getIterator() != BB->begin()) { +    BasicBlock::iterator BBI = UI->getIterator();      --BBI;      // Do not delete instructions that can have side effects which might cause      // the unreachable to not be reachable; specifically, calls and volatile @@ -3075,26 +3511,18 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {            --i; --e;            Changed = true;          } -    } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { -      if (II->getUnwindDest() == BB) { -        // Convert the invoke to a call instruction.  This would be a good -        // place to note that the call does not throw though. -        BranchInst *BI = Builder.CreateBr(II->getNormalDest()); -        II->removeFromParent();   // Take out of symbol table - -        // Insert the call now... -        SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3); -        Builder.SetInsertPoint(BI); -        CallInst *CI = Builder.CreateCall(II->getCalledValue(), -                                          Args, II->getName()); -        CI->setCallingConv(II->getCallingConv()); -        CI->setAttributes(II->getAttributes()); -        // If the invoke produced a value, the call does now instead. 
-        II->replaceAllUsesWith(CI); -        delete II; -        Changed = true; -      } +    } else if ((isa<InvokeInst>(TI) && +                cast<InvokeInst>(TI)->getUnwindDest() == BB) || +               isa<CatchSwitchInst>(TI)) { +      removeUnwindEdge(TI->getParent()); +      Changed = true; +    } else if (isa<CleanupReturnInst>(TI)) { +      new UnreachableInst(TI->getContext(), TI); +      TI->eraseFromParent(); +      Changed = true;      } +    // TODO: We can remove a catchswitch if all it's catchpads end in +    // unreachable.    }    // If this block is now dead, remove it. @@ -3249,6 +3677,29 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,      }    } +  // If we can prove that the cases must cover all possible values, the  +  // default destination becomes dead and we can remove it.  If we know some  +  // of the bits in the value, we can use that to more precisely compute the +  // number of possible unique case values. +  bool HasDefault = +    !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); +  const unsigned NumUnknownBits = Bits -  +    (KnownZero.Or(KnownOne)).countPopulation(); +  assert(NumUnknownBits <= Bits); +  if (HasDefault && DeadCases.empty() && +      NumUnknownBits < 64 /* avoid overflow */ &&   +      SI->getNumCases() == (1ULL << NumUnknownBits)) { +    DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); +    BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(), +                                                    SI->getParent(), ""); +    SI->setDefaultDest(&*NewDefault); +    SplitBlock(&*NewDefault, &NewDefault->front()); +    auto *OldTI = NewDefault->getTerminator(); +    new UnreachableInst(SI->getContext(), OldTI); +    EraseTerminatorInstAndDCECond(OldTI); +    return true; +  } +    SmallVector<uint64_t, 8> Weights;    bool HasWeight = HasBranchWeights(SI);    if (HasWeight) { @@ -3439,7 +3890,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,      } else if (isa<DbgInfoIntrinsic>(I)) {        // Skip debug intrinsic.        continue; -    } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) { +    } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) {        // Instruction is side-effect free and constant.        // If the instruction has uses outside this block or a phi node slot for @@ -3456,7 +3907,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,          return false;        } -      ConstantPool.insert(std::make_pair(I, C)); +      ConstantPool.insert(std::make_pair(&*I, C));      } else {        break;      } @@ -3664,7 +4115,7 @@ namespace {      /// Return true if a table with TableSize elements of      /// type ElementType would fit in a target-legal register.      
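// A standalone, worked version of the counting argument used above to remove
// a dead switch default (sketch only, not the pass code): with k unknown bits
// the condition can take exactly 2^k values, so a switch that already lists
// 2^k distinct cases can never branch to its default destination.
#include <bitset>
#include <cstdint>
static bool defaultIsDead(uint64_t KnownZero, uint64_t KnownOne, unsigned Bits,
                          unsigned NumCases) {
  unsigned KnownBits = std::bitset<64>(KnownZero | KnownOne).count();
  unsigned NumUnknownBits = Bits - KnownBits;  // bits not pinned to 0 or 1
  return NumUnknownBits < 64 && NumCases == (1ULL << NumUnknownBits);
}
// Example: an i8 condition whose top five bits are known zero can only take
// 2^3 = 8 values, so a switch carrying all eight cases has a dead default:
// defaultIsDead(/*KnownZero=*/0xF8, /*KnownOne=*/0, /*Bits=*/8, /*NumCases=*/8).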
static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, -                                   const Type *ElementType); +                                   Type *ElementType);    private:      // Depending on the contents of the table, it can be represented in @@ -3880,8 +4331,8 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {  bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,                                             uint64_t TableSize, -                                           const Type *ElementType) { -  const IntegerType *IT = dyn_cast<IntegerType>(ElementType); +                                           Type *ElementType) { +  auto *IT = dyn_cast<IntegerType>(ElementType);    if (!IT)      return false;    // FIXME: If the type is wider than it needs to be, e.g. i8 but all values @@ -3992,7 +4443,7 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,      assert((CaseConst == TrueConst || CaseConst == FalseConst) &&             "Expect true or false as compare result.");    } -  +      // Check if the branch instruction dominates the phi node. It's a simple    // dominance check, but sufficient for our needs.    // Although this check is invariant in the calling loops, it's better to do it @@ -4422,7 +4873,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){      return true;    // If the Terminator is the only non-phi instruction, simplify the block. -  BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(); +  BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();    if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&        TryToSimplifyUncondBranchFromEmptyBlock(BB))      return true; @@ -4457,6 +4908,16 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){    return false;  } +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { +  BasicBlock *PredPred = nullptr; +  for (auto *P : predecessors(BB)) { +    BasicBlock *PPred = P->getSinglePredecessor(); +    if (!PPred || (PredPred && PredPred != PPred)) +      return nullptr; +    PredPred = PPred; +  } +  return PredPred; +}  bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {    BasicBlock *BB = BI->getParent(); @@ -4537,9 +4998,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)      if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))        if (PBI != BI && PBI->isConditional()) -        if (SimplifyCondBranchToCondBranch(PBI, BI)) +        if (SimplifyCondBranchToCondBranch(PBI, BI, DL))            return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; +  // Look for diamond patterns. 
+  if (MergeCondStores) +    if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) +      if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) +        if (PBI != BI && PBI->isConditional()) +          if (mergeConditionalStores(PBI, BI)) +            return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; +      return false;  } @@ -4663,6 +5132,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {      if (SimplifyReturn(RI, Builder)) return true;    } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {      if (SimplifyResume(RI, Builder)) return true; +  } else if (CleanupReturnInst *RI = +               dyn_cast<CleanupReturnInst>(BB->getTerminator())) { +    if (SimplifyCleanupReturn(RI)) return true;    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {      if (SimplifySwitch(SI, Builder)) return true;    } else if (UnreachableInst *UI = diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index ab30aa17c76b..ddd8775a8431 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -47,15 +47,16 @@ namespace {      Loop             *L;      LoopInfo         *LI;      ScalarEvolution  *SE; +    DominatorTree    *DT;      SmallVectorImpl<WeakVH> &DeadInsts;      bool Changed;    public: -    SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, -                   SmallVectorImpl<WeakVH> &Dead) -        : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { +    SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT, +                   LoopInfo *LI,SmallVectorImpl<WeakVH> &Dead) +        : L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) {        assert(LI && "IV simplification requires LoopInfo");      } @@ -63,11 +64,13 @@ namespace {      /// Iteratively perform simplification on a worklist of users of the      /// specified induction variable. This is the top-level driver that applies -    /// all simplicitions to users of an IV. +    /// all simplifications to users of an IV.      void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);      Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); +    bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); +      bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);      void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);      void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, @@ -166,19 +169,65 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {    S = SE->getSCEVAtScope(S, ICmpLoop);    X = SE->getSCEVAtScope(X, ICmpLoop); +  ICmpInst::Predicate InvariantPredicate; +  const SCEV *InvariantLHS, *InvariantRHS; +    // If the condition is always true or always false, replace it with    // a constant value. 
-  if (SE->isKnownPredicate(Pred, S, X)) +  if (SE->isKnownPredicate(Pred, S, X)) {      ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); -  else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) +    DeadInsts.emplace_back(ICmp); +    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); +  } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {      ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); -  else +    DeadInsts.emplace_back(ICmp); +    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); +  } else if (isa<PHINode>(IVOperand) && +             SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop, +                                          InvariantPredicate, InvariantLHS, +                                          InvariantRHS)) { + +    // Rewrite the comparison to a loop invariant comparison if it can be done +    // cheaply, where cheaply means "we don't need to emit any new +    // instructions". + +    Value *NewLHS = nullptr, *NewRHS = nullptr; + +    if (S == InvariantLHS || X == InvariantLHS) +      NewLHS = +          ICmp->getOperand(S == InvariantLHS ? IVOperIdx : (1 - IVOperIdx)); + +    if (S == InvariantRHS || X == InvariantRHS) +      NewRHS = +          ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx)); + +    for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) { +      if (NewLHS && NewRHS) +        break; + +      const SCEV *IncomingS = SE->getSCEV(Incoming); + +      if (!NewLHS && IncomingS == InvariantLHS) +        NewLHS = Incoming; +      if (!NewRHS && IncomingS == InvariantRHS) +        NewRHS = Incoming; +    } + +    if (!NewLHS || !NewRHS) +      // We could not find an existing value to replace either LHS or RHS. +      // Generating new instructions has subtler tradeoffs, so avoid doing that +      // for now. +      return; + +    DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n'); +    ICmp->setPredicate(InvariantPredicate); +    ICmp->setOperand(0, NewLHS); +    ICmp->setOperand(1, NewRHS); +  } else      return; -  DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');    ++NumElimCmp;    Changed = true; -  DeadInsts.emplace_back(ICmp);  }  /// SimplifyIVUsers helper for eliminating useless @@ -207,8 +256,7 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,      Rem->replaceAllUsesWith(Rem->getOperand(0));    else {      // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n). -    const SCEV *LessOne = -      SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); +    const SCEV *LessOne = SE->getMinusSCEV(S, SE->getOne(S->getType()));      if (IsSigned && !SE->isKnownNonNegative(LessOne))        return; @@ -232,9 +280,9 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,    DeadInsts.emplace_back(Rem);  } -/// Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -/// IVOperand is guaranteed SCEVable, but UseInst may not be. +/// Eliminate an operation that consumes a simple IV and has no observable +/// side-effect given the range of IV values.  IVOperand is guaranteed SCEVable, +/// but UseInst may not be.  
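// An illustrative picture of the new isLoopInvariantPredicate path above,
// written as IR in comments (whether ScalarEvolution actually proves the
// equivalence depends on the recurrence and on %n; the point is that the
// rewrite only reuses values that already exist, such as the PHI's incoming
// start value, and bails out rather than emit new instructions):
//
//   loop:
//     %iv  = phi i64 [ %start, %preheader ], [ %iv.next, %loop ]
//     %cmp = icmp slt i64 %iv, %n       ; truth value proven loop-invariant
//
// is rewritten in place to
//
//   loop:
//     %iv  = phi i64 [ %start, %preheader ], [ %iv.next, %loop ]
//     %cmp = icmp slt i64 %start, %n    ; invariant operands, possibly with an
//                                       ; adjusted (invariant) predicate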
bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,                                       Instruction *IVOperand) {    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { @@ -249,12 +297,45 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,      }    } -  // Eliminate any operation that SCEV can prove is an identity function. +  if (eliminateIdentitySCEV(UseInst, IVOperand)) +    return true; + +  return false; +} + +/// Eliminate any operation that SCEV can prove is an identity function. +bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, +                                           Instruction *IVOperand) {    if (!SE->isSCEVable(UseInst->getType()) ||        (UseInst->getType() != IVOperand->getType()) ||        (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))      return false; +  // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the +  // dominator tree, even if X is an operand to Y.  For instance, in +  // +  //     %iv = phi i32 {0,+,1} +  //     br %cond, label %left, label %merge +  // +  //   left: +  //     %X = add i32 %iv, 0 +  //     br label %merge +  // +  //   merge: +  //     %M = phi (%X, %iv) +  // +  // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and +  // %M.replaceAllUsesWith(%X) would be incorrect. + +  if (isa<PHINode>(UseInst)) +    // If UseInst is not a PHI node then we know that IVOperand dominates +    // UseInst directly from the legality of SSA. +    if (!DT || !DT->dominates(IVOperand, UseInst)) +      return false; + +  if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) +    return false; +    DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');    UseInst->replaceAllUsesWith(IVOperand); @@ -436,8 +517,8 @@ static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {  /// This algorithm does not require IVUsers analysis. Instead, it simplifies  /// instructions in-place during analysis. Rather than rewriting induction  /// variables bottom-up from their users, it transforms a chain of IVUsers -/// top-down, updating the IR only when it encouters a clear optimization -/// opportunitiy. +/// top-down, updating the IR only when it encounters a clear optimization +/// opportunity.  ///  /// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.  /// @@ -513,22 +594,21 @@ void IVVisitor::anchor() { }  /// Simplify instructions that use this induction variable  /// by using ScalarEvolution to analyze the IV's recurrence. -bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, -                       SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) -{ -  LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); -  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); +bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, +                       LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead, +                       IVVisitor *V) { +  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead);    SIV.simplifyUsers(CurrIV, V);    return SIV.hasChanged();  }  /// Simplify users of induction variables within this  /// loop. This does not actually change or add IVs. 
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, -                     SmallVectorImpl<WeakVH> &Dead) { +bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT, +                     LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead) {    bool Changed = false;    for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { -    Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead); +    Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead);    }    return Changed;  } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index c499c87b1f0b..d5377f9a4c1f 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -20,12 +20,12 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/Type.h"  #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Transforms/Utils/Local.h"  using namespace llvm; @@ -64,7 +64,7 @@ namespace {            // Here be subtlety: the iterator must be incremented before the loop            // body (not sure why), so a range-for loop won't work here.            for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { -            Instruction *I = BI++; +            Instruction *I = &*BI++;              // The first time through the loop ToSimplify is empty and we try to              // simplify all instructions.  On later iterations ToSimplify is not              // empty and we only bother simplifying instructions that are in it. 
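// A hedged call-site sketch for the updated simplifyLoopIVs() interface above
// (the surrounding variables L, SE, DT and LI are assumed to have been
// computed by the calling pass, as IndVarSimplify does): DominatorTree and
// LoopInfo are now passed in directly instead of going through LPPassManager.
  SmallVector<WeakVH, 16> DeadInsts;
  bool Changed = simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
  // Dead IV users recorded during simplification are deleted afterwards; a
  // WeakVH may have been nulled out in the meantime, hence the null check.
  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    if (auto *Inst = dyn_cast_or_null<Instruction>(V))
      RecursivelyDeleteTriviallyDeadInstructions(Inst);
  }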
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 6bbf8287e223..81dea6d1b9ae 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -18,6 +18,7 @@  #include "llvm/ADT/SmallString.h"  #include "llvm/ADT/StringMap.h"  #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DiagnosticInfo.h" @@ -30,8 +31,8 @@  #include "llvm/IR/PatternMatch.h"  #include "llvm/Support/Allocator.h"  #include "llvm/Support/CommandLine.h" -#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  using namespace PatternMatch; @@ -52,16 +53,8 @@ static cl::opt<bool>  //===----------------------------------------------------------------------===//  static bool ignoreCallingConv(LibFunc::Func Func) { -  switch (Func) { -  case LibFunc::abs: -  case LibFunc::labs: -  case LibFunc::llabs: -  case LibFunc::strlen: -    return true; -  default: -    return false; -  } -  llvm_unreachable("All cases should be covered in the switch."); +  return Func == LibFunc::abs || Func == LibFunc::labs || +         Func == LibFunc::llabs || Func == LibFunc::strlen;  }  /// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the @@ -93,16 +86,13 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {  }  static bool callHasFloatingPointArgument(const CallInst *CI) { -  for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); -       it != e; ++it) { -    if ((*it)->getType()->isFloatingPointTy()) -      return true; -  } -  return false; +  return std::any_of(CI->op_begin(), CI->op_end(), [](const Use &OI) { +    return OI->getType()->isFloatingPointTy(); +  });  }  /// \brief Check whether the overloaded unary floating point function -/// corresponing to \a Ty is available. +/// corresponding to \a Ty is available.  static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,                              LibFunc::Func DoubleFn, LibFunc::Func FloatFn,                              LibFunc::Func LongDoubleFn) { @@ -116,6 +106,23 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,    }  } +/// \brief Check whether we can use unsafe floating point math for +/// the function passed as input. +static bool canUseUnsafeFPMath(Function *F) { + +  // FIXME: For finer-grain optimization, we need intrinsics to have the same +  // fast-math flag decorations that are applied to FP instructions. For now, +  // we have to rely on the function-level unsafe-fp-math attribute to do this +  // optimization because there's no other way to express that the call can be +  // relaxed. +  if (F->hasFnAttribute("unsafe-fp-math")) { +    Attribute Attr = F->getFnAttribute("unsafe-fp-math"); +    if (Attr.getValueAsString() == "true") +      return true; +  } +  return false; +} +  /// \brief Returns whether \p F matches the signature expected for the  /// string/memory copying library function \p Func.  /// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. 
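// For reference, the attribute the new canUseUnsafeFPMath() helper keys on is
// a plain string attribute on the function.  In textual IR it looks like
//
//   define double @foo(double %x) #0 { ... }
//   attributes #0 = { "unsafe-fp-math"="true" }
//
// and is typically attached by the front end under -ffast-math style options.
// Caller-side use is then simply (F being the function under simplification):
  if (canUseUnsafeFPMath(F)) {
    // Free to apply value-changing rewrites such as pow(x, 0.5) -> sqrt(x),
    // as the pow/log/tan changes further down in this file do.
  }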
@@ -467,9 +474,6 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {  Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -  // Verify the "stpcpy" function prototype. -  FunctionType *FT = Callee->getFunctionType(); -    if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy))      return nullptr; @@ -484,7 +488,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {    if (Len == 0)      return nullptr; -  Type *PT = FT->getParamType(0); +  Type *PT = Callee->getFunctionType()->getParamType(0);    Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);    Value *DstEnd =        B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); @@ -497,8 +501,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {  Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -    if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy))      return nullptr; @@ -531,7 +533,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {    if (Len > SrcLen + 1)      return nullptr; -  Type *PT = FT->getParamType(0); +  Type *PT = Callee->getFunctionType()->getParamType(0);    // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]    B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); @@ -862,6 +864,27 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {      return B.CreateSub(LHSV, RHSV, "chardiff");    } +  // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 +  if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { + +    IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); +    unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); + +    if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment && +        getKnownAlignment(RHS, DL, CI) >= PrefAlignment) { + +      Type *LHSPtrTy = +          IntType->getPointerTo(LHS->getType()->getPointerAddressSpace()); +      Type *RHSPtrTy = +          IntType->getPointerTo(RHS->getType()->getPointerAddressSpace()); + +      Value *LHSV = B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv"); +      Value *RHSV = B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv"); + +      return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp"); +    } +  } +    // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)    StringRef LHSStr, RHSStr;    if (getConstantStringInfo(LHS, LHSStr) && @@ -972,7 +995,7 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,    // floor((double)floatval) -> (double)floorf(floatval)    if (Callee->isIntrinsic()) { -    Module *M = CI->getParent()->getParent()->getParent(); +    Module *M = CI->getModule();      Intrinsic::ID IID = Callee->getIntrinsicID();      Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy());      V = B.CreateCall(F, V); @@ -1015,9 +1038,9 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {  Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction();    Value *Ret = nullptr; -  if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { +  StringRef Name = Callee->getName(); +  if (UnsafeFPShrink && Name == "cos" && 
hasFloatVersion(Name))      Ret = optimizeUnaryDoubleFP(CI, B, true); -  }    FunctionType *FT = Callee->getFunctionType();    // Just make sure this has 1 argument of FP type, which matches the @@ -1035,13 +1058,37 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {    return Ret;  } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { +  // Multiplications calculated using Addition Chains. +  // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + +  assert(Exp != 0 && "Incorrect exponent 0 not handled"); + +  if (InnerChain[Exp]) +    return InnerChain[Exp]; + +  static const unsigned AddChain[33][2] = { +      {0, 0}, // Unused. +      {0, 0}, // Unused (base case = pow1). +      {1, 1}, // Unused (pre-computed). +      {1, 2},  {2, 2},   {2, 3},  {3, 3},   {2, 5},  {4, 4}, +      {1, 8},  {5, 5},   {1, 10}, {6, 6},   {4, 9},  {7, 7}, +      {3, 12}, {8, 8},   {8, 9},  {2, 16},  {1, 18}, {10, 10}, +      {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, +      {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, +  }; + +  InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), +                                 getPow(InnerChain, AddChain[Exp][1], B)); +  return InnerChain[Exp]; +} +  Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -    Value *Ret = nullptr; -  if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { +  StringRef Name = Callee->getName(); +  if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name))      Ret = optimizeUnaryDoubleFP(CI, B, true); -  }    FunctionType *FT = Callee->getFunctionType();    // Just make sure this has 2 arguments of the same FP type, which match the @@ -1060,7 +1107,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {      if (Op1C->isExactlyValue(2.0) &&          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,                          LibFunc::exp2l)) -      return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); +      return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, +                                  Callee->getAttributes());      // pow(10.0, x) -> exp10(x)      if (Op1C->isExactlyValue(10.0) &&          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, @@ -1069,6 +1117,32 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {                                    Callee->getAttributes());    } +  bool unsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); + +  // pow(exp(x), y) -> exp(x*y) +  // pow(exp2(x), y) -> exp2(x * y) +  // We enable these only under fast-math. Besides rounding +  // differences the transformation changes overflow and +  // underflow behavior quite dramatically. +  // Example: x = 1000, y = 0.001. +  // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1). 
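// A worked example of the addition-chain expansion implemented by getPow()
// above (standalone sketch): each table entry AddChain[n] = {a, b} satisfies
// a + b == n, so x^n is the product of two previously memoized powers.  For
// n = 11 the lookups 11={1,10}, 10={5,5}, 5={2,3}, 3={1,2} bottom out at the
// pre-computed x^2, giving five multiplies in total -- within the documented
// budget of at most 7 fmuls for exponents up to 32:
static double pow11(double x) {
  double x2  = x * x;    // x^2  (pre-computed as InnerChain[2] in the pass)
  double x3  = x * x2;   // x^3  = x^1 * x^2
  double x5  = x2 * x3;  // x^5  = x^2 * x^3
  double x10 = x5 * x5;  // x^10 = x^5 * x^5
  return x * x10;        // x^11 = x^1 * x^10
}
// A negative exponent only adds a final reciprocal (1.0 / x^n), which the
// code below handles with a trailing FDiv.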
+  if (unsafeFPMath) { +    if (auto *OpC = dyn_cast<CallInst>(Op1)) { +      IRBuilder<>::FastMathFlagGuard Guard(B); +      FastMathFlags FMF; +      FMF.setUnsafeAlgebra(); +      B.SetFastMathFlags(FMF); + +      LibFunc::Func Func; +      Function *OpCCallee = OpC->getCalledFunction(); +      if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) && +          TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) +        return EmitUnaryFloatFnCall( +            B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"), +            OpCCallee->getName(), B, OpCCallee->getAttributes()); +    } +  } +    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);    if (!Op2C)      return Ret; @@ -1081,10 +1155,15 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {                        LibFunc::sqrtl) &&        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,                        LibFunc::fabsl)) { + +    // In -ffast-math, pow(x, 0.5) -> sqrt(x). +    if (unsafeFPMath) +      return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, +                                  Callee->getAttributes()); +      // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).      // This is faster than calling pow, and still handles negative zero      // and negative infinity correctly. -    // TODO: In fast-math mode, this could be just sqrt(x).      // TODO: In finite-only mode, this could be just fabs(sqrt(x)).      Value *Inf = ConstantFP::getInfinity(CI->getType());      Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); @@ -1102,18 +1181,42 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {      return B.CreateFMul(Op1, Op1, "pow2");    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + +  // In -ffast-math, generate repeated fmul instead of generating pow(x, n). +  if (unsafeFPMath) { +    APFloat V = abs(Op2C->getValueAPF()); +    // We limit to a max of 7 fmul(s). Thus max exponent is 32. +    // This transformation applies to integer exponents only. +    if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || +        !V.isInteger()) +      return nullptr; + +    // We will memoize intermediate products of the Addition Chain. +    Value *InnerChain[33] = {nullptr}; +    InnerChain[1] = Op1; +    InnerChain[2] = B.CreateFMul(Op1, Op1); + +    // We cannot readily convert a non-double type (like float) to a double. +    // So we first convert V to something which could be converted to double. +    bool ignored; +    V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); +    Value *FMul = getPow(InnerChain, V.convertToDouble(), B); +    // For negative exponents simply compute the reciprocal. 
+    if (Op2C->isNegative()) +      FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); +    return FMul; +  } +    return nullptr;  }  Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction();    Function *Caller = CI->getParent()->getParent(); -    Value *Ret = nullptr; -  if (UnsafeFPShrink && Callee->getName() == "exp2" && -      TLI->has(LibFunc::exp2f)) { +  StringRef Name = Callee->getName(); +  if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name))      Ret = optimizeUnaryDoubleFP(CI, B, true); -  }    FunctionType *FT = Callee->getFunctionType();    // Just make sure this has 1 argument of FP type, which matches the @@ -1162,11 +1265,10 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {  Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -    Value *Ret = nullptr; -  if (Callee->getName() == "fabs" && TLI->has(LibFunc::fabsf)) { +  StringRef Name = Callee->getName(); +  if (Name == "fabs" && hasFloatVersion(Name))      Ret = optimizeUnaryDoubleFP(CI, B, false); -  }    FunctionType *FT = Callee->getFunctionType();    // Make sure this has 1 argument of FP type which matches the result type. @@ -1184,6 +1286,105 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {    return Ret;  } +Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { +  // If we can shrink the call to a float function rather than a double +  // function, do that first. +  Function *Callee = CI->getCalledFunction(); +  StringRef Name = Callee->getName(); +  if ((Name == "fmin" && hasFloatVersion(Name)) || +      (Name == "fmax" && hasFloatVersion(Name))) { +    Value *Ret = optimizeBinaryDoubleFP(CI, B); +    if (Ret) +      return Ret; +  } + +  // Make sure this has 2 arguments of FP type which match the result type. +  FunctionType *FT = Callee->getFunctionType(); +  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || +      FT->getParamType(0) != FT->getParamType(1) || +      !FT->getParamType(0)->isFloatingPointTy()) +    return nullptr; + +  IRBuilder<>::FastMathFlagGuard Guard(B); +  FastMathFlags FMF; +  Function *F = CI->getParent()->getParent(); +  if (canUseUnsafeFPMath(F)) { +    // Unsafe algebra sets all fast-math-flags to true. +    FMF.setUnsafeAlgebra(); +  } else { +    // At a minimum, no-nans-fp-math must be true. +    Attribute Attr = F->getFnAttribute("no-nans-fp-math"); +    if (Attr.getValueAsString() != "true") +      return nullptr; +    // No-signed-zeros is implied by the definitions of fmax/fmin themselves: +    // "Ideally, fmax would be sensitive to the sign of zero, for example +    // fmax(-0. 0, +0. 0) would return +0; however, implementation in software +    // might be impractical." +    FMF.setNoSignedZeros(); +    FMF.setNoNaNs(); +  } +  B.SetFastMathFlags(FMF); + +  // We have a relaxed floating-point environment. We can ignore NaN-handling +  // and transform to a compare and select. We do not have to consider errno or +  // exceptions, because fmin/fmax do not have those. +  Value *Op0 = CI->getArgOperand(0); +  Value *Op1 = CI->getArgOperand(1); +  Value *Cmp = Callee->getName().startswith("fmin") ? 
+    B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1); +  return B.CreateSelect(Cmp, Op0, Op1); +} + +Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { +  Function *Callee = CI->getCalledFunction(); +  Value *Ret = nullptr; +  StringRef Name = Callee->getName(); +  if (UnsafeFPShrink && hasFloatVersion(Name)) +    Ret = optimizeUnaryDoubleFP(CI, B, true); +  FunctionType *FT = Callee->getFunctionType(); + +  // Just make sure this has 1 argument of FP type, which matches the +  // result type. +  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || +      !FT->getParamType(0)->isFloatingPointTy()) +    return Ret; + +  if (!canUseUnsafeFPMath(CI->getParent()->getParent())) +    return Ret; +  Value *Op1 = CI->getArgOperand(0); +  auto *OpC = dyn_cast<CallInst>(Op1); +  if (!OpC) +    return Ret; + +  // log(pow(x,y)) -> y*log(x) +  // This is only applicable to log, log2, log10. +  if (Name != "log" && Name != "log2" && Name != "log10") +    return Ret; + +  IRBuilder<>::FastMathFlagGuard Guard(B); +  FastMathFlags FMF; +  FMF.setUnsafeAlgebra(); +  B.SetFastMathFlags(FMF); + +  LibFunc::Func Func; +  Function *F = OpC->getCalledFunction(); +  if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && +      Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) +    return B.CreateFMul(OpC->getArgOperand(1), +      EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, +                           Callee->getAttributes()), "mul"); + +  // log(exp2(y)) -> y*log(2) +  if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && +      TLI->has(Func) && Func == LibFunc::exp2) +    return B.CreateFMul( +        OpC->getArgOperand(0), +        EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), +                             Callee->getName(), B, Callee->getAttributes()), +        "logmul"); +  return Ret; +} +  Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); @@ -1191,19 +1392,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {    if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||                                     Callee->getIntrinsicID() == Intrinsic::sqrt))      Ret = optimizeUnaryDoubleFP(CI, B, true); +  if (!canUseUnsafeFPMath(CI->getParent()->getParent())) +    return Ret; -  // FIXME: For finer-grain optimization, we need intrinsics to have the same -  // fast-math flag decorations that are applied to FP instructions. For now, -  // we have to rely on the function-level unsafe-fp-math attribute to do this -  // optimization because there's no other way to express that the sqrt can be -  // reassociated. -  Function *F = CI->getParent()->getParent(); -  if (F->hasFnAttribute("unsafe-fp-math")) { -    // Check for unsafe-fp-math = true. -    Attribute Attr = F->getFnAttribute("unsafe-fp-math"); -    if (Attr.getValueAsString() != "true") -      return Ret; -  }    Value *Op = CI->getArgOperand(0);    if (Instruction *I = dyn_cast<Instruction>(Op)) {      if (I->getOpcode() == Instruction::FMul && I->hasUnsafeAlgebra()) { @@ -1238,8 +1429,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {          // and multiply.          // FIXME: We're not checking the sqrt because it doesn't have          // fast-math-flags (see earlier comment). 
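// What the optimizeFMinFMax() lowering above amounts to at the source level
// (illustrative; only valid under the relaxed-FP conditions the code checks,
// i.e. NaNs and the sign of zero need not be honoured):
static double fmin_relaxed(double a, double b) { return a < b ? a : b; }
static double fmax_relaxed(double a, double b) { return a > b ? a : b; }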
-        IRBuilder<true, ConstantFolder, -          IRBuilderDefaultInserter<true> >::FastMathFlagGuard Guard(B); +        IRBuilder<>::FastMathFlagGuard Guard(B);          B.SetFastMathFlags(I->getFastMathFlags());          // If we found a repeated factor, hoist it out of the square root and          // replace it with the fabs of that factor. @@ -1262,6 +1452,40 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {    return Ret;  } +Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { +  Function *Callee = CI->getCalledFunction(); +  Value *Ret = nullptr; +  StringRef Name = Callee->getName(); +  if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) +    Ret = optimizeUnaryDoubleFP(CI, B, true); +  FunctionType *FT = Callee->getFunctionType(); + +  // Just make sure this has 1 argument of FP type, which matches the +  // result type. +  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || +      !FT->getParamType(0)->isFloatingPointTy()) +    return Ret; + +  if (!canUseUnsafeFPMath(CI->getParent()->getParent())) +    return Ret; +  Value *Op1 = CI->getArgOperand(0); +  auto *OpC = dyn_cast<CallInst>(Op1); +  if (!OpC) +    return Ret; + +  // tan(atan(x)) -> x +  // tanf(atanf(x)) -> x +  // tanl(atanl(x)) -> x +  LibFunc::Func Func; +  Function *F = OpC->getCalledFunction(); +  if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && +      ((Func == LibFunc::atan && Callee->getName() == "tan") || +       (Func == LibFunc::atanf && Callee->getName() == "tanf") || +       (Func == LibFunc::atanl && Callee->getName() == "tanl"))) +    Ret = OpC->getArgOperand(0); +  return Ret; +} +  static bool isTrigLibCall(CallInst *CI);  static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,                               bool UseFloat, Value *&Sin, Value *&Cos, @@ -1329,9 +1553,9 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,      return;    Function *Callee = CI->getCalledFunction(); -  StringRef FuncName = Callee->getName();    LibFunc::Func Func; -  if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || !isTrigLibCall(CI)) +  if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || +      !isTrigLibCall(CI))      return;    if (IsFloat) { @@ -1353,10 +1577,8 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,  void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls,                                           Value *Res) { -  for (SmallVectorImpl<CallInst *>::iterator I = Calls.begin(), E = Calls.end(); -       I != E; ++I) { -    replaceAllUsesWith(*I, Res); -  } +  for (CallInst *C : Calls) +    replaceAllUsesWith(C, Res);  }  void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, @@ -1387,8 +1609,7 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,    if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {      // If the argument is an instruction, it must dominate all uses so put our      // sincos call there. -    BasicBlock::iterator Loc = ArgInst; -    B.SetInsertPoint(ArgInst->getParent(), ++Loc); +    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());    } else {      // Otherwise (e.g. for a constant) the beginning of the function is as      // good a place as any. 
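// The tan(atan(x)) fold added in optimizeTan() above, seen at the source
// level (a sketch; the fold is only attempted under the unsafe-FP-math check,
// since the floating-point round trip is not bit-exact):
#include <cmath>
static double tanAtanBefore(double x) { return std::tan(std::atan(x)); }
static double tanAtanAfter(double x)  { return x; }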
@@ -1413,15 +1634,16 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,  // Integer Library Call Optimizations  //===----------------------------------------------------------------------===// +static bool checkIntUnaryReturnAndParam(Function *Callee) { +  FunctionType *FT = Callee->getFunctionType(); +  return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && +    FT->getParamType(0)->isIntegerTy(); +} +  Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {    Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  // Just make sure this has 2 arguments of the same FP type, which match the -  // result type. -  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || -      !FT->getParamType(0)->isIntegerTy()) +  if (!checkIntUnaryReturnAndParam(Callee))      return nullptr; -    Value *Op = CI->getArgOperand(0);    // Constant fold. @@ -1436,7 +1658,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {    Type *ArgType = Op->getType();    Value *F =        Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); -  Value *V = B.CreateCall(F, {Op, B.getFalse()}, "cttz"); +  Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");    V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));    V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1461,11 +1683,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { -  Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  // We require integer(i32) -  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || -      !FT->getParamType(0)->isIntegerTy(32)) +  if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))      return nullptr;    // isdigit(c) -> (c-'0') <u 10 @@ -1476,11 +1694,7 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { -  Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  // We require integer(i32) -  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || -      !FT->getParamType(0)->isIntegerTy(32)) +  if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))      return nullptr;    // isascii(c) -> c <u 128 @@ -1490,11 +1704,7 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { -  Function *Callee = CI->getCalledFunction(); -  FunctionType *FT = Callee->getFunctionType(); -  // We require i32(i32) -  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || -      !FT->getParamType(0)->isIntegerTy(32)) +  if (!checkIntUnaryReturnAndParam(CI->getCalledFunction()))      return nullptr;    // toascii(c) -> c & 0x7f @@ -1529,10 +1739,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,  }  static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { -  if (!ColdErrorCalls) -    return false; - -  if (!Callee || !Callee->isDeclaration()) +  if (!ColdErrorCalls || !Callee || !Callee->isDeclaration())      return false;    if (StreamArg < 0) @@ -1968,16 +2175,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {    // Command-line parameter overrides function attribute.    
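// Source-level equivalents of the integer libcall folds above (illustrative
// sketches; the pass emits the corresponding IR using the cttz intrinsic --
// now with its is_zero_undef flag set to true -- and unsigned compares):
#include <cstdint>
static int ffs_lowered(uint32_t x) {
  // ffs(x) = cttz(x) + 1 for nonzero x, and ffs(0) = 0 (kept correct here by
  // the ternary, since ctz of zero is undefined).  __builtin_ctz is the
  // GCC/Clang builtin standing in for the cttz intrinsic.
  return x == 0 ? 0 : __builtin_ctz(x) + 1;
}
static bool isdigit_lowered(int c) { return (unsigned)(c - '0') < 10u; }  // (c-'0') <u 10
static bool isascii_lowered(int c) { return (unsigned)c < 128u; }         // c <u 128
static int  toascii_lowered(int c) { return c & 0x7f; }                   // c & 0x7f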
if (EnableUnsafeFPShrink.getNumOccurrences() > 0)      UnsafeFPShrink = EnableUnsafeFPShrink; -  else if (Callee->hasFnAttribute("unsafe-fp-math")) { -    // FIXME: This is the same problem as described in optimizeSqrt(). -    // If calls gain access to IR-level FMF, then use that instead of a -    // function attribute. - -    // Check for unsafe-fp-math = true. -    Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); -    if (Attr.getValueAsString() == "true") -      UnsafeFPShrink = true; -  } +  else if (canUseUnsafeFPMath(Callee)) +    UnsafeFPShrink = true;    // First, check for intrinsics.    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { @@ -1990,6 +2189,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {        return optimizeExp2(CI, Builder);      case Intrinsic::fabs:        return optimizeFabs(CI, Builder); +    case Intrinsic::log: +      return optimizeLog(CI, Builder);      case Intrinsic::sqrt:        return optimizeSqrt(CI, Builder);      default: @@ -2001,13 +2202,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {    if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) {      // Try to further simplify the result.      CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); -    if (SimplifiedCI && SimplifiedCI->getCalledFunction()) -      if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { +    if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { +      // Use an IR Builder from SimplifiedCI if available instead of CI +      // to guarantee we reach all uses we might replace later on. +      IRBuilder<> TmpBuilder(SimplifiedCI); +      if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) {          // If we were able to further simplify, remove the now redundant call.          
SimplifiedCI->replaceAllUsesWith(V);          SimplifiedCI->eraseFromParent();          return V;        } +    }      return SimplifiedFortifiedCI;    } @@ -2068,8 +2273,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {        return optimizeFWrite(CI, Builder);      case LibFunc::fputs:        return optimizeFPuts(CI, Builder); +    case LibFunc::log: +    case LibFunc::log10: +    case LibFunc::log1p: +    case LibFunc::log2: +    case LibFunc::logb: +      return optimizeLog(CI, Builder);      case LibFunc::puts:        return optimizePuts(CI, Builder); +    case LibFunc::tan: +    case LibFunc::tanf: +    case LibFunc::tanl: +      return optimizeTan(CI, Builder);      case LibFunc::perror:        return optimizeErrorReporting(CI, Builder);      case LibFunc::vfprintf: @@ -2097,24 +2312,23 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {      case LibFunc::exp:      case LibFunc::exp10:      case LibFunc::expm1: -    case LibFunc::log: -    case LibFunc::log10: -    case LibFunc::log1p: -    case LibFunc::log2: -    case LibFunc::logb:      case LibFunc::sin:      case LibFunc::sinh: -    case LibFunc::tan:      case LibFunc::tanh:        if (UnsafeFPShrink && hasFloatVersion(FuncName))          return optimizeUnaryDoubleFP(CI, Builder, true);        return nullptr;      case LibFunc::copysign: -    case LibFunc::fmin: -    case LibFunc::fmax:        if (hasFloatVersion(FuncName))          return optimizeBinaryDoubleFP(CI, Builder);        return nullptr; +    case LibFunc::fminf: +    case LibFunc::fmin: +    case LibFunc::fminl: +    case LibFunc::fmaxf: +    case LibFunc::fmax: +    case LibFunc::fmaxl: +      return optimizeFMinFMax(CI, Builder);      default:        return nullptr;      } @@ -2133,37 +2347,27 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {    Replacer(I, With);  } -/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, -                                                             Value *With) { -  I->replaceAllUsesWith(With); -  I->eraseFromParent(); -} -  // TODO:  //   Additional cases that we need to add to this file:  //  // cbrt:  //   * cbrt(expN(X))  -> expN(x/3)  //   * cbrt(sqrt(x))  -> pow(x,1/6) -//   * cbrt(sqrt(x))  -> pow(x,1/9) +//   * cbrt(cbrt(x))  -> pow(x,1/9)  //  // exp, expf, expl:  //   * exp(log(x))  -> x  //  // log, logf, logl:  //   * log(exp(x))   -> x -//   * log(x**y)     -> y*log(x)  //   * log(exp(y))   -> y*log(e) -//   * log(exp2(y))  -> y*log(2)  //   * log(exp10(y)) -> y*log(10)  //   * log(sqrt(x))  -> 0.5*log(x) -//   * log(pow(x,y)) -> y*log(x)  //  // lround, lroundf, lroundl:  //   * lround(cnst) -> cnst'  //  // pow, powf, powl: -//   * pow(exp(x),y)  -> exp(x*y)  //   * pow(sqrt(x),y) -> pow(x,y*0.5)  //   * pow(pow(x,y),z)-> pow(x,y*z)  // @@ -2179,9 +2383,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {  //   * sqrt(Nroot(x)) -> pow(x,1/(2*N))  //   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)  // -// tan, tanf, tanl: -//   * tan(atan(x)) -> x -//  // trunc, truncf, truncl:  //   * trunc(cnst) -> cnst'  // diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp new file mode 100644 index 000000000000..ad6b782caf8b --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -0,0 +1,85 @@ +//===- SplitModule.cpp - Split a module into partitions -------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed 
under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the function llvm::SplitModule, which splits a module +// into multiple linkable partitions. It can be used to implement parallel code +// generation for link-time optimization. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +static void externalize(GlobalValue *GV) { +  if (GV->hasLocalLinkage()) { +    GV->setLinkage(GlobalValue::ExternalLinkage); +    GV->setVisibility(GlobalValue::HiddenVisibility); +  } + +  // Unnamed entities must be named consistently between modules. setName will +  // give a distinct name to each such entity. +  if (!GV->hasName()) +    GV->setName("__llvmsplit_unnamed"); +} + +// Returns whether GV should be in partition (0-based) I of N. +static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { +  if (auto GA = dyn_cast<GlobalAlias>(GV)) +    if (const GlobalObject *Base = GA->getBaseObject()) +      GV = Base; + +  StringRef Name; +  if (const Comdat *C = GV->getComdat()) +    Name = C->getName(); +  else +    Name = GV->getName(); + +  // Partition by MD5 hash. We only need a few bits for evenness as the number +  // of partitions will generally be in the 1-2 figure range; the low 16 bits +  // are enough. +  MD5 H; +  MD5::MD5Result R; +  H.update(Name); +  H.final(R); +  return (R[0] | (R[1] << 8)) % N == I; +} + +void llvm::SplitModule( +    std::unique_ptr<Module> M, unsigned N, +    std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) { +  for (Function &F : *M) +    externalize(&F); +  for (GlobalVariable &GV : M->globals()) +    externalize(&GV); +  for (GlobalAlias &GA : M->aliases()) +    externalize(&GA); + +  // FIXME: We should be able to reuse M as the last partition instead of +  // cloning it. 
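isInPartition above decides placement purely from a name hash: each global (or the comdat it must stay grouped with) is hashed and assigned to partition hash % N, so the choice is deterministic and needs no coordination between the N module clones produced by the partition loop that follows. A rough standalone sketch of the same idea, using FNV-1a instead of LLVM's MD5 helper purely to stay dependency-free (the hash choice and the symbol names here are illustrative assumptions, not what SplitModule does verbatim):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hash a symbol name down to 16 bits; the particular hash only matters for
// evenness, mirroring the "low 16 bits are enough" comment above.
static uint16_t hashName(const std::string &Name) {
  uint64_t H = 0xcbf29ce484222325ull;            // FNV-1a offset basis
  for (unsigned char C : Name) {
    H ^= C;
    H *= 0x100000001b3ull;                       // FNV-1a prime
  }
  return static_cast<uint16_t>(H);
}

// Same shape as isInPartition(): a symbol lands in partition (hash % N).
static bool inPartition(const std::string &Name, unsigned I, unsigned N) {
  return hashName(Name) % N == I;
}

int main() {
  const unsigned N = 4;
  std::vector<std::string> Syms = {"foo", "bar", "__llvmsplit_unnamed", "baz"};
  for (unsigned I = 0; I != N; ++I) {
    std::cout << "partition " << I << ":";
    for (const auto &S : Syms)
      if (inPartition(S, I, N))
        std::cout << ' ' << S;
    std::cout << '\n';
  }
}

Because the decision depends only on the name, a definition and all of its uses agree on which partition owns the definition; that is what makes the externalize() renaming above necessary for values that were previously local or unnamed.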
+  for (unsigned I = 0; I != N; ++I) { +    ValueToValueMapTy VMap; +    std::unique_ptr<Module> MPart( +        CloneModule(M.get(), VMap, [=](const GlobalValue *GV) { +          return isInPartition(GV, I, N); +        })); +    if (I != 0) +      MPart->setModuleInlineAsm(""); +    ModuleCallback(std::move(MPart)); +  } +} diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index a2a54da8590c..1d1f602b041d 100644 --- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,7 +69,6 @@  #include "llvm/Support/SourceMgr.h"  #include "llvm/Support/YAMLParser.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h"  #include "llvm/Transforms/Utils/SymbolRewriter.h"  using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7e00a80989dc..6b1d1dae5f01 100644 --- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -50,11 +50,11 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) {    //    std::vector<BasicBlock*> ReturningBlocks;    std::vector<BasicBlock*> UnreachableBlocks; -  for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I) -    if (isa<ReturnInst>(I->getTerminator())) -      ReturningBlocks.push_back(I); -    else if (isa<UnreachableInst>(I->getTerminator())) -      UnreachableBlocks.push_back(I); +  for (BasicBlock &I : F) +    if (isa<ReturnInst>(I.getTerminator())) +      ReturningBlocks.push_back(&I); +    else if (isa<UnreachableInst>(I.getTerminator())) +      UnreachableBlocks.push_back(&I);    // Then unreachable blocks.    if (UnreachableBlocks.empty()) { diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp index 8c72641da9e7..1add78e01657 100644 --- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -19,11 +19,14 @@  #include "llvm/IR/InlineAsm.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h"  using namespace llvm;  // Out of line method to get vtable etc for class.  void ValueMapTypeRemapper::anchor() {}  void ValueMaterializer::anchor() {} +void ValueMaterializer::materializeInitFor(GlobalValue *New, GlobalValue *Old) { +}  Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,                        ValueMapTypeRemapper *TypeMapper, @@ -35,15 +38,28 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,    // If we have a materializer and it can materialize a value, use that.    if (Materializer) { -    if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) -      return VM[V] = NewV; +    if (Value *NewV = +            Materializer->materializeDeclFor(const_cast<Value *>(V))) { +      VM[V] = NewV; +      if (auto *NewGV = dyn_cast<GlobalValue>(NewV)) +        Materializer->materializeInitFor( +            NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V))); +      return NewV; +    }    }    // Global values do not need to be seeded into the VM if they    // are using the identity mapping. 
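Worth noting about the MapValue changes earlier in this hunk: the single materializeValueFor hook is split into materializeDeclFor plus materializeInitFor, and the newly materialized declaration is entered into VM before materializeInitFor runs, presumably so that an initializer can refer back to globals (including itself) that are still being materialized. A toy analogue of that two-phase scheme, cloning a possibly cyclic graph by registering empty clones first and wiring up edges second (the Node type and names are invented for illustration):

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  std::vector<Node *> Edges;   // may point back into the same graph (cycles)
};

static Node *cloneNode(Node *Old, std::map<Node *, std::unique_ptr<Node>> &VM) {
  auto It = VM.find(Old);
  if (It != VM.end())
    return It->second.get();

  // "Decl" phase: create the clone and register it before touching operands,
  // so anything reachable from Old can already map to the clone.
  VM[Old] = std::unique_ptr<Node>(new Node{Old->Name, {}});
  Node *New = VM[Old].get();

  // "Init" phase: now remap the operands/edges through the same map.
  for (Node *Succ : Old->Edges)
    New->Edges.push_back(cloneNode(Succ, VM));
  return New;
}

int main() {
  Node A{"a", {}}, B{"b", {}};
  A.Edges = {&B};
  B.Edges = {&A, &B};                            // cycle and self-reference

  std::map<Node *, std::unique_ptr<Node>> VM;
  Node *NewA = cloneNode(&A, VM);
  Node *NewB = NewA->Edges[0];
  std::cout << NewA->Name << " -> " << NewB->Name << '\n';             // a -> b
  std::cout << NewB->Name << " -> " << NewB->Edges[0]->Name << ", "
            << NewB->Edges[1]->Name << '\n';                           // b -> a, b
}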
-  if (isa<GlobalValue>(V)) +  if (isa<GlobalValue>(V)) { +    if (Flags & RF_NullMapMissingGlobalValues) { +      assert(!(Flags & RF_IgnoreMissingEntries) && +             "Illegal to specify both RF_NullMapMissingGlobalValues and " +             "RF_IgnoreMissingEntries"); +      return nullptr; +    }      return VM[V] = const_cast<Value*>(V); -   +  } +    if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {      // Inline asm may need *type* remapping.      FunctionType *NewTy = IA->getFunctionType(); @@ -73,7 +89,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,      // correct.  For now, just match behaviour from before the metadata/value      // split.      // -    //    assert(MappedMD && "Referenced metadata value not in value map"); +    //    assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) && +    //           "Referenced metadata value not in value map");      return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD);    } @@ -127,9 +144,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,        Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM,                               Flags, TypeMapper, Materializer));    } -   +  Type *NewSrcTy = nullptr; +  if (TypeMapper) +    if (auto *GEPO = dyn_cast<GEPOperator>(C)) +      NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); +    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) -    return VM[V] = CE->getWithOperands(Ops, NewTy); +    return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);    if (isa<ConstantArray>(C))      return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);    if (isa<ConstantStruct>(C)) @@ -146,29 +167,42 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,  }  static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, -                     Metadata *Val) { +                               Metadata *Val, ValueMaterializer *Materializer, +                               RemapFlags Flags) {    VM.MD()[Key].reset(Val); +  if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) { +    auto *N = dyn_cast_or_null<MDNode>(Val); +    // Need to invoke this once we have non-temporary MD. 
+    if (!N || !N->isTemporary()) +      Materializer->replaceTemporaryMetadata(Key, Val); +  }    return Val;  } -static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { -  return mapToMetadata(VM, MD, const_cast<Metadata *>(MD)); +static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD, +                           ValueMaterializer *Materializer, RemapFlags Flags) { +  return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags);  }  static Metadata *MapMetadataImpl(const Metadata *MD, -                                 SmallVectorImpl<MDNode *> &Cycles, +                                 SmallVectorImpl<MDNode *> &DistinctWorklist,                                   ValueToValueMapTy &VM, RemapFlags Flags,                                   ValueMapTypeRemapper *TypeMapper,                                   ValueMaterializer *Materializer); -static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, +static Metadata *mapMetadataOp(Metadata *Op, +                               SmallVectorImpl<MDNode *> &DistinctWorklist,                                 ValueToValueMapTy &VM, RemapFlags Flags,                                 ValueMapTypeRemapper *TypeMapper,                                 ValueMaterializer *Materializer) {    if (!Op)      return nullptr; -  if (Metadata *MappedOp = -          MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer)) + +  if (Materializer && !Materializer->isMetadataNeeded(Op)) +    return nullptr; + +  if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags, +                                           TypeMapper, Materializer))      return MappedOp;    // Use identity map if MappedOp is null and we can ignore missing entries.    if (Flags & RF_IgnoreMissingEntries) @@ -178,89 +212,113 @@ static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles,    // correct.  For now, just match behaviour from before the metadata/value    // split.    // -  //    llvm_unreachable("Referenced metadata not in value map!"); +  //    assert((Flags & RF_NullMapMissingGlobalValues) && +  //           "Referenced metadata not in value map!");    return nullptr;  } -/// \brief Remap nodes. +/// Resolve uniquing cycles involving the given metadata. +static void resolveCycles(Metadata *MD, bool MDMaterialized) { +  if (auto *N = dyn_cast_or_null<MDNode>(MD)) { +    if (!MDMaterialized && N->isTemporary()) +      return; +    if (!N->isResolved()) +      N->resolveCycles(MDMaterialized); +  } +} + +/// Remap the operands of an MDNode.  /// -/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. -/// Assumes that \c NewNode is already a clone of \c OldNode. +/// If \c Node is temporary, uniquing cycles are ignored.  If \c Node is +/// distinct, uniquing cycles are resolved as they're found.  /// -/// \pre \c NewNode is a clone of \c OldNode. -static bool remap(const MDNode *OldNode, MDNode *NewNode, -                  SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, -                  RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, -                  ValueMaterializer *Materializer) { -  assert(OldNode->getNumOperands() == NewNode->getNumOperands() && -         "Expected nodes to match"); -  assert(OldNode->isResolved() && "Expected resolved node"); -  assert(!NewNode->isUniqued() && "Expected non-uniqued node"); - -  // Map the node upfront so it's available for cyclic references. 
-  mapToMetadata(VM, OldNode, NewNode); -  bool AnyChanged = false; -  for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { -    Metadata *Old = OldNode->getOperand(I); -    assert(NewNode->getOperand(I) == Old && -           "Expected old operands to already be in place"); +/// \pre \c Node.isDistinct() or \c Node.isTemporary(). +static bool remapOperands(MDNode &Node, +                          SmallVectorImpl<MDNode *> &DistinctWorklist, +                          ValueToValueMapTy &VM, RemapFlags Flags, +                          ValueMapTypeRemapper *TypeMapper, +                          ValueMaterializer *Materializer) { +  assert(!Node.isUniqued() && "Expected temporary or distinct node"); +  const bool IsDistinct = Node.isDistinct(); -    Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, -                                  TypeMapper, Materializer); +  bool AnyChanged = false; +  for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) { +    Metadata *Old = Node.getOperand(I); +    Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper, +                                  Materializer);      if (Old != New) {        AnyChanged = true; -      NewNode->replaceOperandWith(I, New); +      Node.replaceOperandWith(I, New); + +      // Resolve uniquing cycles underneath distinct nodes on the fly so they +      // don't infect later operands. +      if (IsDistinct) +        resolveCycles(New, !(Flags & RF_HaveUnmaterializedMetadata));      }    }    return AnyChanged;  } -/// \brief Map a distinct MDNode. +/// Map a distinct MDNode.  /// -/// Distinct nodes are not uniqued, so they must always recreated. +/// Whether distinct nodes change is independent of their operands.  If \a +/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in +/// place; effectively, they're moved from one graph to another.  Otherwise, +/// they're cloned/duplicated, and the new copy's operands are remapped.  static Metadata *mapDistinctNode(const MDNode *Node, -                                 SmallVectorImpl<MDNode *> &Cycles, +                                 SmallVectorImpl<MDNode *> &DistinctWorklist,                                   ValueToValueMapTy &VM, RemapFlags Flags,                                   ValueMapTypeRemapper *TypeMapper,                                   ValueMaterializer *Materializer) {    assert(Node->isDistinct() && "Expected distinct node"); -  MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone()); -  remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer); +  MDNode *NewMD; +  if (Flags & RF_MoveDistinctMDs) +    NewMD = const_cast<MDNode *>(Node); +  else +    NewMD = MDNode::replaceWithDistinct(Node->clone()); -  // Track any cycles beneath this node. -  for (Metadata *Op : NewMD->operands()) -    if (auto *Node = dyn_cast_or_null<MDNode>(Op)) -      if (!Node->isResolved()) -        Cycles.push_back(Node); - -  return NewMD; +  // Remap operands later. +  DistinctWorklist.push_back(NewMD); +  return mapToMetadata(VM, Node, NewMD, Materializer, Flags);  }  /// \brief Map a uniqued MDNode.  ///  /// Uniqued nodes may not need to be recreated (they may map to themselves).  
static Metadata *mapUniquedNode(const MDNode *Node, -                                SmallVectorImpl<MDNode *> &Cycles, +                                SmallVectorImpl<MDNode *> &DistinctWorklist,                                  ValueToValueMapTy &VM, RemapFlags Flags,                                  ValueMapTypeRemapper *TypeMapper,                                  ValueMaterializer *Materializer) { -  assert(Node->isUniqued() && "Expected uniqued node"); +  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) && +         "Expected uniqued node"); -  // Create a temporary node upfront in case we have a metadata cycle. +  // Create a temporary node and map it upfront in case we have a uniquing +  // cycle.  If necessary, this mapping will get updated by RAUW logic before +  // returning.    auto ClonedMD = Node->clone(); -  if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer)) -    // No operands changed, so use the identity mapping. -    return mapToSelf(VM, Node); +  mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags); +  if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper, +                     Materializer)) { +    // No operands changed, so use the original. +    ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node)); +    // Even though replaceAllUsesWith would have replaced the value map +    // entry, we need to explictly map with the final non-temporary node +    // to replace any temporary metadata via the callback. +    return mapToSelf(VM, Node, Materializer, Flags); +  } -  // At least one operand has changed, so uniquify the cloned node. +  // Uniquify the cloned node. Explicitly map it with the final non-temporary +  // node so that replacement of temporary metadata via the callback occurs.    return mapToMetadata(VM, Node, -                       MDNode::replaceWithUniqued(std::move(ClonedMD))); +                       MDNode::replaceWithUniqued(std::move(ClonedMD)), +                       Materializer, Flags);  }  static Metadata *MapMetadataImpl(const Metadata *MD, -                                 SmallVectorImpl<MDNode *> &Cycles, +                                 SmallVectorImpl<MDNode *> &DistinctWorklist,                                   ValueToValueMapTy &VM, RemapFlags Flags,                                   ValueMapTypeRemapper *TypeMapper,                                   ValueMaterializer *Materializer) { @@ -269,26 +327,28 @@ static Metadata *MapMetadataImpl(const Metadata *MD,      return NewMD;    if (isa<MDString>(MD)) -    return mapToSelf(VM, MD); +    return mapToSelf(VM, MD, Materializer, Flags);    if (isa<ConstantAsMetadata>(MD))      if ((Flags & RF_NoModuleLevelChanges)) -      return mapToSelf(VM, MD); +      return mapToSelf(VM, MD, Materializer, Flags);    if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {      Value *MappedV =          MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);      if (VMD->getValue() == MappedV ||          (!MappedV && (Flags & RF_IgnoreMissingEntries))) -      return mapToSelf(VM, MD); +      return mapToSelf(VM, MD, Materializer, Flags);      // FIXME: This assert crashes during bootstrap, but I think it should be      // correct.  For now, just match behaviour from before the metadata/value      // split.      
// -    //    assert(MappedV && "Referenced metadata not in value map!"); +    //    assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) && +    //           "Referenced metadata not in value map!");      if (MappedV) -      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV)); +      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer, +                           Flags);      return nullptr;    } @@ -299,37 +359,54 @@ static Metadata *MapMetadataImpl(const Metadata *MD,    // If this is a module-level metadata and we know that nothing at the    // module level is changing, then use an identity mapping.    if (Flags & RF_NoModuleLevelChanges) -    return mapToSelf(VM, MD); +    return mapToSelf(VM, MD, Materializer, Flags);    // Require resolved nodes whenever metadata might be remapped. -  assert(Node->isResolved() && "Unexpected unresolved node"); +  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) && +         "Unexpected unresolved node"); + +  if (Materializer && Node->isTemporary()) { +    assert(Flags & RF_HaveUnmaterializedMetadata); +    Metadata *TempMD = +        Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD)); +    // If the above callback returned an existing temporary node, use it +    // instead of the current temporary node. This happens when earlier +    // function importing passes already created and saved a temporary +    // metadata node for the same value id. +    if (TempMD) { +      mapToMetadata(VM, MD, TempMD, Materializer, Flags); +      return TempMD; +    } +  }    if (Node->isDistinct()) -    return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); +    return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper, +                           Materializer); -  return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); +  return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper, +                        Materializer);  }  Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,                              RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,                              ValueMaterializer *Materializer) { -  SmallVector<MDNode *, 8> Cycles; -  Metadata *NewMD = -      MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer); - -  // Resolve cycles underneath MD. -  if (NewMD && NewMD != MD) { -    if (auto *N = dyn_cast<MDNode>(NewMD)) -      if (!N->isResolved()) -        N->resolveCycles(); - -    for (MDNode *N : Cycles) -      if (!N->isResolved()) -        N->resolveCycles(); -  } else { -    // Shouldn't get unresolved cycles if nothing was remapped. -    assert(Cycles.empty() && "Expected no unresolved cycles"); -  } +  SmallVector<MDNode *, 8> DistinctWorklist; +  Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper, +                                    Materializer); + +  // When there are no module-level changes, it's possible that the metadata +  // graph has temporaries.  Skip the logic to resolve cycles, since it's +  // unnecessary (and invalid) in that case. +  if (Flags & RF_NoModuleLevelChanges) +    return NewMD; + +  // Resolve cycles involving the entry metadata. +  resolveCycles(NewMD, !(Flags & RF_HaveUnmaterializedMetadata)); + +  // Remap the operands of distinct MDNodes. 
+  while (!DistinctWorklist.empty()) +    remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags, +                  TypeMapper, Materializer);    return NewMD;  } @@ -374,14 +451,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,    // Remap attached metadata.    SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;    I->getAllMetadata(MDs); -  for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator -           MI = MDs.begin(), -           ME = MDs.end(); -       MI != ME; ++MI) { -    MDNode *Old = MI->second; +  for (const auto &MI : MDs) { +    MDNode *Old = MI.second;      MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer);      if (New != Old) -      I->setMetadata(MI->first, New); +      I->setMetadata(MI.first, New);    }    if (!TypeMapper) diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 215d6f9a1eb6..8844d574a79d 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -25,8 +25,11 @@  #include "llvm/ADT/StringExtras.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Constants.h" @@ -204,9 +207,10 @@ namespace {      BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)        : BasicBlockPass(ID), Config(C) { -      AA = &P->getAnalysis<AliasAnalysis>(); +      AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();        DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -      SE = &P->getAnalysis<ScalarEvolution>(); +      SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +      TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();        TTI = IgnoreTargetInfo                  ? nullptr                  : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -221,6 +225,7 @@ namespace {      AliasAnalysis *AA;      DominatorTree *DT;      ScalarEvolution *SE; +    const TargetLibraryInfo *TLI;      const TargetTransformInfo *TTI;      // FIXME: const correct? @@ -437,9 +442,10 @@ namespace {      bool runOnBasicBlock(BasicBlock &BB) override {        // OptimizeNone check deferred to vectorizeBB(). -      AA = &getAnalysis<AliasAnalysis>(); +      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();        DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -      SE = &getAnalysis<ScalarEvolution>(); +      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); +      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();        TTI = IgnoreTargetInfo                  ? 
nullptr                  : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( @@ -450,13 +456,15 @@ namespace {      void getAnalysisUsage(AnalysisUsage &AU) const override {        BasicBlockPass::getAnalysisUsage(AU); -      AU.addRequired<AliasAnalysis>(); +      AU.addRequired<AAResultsWrapperPass>();        AU.addRequired<DominatorTreeWrapperPass>(); -      AU.addRequired<ScalarEvolution>(); +      AU.addRequired<ScalarEvolutionWrapperPass>(); +      AU.addRequired<TargetLibraryInfoWrapperPass>();        AU.addRequired<TargetTransformInfoWrapperPass>(); -      AU.addPreserved<AliasAnalysis>();        AU.addPreserved<DominatorTreeWrapperPass>(); -      AU.addPreserved<ScalarEvolution>(); +      AU.addPreserved<GlobalsAAWrapperPass>(); +      AU.addPreserved<ScalarEvolutionWrapperPass>(); +      AU.addPreserved<SCEVAAWrapperPass>();        AU.setPreservesCFG();      } @@ -842,7 +850,7 @@ namespace {      // It is important to cleanup here so that future iterations of this      // function have less work to do. -    (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo()); +    (void)SimplifyInstructionsInBlock(&BB, TLI);      return true;    } @@ -1239,20 +1247,23 @@ namespace {        if (I == Start) IAfterStart = true;        bool IsSimpleLoadStore; -      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; +      if (!isInstVectorizable(&*I, IsSimpleLoadStore)) +        continue;        // Look for an instruction with which to pair instruction *I...        DenseSet<Value *> Users;        AliasSetTracker WriteSet(*AA); -      if (I->mayWriteToMemory()) WriteSet.add(I); +      if (I->mayWriteToMemory()) +        WriteSet.add(&*I);        bool JAfterStart = IAfterStart;        BasicBlock::iterator J = std::next(I);        for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { -        if (J == Start) JAfterStart = true; +        if (&*J == Start) +          JAfterStart = true;          // Determine if J uses I, if so, exit the loop. -        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); +        bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);          if (Config.FastDep) {            // Note: For this heuristic to be effective, independent operations            // must tend to be intermixed. This is likely to be true from some @@ -1269,25 +1280,26 @@ namespace {          // J does not use I, and comes before the first use of I, so it can be          // merged with I if the instructions are compatible.          int CostSavings, FixedOrder; -        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, -            CostSavings, FixedOrder)) continue; +        if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, +                                CostSavings, FixedOrder)) +          continue;          // J is a candidate for merging with I.          
if (PairableInsts.empty() || -             PairableInsts[PairableInsts.size()-1] != I) { -          PairableInsts.push_back(I); +            PairableInsts[PairableInsts.size() - 1] != &*I) { +          PairableInsts.push_back(&*I);          } -        CandidatePairs[I].push_back(J); +        CandidatePairs[&*I].push_back(&*J);          ++TotalPairs;          if (TTI) -          CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), -                                                            CostSavings)); +          CandidatePairCostSavings.insert( +              ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));          if (FixedOrder == 1) -          FixedOrderPairs.insert(ValuePair(I, J)); +          FixedOrderPairs.insert(ValuePair(&*I, &*J));          else if (FixedOrder == -1) -          FixedOrderPairs.insert(ValuePair(J, I)); +          FixedOrderPairs.insert(ValuePair(&*J, &*I));          // The next call to this function must start after the last instruction          // selected during this invocation. @@ -1468,14 +1480,16 @@ namespace {      BasicBlock::iterator E = BB.end(), EL =        BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));      for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { -      if (IsInPair.find(I) == IsInPair.end()) continue; +      if (IsInPair.find(&*I) == IsInPair.end()) +        continue;        DenseSet<Value *> Users;        AliasSetTracker WriteSet(*AA); -      if (I->mayWriteToMemory()) WriteSet.add(I); +      if (I->mayWriteToMemory()) +        WriteSet.add(&*I);        for (BasicBlock::iterator J = std::next(I); J != E; ++J) { -        (void) trackUsesOfI(Users, WriteSet, I, J); +        (void)trackUsesOfI(Users, WriteSet, &*I, &*J);          if (J == EL)            break; @@ -1484,7 +1498,7 @@ namespace {        for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();             U != E; ++U) {          if (IsInPair.find(*U) == IsInPair.end()) continue; -        PairableInstUsers.insert(ValuePair(I, *U)); +        PairableInstUsers.insert(ValuePair(&*I, *U));        }        if (I == EL) @@ -2806,55 +2820,51 @@ namespace {                       Instruction *J, Instruction *K,                       Instruction *&InsertionPt,                       Instruction *&K1, Instruction *&K2) { -    if (isa<StoreInst>(I)) { -      AA->replaceWithNewValue(I, K); -      AA->replaceWithNewValue(J, K); -    } else { -      Type *IType = I->getType(); -      Type *JType = J->getType(); +    if (isa<StoreInst>(I)) +      return; -      VectorType *VType = getVecTypeForPair(IType, JType); -      unsigned numElem = VType->getNumElements(); +    Type *IType = I->getType(); +    Type *JType = J->getType(); -      unsigned numElemI = getNumScalarElements(IType); -      unsigned numElemJ = getNumScalarElements(JType); +    VectorType *VType = getVecTypeForPair(IType, JType); +    unsigned numElem = VType->getNumElements(); -      if (IType->isVectorTy()) { -        std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); -        for (unsigned v = 0; v < numElemI; ++v) { -          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); -          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); -        } +    unsigned numElemI = getNumScalarElements(IType); +    unsigned numElemJ = getNumScalarElements(JType); -        K1 = new ShuffleVectorInst(K, UndefValue::get(VType), -                                   ConstantVector::get( Mask1), -                                   
getReplacementName(K, false, 1)); -      } else { -        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); -        K1 = ExtractElementInst::Create(K, CV0, -                                          getReplacementName(K, false, 1)); +    if (IType->isVectorTy()) { +      std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); +      for (unsigned v = 0; v < numElemI; ++v) { +        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); +        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);        } -      if (JType->isVectorTy()) { -        std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); -        for (unsigned v = 0; v < numElemJ; ++v) { -          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); -          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); -        } +      K1 = new ShuffleVectorInst(K, UndefValue::get(VType), +                                 ConstantVector::get(Mask1), +                                 getReplacementName(K, false, 1)); +    } else { +      Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); +      K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); +    } -        K2 = new ShuffleVectorInst(K, UndefValue::get(VType), -                                   ConstantVector::get( Mask2), -                                   getReplacementName(K, false, 2)); -      } else { -        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); -        K2 = ExtractElementInst::Create(K, CV1, -                                          getReplacementName(K, false, 2)); +    if (JType->isVectorTy()) { +      std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); +      for (unsigned v = 0; v < numElemJ; ++v) { +        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); +        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);        } -      K1->insertAfter(K); -      K2->insertAfter(K1); -      InsertionPt = K2; +      K2 = new ShuffleVectorInst(K, UndefValue::get(VType), +                                 ConstantVector::get(Mask2), +                                 getReplacementName(K, false, 2)); +    } else { +      Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); +      K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));      } + +    K1->insertAfter(K); +    K2->insertAfter(K1); +    InsertionPt = K2;    }    // Move all uses of the function I (including pairing-induced uses) after J. 
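The shuffle masks rebuilt above are plain index arithmetic: the fused value K carries both members' lanes back to back, the mask {0, ..., numElemI-1} pulls out the leading lanes, and the mask {numElemI, ..., numElemI+numElemJ-1} pulls out the trailing lanes. A small standalone sketch of that lane selection on ordinary arrays (the element counts and values are made-up example data):

#include <cstddef>
#include <iostream>
#include <vector>

// Select the lanes named by Mask out of Fused, the way a shufflevector with
// an undef second operand would.
static std::vector<int> applyMask(const std::vector<int> &Fused,
                                  const std::vector<size_t> &Mask) {
  std::vector<int> Out;
  for (size_t Idx : Mask)
    Out.push_back(Fused[Idx]);
  return Out;
}

int main() {
  const size_t NumElemI = 2, NumElemJ = 2;
  std::vector<int> Fused = {10, 11, 20, 21};     // leading lanes, then trailing lanes

  std::vector<size_t> Mask1, Mask2;
  for (size_t V = 0; V < NumElemI; ++V)
    Mask1.push_back(V);                          // lanes [0, numElemI)
  for (size_t V = 0; V < NumElemJ; ++V)
    Mask2.push_back(NumElemI + V);               // lanes [numElemI, numElemI+numElemJ)

  for (int X : applyMask(Fused, Mask1)) std::cout << X << ' ';   // 10 11
  std::cout << '\n';
  for (int X : applyMask(Fused, Mask2)) std::cout << X << ' ';   // 20 21
  std::cout << '\n';
}

When the original operand was a scalar rather than a vector, the corresponding half collapses to a single extractelement at index 0 or numElem - 1, which is the non-vector branch in the hunk above.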
@@ -2869,7 +2879,7 @@ namespace {      if (I->mayWriteToMemory()) WriteSet.add(I);      for (; cast<Instruction>(L) != J; ++L) -      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); +      (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);      assert(cast<Instruction>(L) == J &&        "Tracking has not proceeded far enough to check for dependencies"); @@ -2891,9 +2901,9 @@ namespace {      if (I->mayWriteToMemory()) WriteSet.add(I);      for (; cast<Instruction>(L) != J;) { -      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { +      if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {          // Move this instruction -        Instruction *InstToMove = L; ++L; +        Instruction *InstToMove = &*L++;          DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<                          " to after " << *InsertionPt << "\n"); @@ -2924,11 +2934,11 @@ namespace {      // Note: We cannot end the loop when we reach J because J could be moved      // farther down the use chain by another instruction pairing. Also, J      // could be before I if this is an inverted input. -    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { -      if (trackUsesOfI(Users, WriteSet, I, L)) { +    for (BasicBlock::iterator E = BB.end(); L != E; ++L) { +      if (trackUsesOfI(Users, WriteSet, I, &*L)) {          if (L->mayReadFromMemory()) { -          LoadMoveSet[L].push_back(I); -          LoadMoveSetPairs.insert(ValuePair(L, I)); +          LoadMoveSet[&*L].push_back(I); +          LoadMoveSetPairs.insert(ValuePair(&*L, I));          }        }      } @@ -2991,7 +3001,7 @@ namespace {      DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");      for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { -      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); +      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);        if (P == ChosenPairs.end()) {          ++PI;          continue; @@ -3116,12 +3126,9 @@ namespace {        } else if (!isa<StoreInst>(K))          K->mutateType(getVecTypeForPair(L->getType(), H->getType())); -      unsigned KnownIDs[] = { -        LLVMContext::MD_tbaa, -        LLVMContext::MD_alias_scope, -        LLVMContext::MD_noalias, -        LLVMContext::MD_fpmath -      }; +      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, +                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath, +                             LLVMContext::MD_invariant_group};        combineMetadata(K, H, KnownIDs);        K->intersectOptionalDataWith(H); @@ -3145,8 +3152,6 @@ namespace {        if (!isa<StoreInst>(I)) {          L->replaceAllUsesWith(K1);          H->replaceAllUsesWith(K2); -        AA->replaceWithNewValue(L, K1); -        AA->replaceWithNewValue(H, K2);        }        // Instructions that may read from memory may be in the load move set. 
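When a chosen pair is fused into a single instruction K above, combineMetadata(K, H, KnownIDs) only lets through the metadata kinds listed in KnownIDs (tbaa, alias.scope, noalias, fpmath, and now invariant.group); anything else is dropped because it may no longer hold for the fused instruction. The sketch below models only that whitelist-and-drop behavior; the real combineMetadata applies kind-specific merge rules rather than demanding exact equality, and the kind names are plain strings here purely for readability:

#include <iostream>
#include <map>
#include <set>
#include <string>

// Toy model: metadata is a map from kind name to an opaque payload string.
using MDMap = std::map<std::string, std::string>;

// Keep a kind on the fused instruction only if it is whitelisted and both
// originals agree on it; everything else is dropped conservatively.
static MDMap combineMetadataSketch(const MDMap &A, const MDMap &B,
                                   const std::set<std::string> &KnownKinds) {
  MDMap Result;
  for (const auto &KV : A) {
    if (!KnownKinds.count(KV.first))
      continue;
    auto It = B.find(KV.first);
    if (It != B.end() && It->second == KV.second)
      Result[KV.first] = KV.second;
  }
  return Result;
}

int main() {
  std::set<std::string> Known = {"tbaa", "alias.scope", "noalias", "fpmath",
                                 "invariant.group"};
  MDMap I = {{"tbaa", "!1"}, {"fpmath", "2.5"}, {"dbg", "!7"}};
  MDMap J = {{"tbaa", "!1"}, {"fpmath", "0.5"}};
  for (const auto &KV : combineMetadataSketch(I, J, Known))
    std::cout << KV.first << " -> " << KV.second << '\n';   // only tbaa survives
}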
@@ -3197,10 +3202,14 @@ namespace {  char BBVectorize::ID = 0;  static const char bb_vectorize_name[] = "Basic-Block Vectorization";  INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)  INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)  BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 69ca2688c810..a627dd665179 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,7 +48,6 @@  #include "llvm/Transforms/Vectorize.h"  #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h"  #include "llvm/ADT/Hashing.h"  #include "llvm/ADT/MapVector.h"  #include "llvm/ADT/SetVector.h" @@ -58,10 +57,13 @@  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/StringExtras.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/AliasSetTracker.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/BlockFrequencyInfo.h"  #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/LoopAccessAnalysis.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopIterator.h" @@ -99,6 +101,7 @@  #include "llvm/Analysis/VectorUtils.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  #include <algorithm> +#include <functional>  #include <map>  #include <tuple> @@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),                                        "trip count that is smaller than this "                                        "value.")); +static cl::opt<bool> MaximizeBandwidth( +    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, +    cl::desc("Maximize bandwidth when selecting vectorization factor which " +             "will be determined by the smallest type in loop.")); +  /// This enables versioning on the strides of symbolically striding memory  /// accesses in code like the following.  ///   for (i = 0; i < N; ++i) @@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),  ///      ...  
static cl::opt<bool> EnableMemAccessVersioning(      "enable-mem-access-versioning", cl::init(true), cl::Hidden, -    cl::desc("Enable symblic stride memory access versioning")); +    cl::desc("Enable symbolic stride memory access versioning"));  static cl::opt<bool> EnableInterleavedMemAccesses(      "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, @@ -214,12 +222,27 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(      cl::desc("The maximum interleave count to use when interleaving a scalar "               "reduction in a nested loop.")); +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( +    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, +    cl::desc("The maximum allowed number of runtime memory checks with a " +             "vectorize(enable) pragma.")); + +static cl::opt<unsigned> VectorizeSCEVCheckThreshold( +    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, +    cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( +    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, +    cl::desc("The maximum number of SCEV checks allowed with a " +             "vectorize(enable) pragma")); +  namespace {  // Forward declarations. +class LoopVectorizeHints;  class LoopVectorizationLegality;  class LoopVectorizationCostModel; -class LoopVectorizeHints; +class LoopVectorizationRequirements;  /// \brief This modifies LoopAccessReport to initialize message with  /// loop-vectorizer-specific part. @@ -245,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {    return VectorType::get(Scalar, VF);  } +/// A helper function that returns GEP instruction and knows to skip a +/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination +/// pointee types of the 'bitcast' have the same size. +/// For example: +///   bitcast double** %var to i64* - can be skipped +///   bitcast double** %var to i8*  - can not +static GetElementPtrInst *getGEPInstruction(Value *Ptr) { + +  if (isa<GetElementPtrInst>(Ptr)) +    return cast<GetElementPtrInst>(Ptr); + +  if (isa<BitCastInst>(Ptr) && +      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { +    Type *BitcastTy = Ptr->getType(); +    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); +    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) +      return nullptr; +    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); +    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); +    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); +    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) +      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); +  } +  return nullptr; +} +  /// InnerLoopVectorizer vectorizes loops which contain only one basic  /// block to a specified vectorization factor (VF).  /// This class performs the widening of scalars into vectors, or multiple @@ -261,25 +310,30 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) {  /// and reduction variables that were found to a given vectorization factor.  
class InnerLoopVectorizer {  public: -  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, -                      DominatorTree *DT, const TargetLibraryInfo *TLI, +  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, +                      LoopInfo *LI, DominatorTree *DT, +                      const TargetLibraryInfo *TLI,                        const TargetTransformInfo *TTI, unsigned VecWidth,                        unsigned UnrollFactor) -      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), -        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), +      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), +        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),          Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), -        Legal(nullptr), AddedSafetyChecks(false) {} +        TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), +        AddedSafetyChecks(false) {}    // Perform the actual loop widening (vectorization). -  void vectorize(LoopVectorizationLegality *L) { +  // MinimumBitWidths maps scalar integer values to the smallest bitwidth they +  // can be validly truncated to. The cost model has assumed this truncation +  // will happen when vectorizing. +  void vectorize(LoopVectorizationLegality *L, +                 MapVector<Instruction*,uint64_t> MinimumBitWidths) { +    MinBWs = MinimumBitWidths;      Legal = L;      // Create a new empty loop. Unlink the old loop and connect the new one.      createEmptyLoop();      // Widen each instruction in the old loop to a new one in the new loop.      // Use the Legality module to find the induction and reduction variables.      vectorizeLoop(); -    // Register the new loop and update the analysis passes. -    updateAnalysis();    }    // Return true if any runtime check is added. @@ -302,14 +356,11 @@ protected:    typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,                     VectorParts> EdgeMaskCache; -  /// \brief Add checks for strides that were assumed to be 1. -  /// -  /// Returns the last check instruction and the first check instruction in the -  /// pair as (first, last). -  std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); -    /// Create an empty loop, based on the loop ranges of the old loop.    void createEmptyLoop(); +  /// Create a new induction variable inside L. +  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, +                                   Value *Step, Instruction *DL);    /// Copy and widen the instructions from the old loop.    virtual void vectorizeLoop(); @@ -319,6 +370,9 @@ protected:    /// See PR14725.    void fixLCSSAPHIs(); +  /// Shrinks vector element sizes based on information in "MinBWs". +  void truncateToMinimalBitwidths(); +      /// A helper function that computes the predicate of the block BB, assuming    /// that the header block of the loop is set to True. It returns the *entry*    /// mask for the block BB. @@ -329,7 +383,7 @@ protected:    /// A helper function to vectorize a single BB within the innermost loop.    void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); - +      /// Vectorize a single PHINode in a block. This method handles the induction    /// variable canonicalization. It supports both VF = 1 for unrolled loops and    /// arbitrary length vectors. @@ -374,6 +428,23 @@ protected:    /// Generate a shuffle sequence that will reverse the vector Vec.    
virtual Value *reverseVector(Value *Vec); +  /// Returns (and creates if needed) the original loop trip count. +  Value *getOrCreateTripCount(Loop *NewLoop); + +  /// Returns (and creates if needed) the trip count of the widened loop. +  Value *getOrCreateVectorTripCount(Loop *NewLoop); + +  /// Emit a bypass check to see if the trip count would overflow, or we +  /// wouldn't have enough iterations to execute one vector loop. +  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); +  /// Emit a bypass check to see if the vector trip count is nonzero. +  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); +  /// Emit a bypass check to see if all of the SCEV assumptions we've +  /// had to make are correct. +  void emitSCEVChecks(Loop *L, BasicBlock *Bypass); +  /// Emit bypass checks to check any memory assumptions we may have made. +  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); +    /// This is a helper class that holds the vectorizer state. It maps scalar    /// instructions to vector instructions. When the code is 'unrolled' then    /// then a single scalar value is mapped to multiple vector parts. The parts @@ -416,8 +487,10 @@ protected:    /// The original loop.    Loop *OrigLoop; -  /// Scev analysis to use. -  ScalarEvolution *SE; +  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies +  /// dynamic knowledge to simplify SCEV expressions and converts them to a +  /// more usable form. +  PredicatedScalarEvolution &PSE;    /// Loop Info.    LoopInfo *LI;    /// Dominator Tree. @@ -462,12 +535,21 @@ protected:    PHINode *Induction;    /// The induction variable of the old basic block.    PHINode *OldInduction; -  /// Holds the extended (to the widest induction type) start index. -  Value *ExtendedIdx;    /// Maps scalars to widened vectors.    ValueMap WidenMap; +  /// Store instructions that should be predicated, as a pair +  ///   <StoreInst, Predicate> +  SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;    EdgeMaskCache MaskCache; - +  /// Trip count of the original loop. +  Value *TripCount; +  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) +  Value *VectorTripCount; + +  /// Map of scalar integer values to the smallest bitwidth they can be legally +  /// represented as. The vector equivalents of these values should be truncated +  /// to this type. +  MapVector<Instruction*,uint64_t> MinBWs;    LoopVectorizationLegality *Legal;    // Record whether runtime check is added. 
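The TripCount/VectorTripCount bookkeeping above amounts to simple integer arithmetic: the vector body runs VectorTripCount = TripCount - TripCount % (VF * UF) iterations and the scalar epilogue picks up the remainder, and the "not enough iterations to execute one vector loop" half of emitMinimumIterationCountCheck corresponds roughly to bypassing the vector body when TripCount < VF * UF. A small sketch of that arithmetic, with made-up VF, UF, and trip counts:

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t VF = 4, UF = 2;        // vectorization factor, unroll factor
  const uint64_t Step = VF * UF;        // iterations consumed per vector-loop trip

  for (uint64_t TripCount : {3ull, 8ull, 21ull}) {
    if (TripCount < Step) {             // minimum-iteration bypass check
      std::cout << "n=" << TripCount << ": run the scalar loop only\n";
      continue;
    }
    uint64_t VectorTripCount = TripCount - TripCount % Step;
    std::cout << "n=" << TripCount << ": vector body covers " << VectorTripCount
              << " iterations, scalar epilogue covers "
              << (TripCount - VectorTripCount) << '\n';
  }
}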
@@ -476,10 +558,11 @@ protected:  class InnerLoopUnroller : public InnerLoopVectorizer {  public: -  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, -                    DominatorTree *DT, const TargetLibraryInfo *TLI, +  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, +                    LoopInfo *LI, DominatorTree *DT, +                    const TargetLibraryInfo *TLI,                      const TargetTransformInfo *TTI, unsigned UnrollFactor) -      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} +      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}  private:    void scalarizeInstruction(Instruction *Instr, @@ -551,7 +634,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {      if (Kind != LLVMContext::MD_tbaa &&          Kind != LLVMContext::MD_alias_scope &&          Kind != LLVMContext::MD_noalias && -        Kind != LLVMContext::MD_fpmath) +        Kind != LLVMContext::MD_fpmath && +        Kind != LLVMContext::MD_nontemporal)        continue;      To->setMetadata(Kind, M.second); @@ -559,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {  }  /// \brief Propagate known metadata from one instruction to a vector of others. -static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl<Value *> &To, +                              const Instruction *From) {    for (Value *V : To)      if (Instruction *I = dyn_cast<Instruction>(V))        propagateMetadata(I, From); @@ -699,8 +784,9 @@ private:  /// between the member and the group in a map.  class InterleavedAccessInfo {  public: -  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) -      : SE(SE), TheLoop(L), DT(DT) {} +  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, +                        DominatorTree *DT) +      : PSE(PSE), TheLoop(L), DT(DT) {}    ~InterleavedAccessInfo() {      SmallSet<InterleaveGroup *, 4> DelSet; @@ -730,7 +816,11 @@ public:    }  private: -  ScalarEvolution *SE; +  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. +  /// Simplifies SCEV expressions in the context of existing SCEV assumptions. +  /// The interleaved access analysis can also add new predicates (for example +  /// by versioning strides of pointers). +  PredicatedScalarEvolution &PSE;    Loop *TheLoop;    DominatorTree *DT; @@ -778,6 +868,304 @@ private:        const ValueToValueMap &Strides);  }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. +class LoopVectorizeHints { +  enum HintKind { +    HK_WIDTH, +    HK_UNROLL, +    HK_FORCE +  }; + +  /// Hint - associates name and validation with the hint value. +  struct Hint { +    const char * Name; +    unsigned Value; // This may have to change for non-numeric values. 
+    HintKind Kind; + +    Hint(const char * Name, unsigned Value, HintKind Kind) +      : Name(Name), Value(Value), Kind(Kind) { } + +    bool validate(unsigned Val) { +      switch (Kind) { +      case HK_WIDTH: +        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; +      case HK_UNROLL: +        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; +      case HK_FORCE: +        return (Val <= 1); +      } +      return false; +    } +  }; + +  /// Vectorization width. +  Hint Width; +  /// Vectorization interleave factor. +  Hint Interleave; +  /// Vectorization forced +  Hint Force; + +  /// Return the loop metadata prefix. +  static StringRef Prefix() { return "llvm.loop."; } + +public: +  enum ForceKind { +    FK_Undefined = -1, ///< Not selected. +    FK_Disabled = 0,   ///< Forcing disabled. +    FK_Enabled = 1,    ///< Forcing enabled. +  }; + +  LoopVectorizeHints(const Loop *L, bool DisableInterleaving) +      : Width("vectorize.width", VectorizerParams::VectorizationFactor, +              HK_WIDTH), +        Interleave("interleave.count", DisableInterleaving, HK_UNROLL), +        Force("vectorize.enable", FK_Undefined, HK_FORCE), +        TheLoop(L) { +    // Populate values with existing loop metadata. +    getHintsFromMetadata(); + +    // force-vector-interleave overrides DisableInterleaving. +    if (VectorizerParams::isInterleaveForced()) +      Interleave.Value = VectorizerParams::VectorizationInterleave; + +    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() +          << "LV: Interleaving disabled by the pass manager\n"); +  } + +  /// Mark the loop L as already vectorized by setting the width to 1. +  void setAlreadyVectorized() { +    Width.Value = Interleave.Value = 1; +    Hint Hints[] = {Width, Interleave}; +    writeHintsToMetadata(Hints); +  } + +  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { +    if (getForce() == LoopVectorizeHints::FK_Disabled) { +      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); +      emitOptimizationRemarkAnalysis(F->getContext(), +                                     vectorizeAnalysisPassName(), *F, +                                     L->getStartLoc(), emitRemark()); +      return false; +    } + +    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { +      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); +      emitOptimizationRemarkAnalysis(F->getContext(), +                                     vectorizeAnalysisPassName(), *F, +                                     L->getStartLoc(), emitRemark()); +      return false; +    } + +    if (getWidth() == 1 && getInterleave() == 1) { +      // FIXME: Add a separate metadata to indicate when the loop has already +      // been vectorized instead of setting width and count to 1. +      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); +      // FIXME: Add interleave.disable metadata. This will allow +      // vectorize.disable to be used without disabling the pass and errors +      // to differentiate between disabled vectorization and a width of 1. 
+      emitOptimizationRemarkAnalysis( +          F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(), +          "loop not vectorized: vectorization and interleaving are explicitly " +          "disabled, or vectorize width and interleave count are both set to " +          "1"); +      return false; +    } + +    return true; +  } + +  /// Dumps all the hint information. +  std::string emitRemark() const { +    VectorizationReport R; +    if (Force.Value == LoopVectorizeHints::FK_Disabled) +      R << "vectorization is explicitly disabled"; +    else { +      R << "use -Rpass-analysis=loop-vectorize for more info"; +      if (Force.Value == LoopVectorizeHints::FK_Enabled) { +        R << " (Force=true"; +        if (Width.Value != 0) +          R << ", Vector Width=" << Width.Value; +        if (Interleave.Value != 0) +          R << ", Interleave Count=" << Interleave.Value; +        R << ")"; +      } +    } + +    return R.str(); +  } + +  unsigned getWidth() const { return Width.Value; } +  unsigned getInterleave() const { return Interleave.Value; } +  enum ForceKind getForce() const { return (ForceKind)Force.Value; } +  const char *vectorizeAnalysisPassName() const { +    // If hints are provided that don't disable vectorization use the +    // AlwaysPrint pass name to force the frontend to print the diagnostic. +    if (getWidth() == 1) +      return LV_NAME; +    if (getForce() == LoopVectorizeHints::FK_Disabled) +      return LV_NAME; +    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) +      return LV_NAME; +    return DiagnosticInfo::AlwaysPrint; +  } + +  bool allowReordering() const { +    // When enabling loop hints are provided we allow the vectorizer to change +    // the order of operations that is given by the scalar loop. This is not +    // enabled by default because can be unsafe or inefficient. For example, +    // reordering floating-point operations will change the way round-off +    // error accumulates in the loop. +    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; +  } + +private: +  /// Find hints specified in the loop metadata and update local values. +  void getHintsFromMetadata() { +    MDNode *LoopID = TheLoop->getLoopID(); +    if (!LoopID) +      return; + +    // First operand should refer to the loop id itself. +    assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); +    assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + +    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { +      const MDString *S = nullptr; +      SmallVector<Metadata *, 4> Args; + +      // The expected hint is either a MDString or a MDNode with the first +      // operand a MDString. +      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { +        if (!MD || MD->getNumOperands() == 0) +          continue; +        S = dyn_cast<MDString>(MD->getOperand(0)); +        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) +          Args.push_back(MD->getOperand(i)); +      } else { +        S = dyn_cast<MDString>(LoopID->getOperand(i)); +        assert(Args.size() == 0 && "too many arguments for MDString"); +      } + +      if (!S) +        continue; + +      // Check if the hint starts with the loop metadata prefix. +      StringRef Name = S->getString(); +      if (Args.size() == 1) +        setHint(Name, Args[0]); +    } +  } + +  /// Checks string hint with one operand and set value if valid. 
+  void setHint(StringRef Name, Metadata *Arg) { +    if (!Name.startswith(Prefix())) +      return; +    Name = Name.substr(Prefix().size(), StringRef::npos); + +    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); +    if (!C) return; +    unsigned Val = C->getZExtValue(); + +    Hint *Hints[] = {&Width, &Interleave, &Force}; +    for (auto H : Hints) { +      if (Name == H->Name) { +        if (H->validate(Val)) +          H->Value = Val; +        else +          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); +        break; +      } +    } +  } + +  /// Create a new hint from name / value pair. +  MDNode *createHintMetadata(StringRef Name, unsigned V) const { +    LLVMContext &Context = TheLoop->getHeader()->getContext(); +    Metadata *MDs[] = {MDString::get(Context, Name), +                       ConstantAsMetadata::get( +                           ConstantInt::get(Type::getInt32Ty(Context), V))}; +    return MDNode::get(Context, MDs); +  } + +  /// Matches metadata with hint name. +  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { +    MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); +    if (!Name) +      return false; + +    for (auto H : HintTypes) +      if (Name->getString().endswith(H.Name)) +        return true; +    return false; +  } + +  /// Sets current hints into loop metadata, keeping other values intact. +  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { +    if (HintTypes.size() == 0) +      return; + +    // Reserve the first element to LoopID (see below). +    SmallVector<Metadata *, 4> MDs(1); +    // If the loop already has metadata, then ignore the existing operands. +    MDNode *LoopID = TheLoop->getLoopID(); +    if (LoopID) { +      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { +        MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); +        // If node in update list, ignore old value. +        if (!matchesHintMetadataName(Node, HintTypes)) +          MDs.push_back(Node); +      } +    } + +    // Now, add the missing hints. +    for (auto H : HintTypes) +      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + +    // Replace current metadata node with new one. +    LLVMContext &Context = TheLoop->getHeader()->getContext(); +    MDNode *NewLoopID = MDNode::get(Context, MDs); +    // Set operand 0 to refer to the loop id itself. +    NewLoopID->replaceOperandWith(0, NewLoopID); + +    TheLoop->setLoopID(NewLoopID); +  } + +  /// The loop these hints belong to. 
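
setHint and writeHintsToMetadata above agree on a naming scheme: every hint is stored under the "llvm.loop." prefix, so "llvm.loop.vectorize.width", "llvm.loop.interleave.count" and "llvm.loop.vectorize.enable" are the strings that actually appear in the loop metadata. A small standalone sketch of the name handling, using std::string rather than StringRef:

    // Sketch only, not code from this commit.
    #include <string>

    static bool isKnownLoopHintName(std::string Name) {
      const std::string Prefix = "llvm.loop.";
      if (Name.compare(0, Prefix.size(), Prefix) != 0)
        return false;                    // not a loop hint at all
      Name = Name.substr(Prefix.size());
      return Name == "vectorize.width" || Name == "interleave.count" ||
             Name == "vectorize.enable";
    }
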
+  const Loop *TheLoop; +}; + +static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, +                             const LoopVectorizeHints &Hints, +                             const LoopAccessReport &Message) { +  const char *Name = Hints.vectorizeAnalysisPassName(); +  LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); +} + +static void emitMissedWarning(Function *F, Loop *L, +                              const LoopVectorizeHints &LH) { +  emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), +                               LH.emitRemark()); + +  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { +    if (LH.getWidth() != 1) +      emitLoopVectorizeWarning( +          F->getContext(), *F, L->getStartLoc(), +          "failed explicitly specified loop vectorization"); +    else if (LH.getInterleave() != 1) +      emitLoopInterleaveWarning( +          F->getContext(), *F, L->getStartLoc(), +          "failed explicitly specified loop interleaving"); +  } +} +  /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and  /// to what vectorization factor.  /// This class does not look at the profitability of vectorization, only the @@ -793,87 +1181,17 @@ private:  /// induction variable and the different reduction variables.  class LoopVectorizationLegality {  public: -  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, -                            TargetLibraryInfo *TLI, AliasAnalysis *AA, -                            Function *F, const TargetTransformInfo *TTI, -                            LoopAccessAnalysis *LAA) -      : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), -        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), -        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} - -  /// This enum represents the kinds of inductions that we support. -  enum InductionKind { -    IK_NoInduction,  ///< Not an induction variable. -    IK_IntInduction, ///< Integer induction variable. Step = C. -    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem). -  }; - -  /// A struct for saving information about induction variables. -  struct InductionInfo { -    InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) -        : StartValue(Start), IK(K), StepValue(Step) { -      assert(IK != IK_NoInduction && "Not an induction"); -      assert(StartValue && "StartValue is null"); -      assert(StepValue && !StepValue->isZero() && "StepValue is zero"); -      assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && -             "StartValue is not a pointer for pointer induction"); -      assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && -             "StartValue is not an integer for integer induction"); -      assert(StepValue->getType()->isIntegerTy() && -             "StepValue is not an integer"); -    } -    InductionInfo() -        : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} - -    /// Get the consecutive direction. Returns: -    ///   0 - unknown or non-consecutive. -    ///   1 - consecutive and increasing. -    ///  -1 - consecutive and decreasing. -    int getConsecutiveDirection() const { -      if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) -        return StepValue->getSExtValue(); -      return 0; -    } - -    /// Compute the transformed value of Index at offset StartValue using step -    /// StepValue. 
-    /// For integer induction, returns StartValue + Index * StepValue. -    /// For pointer induction, returns StartValue[Index * StepValue]. -    /// FIXME: The newly created binary instructions should contain nsw/nuw -    /// flags, which can be found from the original scalar operations. -    Value *transform(IRBuilder<> &B, Value *Index) const { -      switch (IK) { -      case IK_IntInduction: -        assert(Index->getType() == StartValue->getType() && -               "Index type does not match StartValue type"); -        if (StepValue->isMinusOne()) -          return B.CreateSub(StartValue, Index); -        if (!StepValue->isOne()) -          Index = B.CreateMul(Index, StepValue); -        return B.CreateAdd(StartValue, Index); - -      case IK_PtrInduction: -        assert(Index->getType() == StepValue->getType() && -               "Index type does not match StepValue type"); -        if (StepValue->isMinusOne()) -          Index = B.CreateNeg(Index); -        else if (!StepValue->isOne()) -          Index = B.CreateMul(Index, StepValue); -        return B.CreateGEP(nullptr, StartValue, Index); - -      case IK_NoInduction: -        return nullptr; -      } -      llvm_unreachable("invalid enum"); -    } - -    /// Start value. -    TrackingVH<Value> StartValue; -    /// Induction kind. -    InductionKind IK; -    /// Step value. -    ConstantInt *StepValue; -  }; +  LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, +                            DominatorTree *DT, TargetLibraryInfo *TLI, +                            AliasAnalysis *AA, Function *F, +                            const TargetTransformInfo *TTI, +                            LoopAccessAnalysis *LAA, +                            LoopVectorizationRequirements *R, +                            const LoopVectorizeHints *H) +      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), +        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), +        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), +        Requirements(R), Hints(H) {}    /// ReductionList contains the reduction descriptors for all    /// of the reductions that were found in the loop. @@ -881,7 +1199,7 @@ public:    /// InductionList saves induction variables and maps them to the    /// induction descriptor. -  typedef MapVector<PHINode*, InductionInfo> InductionList; +  typedef MapVector<PHINode*, InductionDescriptor> InductionList;    /// Returns true if it is legal to vectorize this loop.    /// This does not mean that it is profitable to vectorize this @@ -903,6 +1221,9 @@ public:    /// Returns True if V is an induction variable in this loop.    bool isInductionVariable(const Value *V); +  /// Returns True if PN is a reduction variable in this loop. +  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } +    /// Return true if the block BB needs to be predicated in order for the loop    /// to be vectorized.    bool blockNeedsPredication(BasicBlock *BB); @@ -954,12 +1275,12 @@ public:    /// Returns true if the target machine supports masked store operation    /// for the given \p DataType and kind of access to \p Ptr.    bool isLegalMaskedStore(Type *DataType, Value *Ptr) { -    return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); +    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);    }    /// Returns true if the target machine supports masked load operation    /// for the given \p DataType and kind of access to \p Ptr.    
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { -    return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); +    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);    }    /// Returns true if vector representation of the instruction \p I    /// requires mask. @@ -999,10 +1320,6 @@ private:    /// and we know that we can read from them without segfault.    bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); -  /// Returns the induction kind of Phi and record the step. This function may -  /// return NoInduction if the PHI is not an induction variable. -  InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); -    /// \brief Collect memory access with loop invariant strides.    ///    /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1013,16 +1330,20 @@ private:    /// not vectorized.  These are handled as LoopAccessReport rather than    /// VectorizationReport because the << operator of VectorizationReport returns    /// LoopAccessReport. -  void emitAnalysis(const LoopAccessReport &Message) { -    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); +  void emitAnalysis(const LoopAccessReport &Message) const { +    emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);    }    unsigned NumPredStores;    /// The loop that we evaluate.    Loop *TheLoop; -  /// Scev analysis. -  ScalarEvolution *SE; +  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. +  /// Applies dynamic knowledge to simplify SCEV expressions in the context +  /// of existing SCEV assumptions. The analysis will also add a minimal set +  /// of new predicates if this is required to enable vectorization and +  /// unrolling. +  PredicatedScalarEvolution &PSE;    /// Target Library Info.    TargetLibraryInfo *TLI;    /// Parent function @@ -1065,12 +1386,18 @@ private:    /// Can we assume the absence of NaNs.    bool HasFunNoNaNAttr; +  /// Vectorization requirements that will go through late-evaluation. +  LoopVectorizationRequirements *Requirements; + +  /// Used to emit an analysis of any legality issues. +  const LoopVectorizeHints *Hints; +    ValueToValueMap Strides;    SmallPtrSet<Value *, 8> StrideSet;    /// While vectorizing these instructions we have to generate a    /// call to the appropriate masked intrinsic -  SmallPtrSet<const Instruction*, 8> MaskedOp; +  SmallPtrSet<const Instruction *, 8> MaskedOp;  };  /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1082,15 +1409,14 @@ private:  /// different operations.  
class LoopVectorizationCostModel {  public: -  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, -                             LoopVectorizationLegality *Legal, +  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, +                             LoopInfo *LI, LoopVectorizationLegality *Legal,                               const TargetTransformInfo &TTI, -                             const TargetLibraryInfo *TLI, AssumptionCache *AC, -                             const Function *F, const LoopVectorizeHints *Hints) -      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), -        TheFunction(F), Hints(Hints) { -    CodeMetrics::collectEphemeralValues(L, AC, EphValues); -  } +                             const TargetLibraryInfo *TLI, DemandedBits *DB, +                             AssumptionCache *AC, const Function *F, +                             const LoopVectorizeHints *Hints) +      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), +        AC(AC), TheFunction(F), Hints(Hints) {}    /// Information about vectorization costs    struct VectorizationFactor { @@ -1103,10 +1429,10 @@ public:    /// possible.    VectorizationFactor selectVectorizationFactor(bool OptForSize); -  /// \return The size (in bits) of the widest type in the code that -  /// needs to be vectorized. We ignore values that remain scalar such as +  /// \return The size (in bits) of the smallest and widest types in the code +  /// that needs to be vectorized. We ignore values that remain scalar such as    /// 64 bit loop indices. -  unsigned getWidestType(); +  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();    /// \return The desired interleave count.    /// If interleave count has been specified by metadata it will be returned. @@ -1133,8 +1459,13 @@ public:      unsigned NumInstructions;    }; -  /// \return  information about the register usage of the loop. -  RegisterUsage calculateRegisterUsage(); +  /// \return Returns information about the register usages of the loop for the +  /// given vectorization factors. +  SmallVector<RegisterUsage, 8> +  calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); + +  /// Collect values we want to ignore in the cost model. +  void collectValuesToIgnore();  private:    /// Returns the expected execution cost. The unit of the cost does @@ -1155,17 +1486,20 @@ private:    /// not vectorized.  These are handled as LoopAccessReport rather than    /// VectorizationReport because the << operator of VectorizationReport returns    /// LoopAccessReport. -  void emitAnalysis(const LoopAccessReport &Message) { -    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); +  void emitAnalysis(const LoopAccessReport &Message) const { +    emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);    } -  /// Values used only by @llvm.assume calls. -  SmallPtrSet<const Value *, 32> EphValues; +public: +  /// Map of scalar integer values to the smallest bitwidth they can be legally +  /// represented as. The vector equivalents of these values should be truncated +  /// to this type. +  MapVector<Instruction*,uint64_t> MinBWs;    /// The loop that we evaluate.    Loop *TheLoop; -  /// Scev analysis. -  ScalarEvolution *SE; +  /// Predicated scalar evolution analysis. +  PredicatedScalarEvolution &PSE;    /// Loop Info analysis.    LoopInfo *LI;    /// Vectorization legality. @@ -1174,247 +1508,78 @@ private:    const TargetTransformInfo &TTI;    /// Target Library Info.    
const TargetLibraryInfo *TLI; +  /// Demanded bits analysis. +  DemandedBits *DB; +  /// Assumption cache. +  AssumptionCache *AC;    const Function *TheFunction; -  // Loop Vectorize Hint. +  /// Loop Vectorize Hint.    const LoopVectorizeHints *Hints; +  /// Values to ignore in the cost model. +  SmallPtrSet<const Value *, 16> ValuesToIgnore; +  /// Values to ignore in the cost model when VF > 1. +  SmallPtrSet<const Value *, 16> VecValuesToIgnore;  }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { -  enum HintKind { -    HK_WIDTH, -    HK_UNROLL, -    HK_FORCE -  }; - -  /// Hint - associates name and validation with the hint value. -  struct Hint { -    const char * Name; -    unsigned Value; // This may have to change for non-numeric values. -    HintKind Kind; - -    Hint(const char * Name, unsigned Value, HintKind Kind) -      : Name(Name), Value(Value), Kind(Kind) { } - -    bool validate(unsigned Val) { -      switch (Kind) { -      case HK_WIDTH: -        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; -      case HK_UNROLL: -        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; -      case HK_FORCE: -        return (Val <= 1); -      } -      return false; -    } -  }; - -  /// Vectorization width. -  Hint Width; -  /// Vectorization interleave factor. -  Hint Interleave; -  /// Vectorization forced -  Hint Force; - -  /// Return the loop metadata prefix. -  static StringRef Prefix() { return "llvm.loop."; } - +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by legalize and costmodel. Once +/// vectorization has been determined to be possible and profitable the +/// requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tells the user what need to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements {  public: -  enum ForceKind { -    FK_Undefined = -1, ///< Not selected. -    FK_Disabled = 0,   ///< Forcing disabled. -    FK_Enabled = 1,    ///< Forcing enabled. -  }; - -  LoopVectorizeHints(const Loop *L, bool DisableInterleaving) -      : Width("vectorize.width", VectorizerParams::VectorizationFactor, -              HK_WIDTH), -        Interleave("interleave.count", DisableInterleaving, HK_UNROLL), -        Force("vectorize.enable", FK_Undefined, HK_FORCE), -        TheLoop(L) { -    // Populate values with existing loop metadata. 
-    getHintsFromMetadata(); - -    // force-vector-interleave overrides DisableInterleaving. -    if (VectorizerParams::isInterleaveForced()) -      Interleave.Value = VectorizerParams::VectorizationInterleave; - -    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() -          << "LV: Interleaving disabled by the pass manager\n"); -  } - -  /// Mark the loop L as already vectorized by setting the width to 1. -  void setAlreadyVectorized() { -    Width.Value = Interleave.Value = 1; -    Hint Hints[] = {Width, Interleave}; -    writeHintsToMetadata(Hints); -  } - -  /// Dumps all the hint information. -  std::string emitRemark() const { -    VectorizationReport R; -    if (Force.Value == LoopVectorizeHints::FK_Disabled) -      R << "vectorization is explicitly disabled"; -    else { -      R << "use -Rpass-analysis=loop-vectorize for more info"; -      if (Force.Value == LoopVectorizeHints::FK_Enabled) { -        R << " (Force=true"; -        if (Width.Value != 0) -          R << ", Vector Width=" << Width.Value; -        if (Interleave.Value != 0) -          R << ", Interleave Count=" << Interleave.Value; -        R << ")"; -      } -    } - -    return R.str(); -  } - -  unsigned getWidth() const { return Width.Value; } -  unsigned getInterleave() const { return Interleave.Value; } -  enum ForceKind getForce() const { return (ForceKind)Force.Value; } - -private: -  /// Find hints specified in the loop metadata and update local values. -  void getHintsFromMetadata() { -    MDNode *LoopID = TheLoop->getLoopID(); -    if (!LoopID) -      return; - -    // First operand should refer to the loop id itself. -    assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); -    assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - -    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { -      const MDString *S = nullptr; -      SmallVector<Metadata *, 4> Args; - -      // The expected hint is either a MDString or a MDNode with the first -      // operand a MDString. -      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { -        if (!MD || MD->getNumOperands() == 0) -          continue; -        S = dyn_cast<MDString>(MD->getOperand(0)); -        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) -          Args.push_back(MD->getOperand(i)); -      } else { -        S = dyn_cast<MDString>(LoopID->getOperand(i)); -        assert(Args.size() == 0 && "too many arguments for MDString"); -      } - -      if (!S) -        continue; - -      // Check if the hint starts with the loop metadata prefix. -      StringRef Name = S->getString(); -      if (Args.size() == 1) -        setHint(Name, Args[0]); +  LoopVectorizationRequirements() +      : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + +  void addUnsafeAlgebraInst(Instruction *I) { +    // First unsafe algebra instruction. 
+    if (!UnsafeAlgebraInst) +      UnsafeAlgebraInst = I; +  } + +  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + +  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { +    const char *Name = Hints.vectorizeAnalysisPassName(); +    bool Failed = false; +    if (UnsafeAlgebraInst && !Hints.allowReordering()) { +      emitOptimizationRemarkAnalysisFPCommute( +          F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), +          VectorizationReport() << "cannot prove it is safe to reorder " +                                   "floating-point operations"); +      Failed = true;      } -  } - -  /// Checks string hint with one operand and set value if valid. -  void setHint(StringRef Name, Metadata *Arg) { -    if (!Name.startswith(Prefix())) -      return; -    Name = Name.substr(Prefix().size(), StringRef::npos); - -    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); -    if (!C) return; -    unsigned Val = C->getZExtValue(); -    Hint *Hints[] = {&Width, &Interleave, &Force}; -    for (auto H : Hints) { -      if (Name == H->Name) { -        if (H->validate(Val)) -          H->Value = Val; -        else -          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); -        break; -      } +    // Test if runtime memcheck thresholds are exceeded. +    bool PragmaThresholdReached = +        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; +    bool ThresholdReached = +        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; +    if ((ThresholdReached && !Hints.allowReordering()) || +        PragmaThresholdReached) { +      emitOptimizationRemarkAnalysisAliasing( +          F->getContext(), Name, *F, L->getStartLoc(), +          VectorizationReport() +              << "cannot prove it is safe to reorder memory operations"); +      DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); +      Failed = true;      } -  } -  /// Create a new hint from name / value pair. -  MDNode *createHintMetadata(StringRef Name, unsigned V) const { -    LLVMContext &Context = TheLoop->getHeader()->getContext(); -    Metadata *MDs[] = {MDString::get(Context, Name), -                       ConstantAsMetadata::get( -                           ConstantInt::get(Type::getInt32Ty(Context), V))}; -    return MDNode::get(Context, MDs); +    return Failed;    } -  /// Matches metadata with hint name. -  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { -    MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); -    if (!Name) -      return false; - -    for (auto H : HintTypes) -      if (Name->getString().endswith(H.Name)) -        return true; -    return false; -  } - -  /// Sets current hints into loop metadata, keeping other values intact. -  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { -    if (HintTypes.size() == 0) -      return; - -    // Reserve the first element to LoopID (see below). -    SmallVector<Metadata *, 4> MDs(1); -    // If the loop already has metadata, then ignore the existing operands. -    MDNode *LoopID = TheLoop->getLoopID(); -    if (LoopID) { -      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { -        MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); -        // If node in update list, ignore old value. -        if (!matchesHintMetadataName(Node, HintTypes)) -          MDs.push_back(Node); -      } -    } - -    // Now, add the missing hints. 
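
doesNotMeet above is the late-evaluation step the class comment promises: requirements recorded during legality and cost modelling only become fatal when the hints do not allow reordering, or when the pragma-specific memcheck threshold is exceeded. A condensed sketch with placeholder threshold values (the real ones are the command-line parameters named in the code):

    // Sketch only, not code from this commit.
    bool requirementsNotMetSketch(bool HasUnsafeAlgebraInst,
                                  unsigned NumRuntimePointerChecks,
                                  bool AllowReordering) {
      const unsigned PragmaThreshold = 128;   // placeholder value
      const unsigned DefaultThreshold = 8;    // placeholder value
      bool Failed = false;
      if (HasUnsafeAlgebraInst && !AllowReordering)
        Failed = true;                        // FP reassociation not permitted
      bool PragmaHit  = NumRuntimePointerChecks > PragmaThreshold;
      bool DefaultHit = NumRuntimePointerChecks > DefaultThreshold;
      if ((DefaultHit && !AllowReordering) || PragmaHit)
        Failed = true;                        // too many runtime memory checks
      return Failed;
    }
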
-    for (auto H : HintTypes) -      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - -    // Replace current metadata node with new one. -    LLVMContext &Context = TheLoop->getHeader()->getContext(); -    MDNode *NewLoopID = MDNode::get(Context, MDs); -    // Set operand 0 to refer to the loop id itself. -    NewLoopID->replaceOperandWith(0, NewLoopID); - -    TheLoop->setLoopID(NewLoopID); -  } - -  /// The loop these hints belong to. -  const Loop *TheLoop; +private: +  unsigned NumRuntimePointerChecks; +  Instruction *UnsafeAlgebraInst;  }; -static void emitMissedWarning(Function *F, Loop *L, -                              const LoopVectorizeHints &LH) { -  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, -                               L->getStartLoc(), LH.emitRemark()); - -  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { -    if (LH.getWidth() != 1) -      emitLoopVectorizeWarning( -          F->getContext(), *F, L->getStartLoc(), -          "failed explicitly specified loop vectorization"); -    else if (LH.getInterleave() != 1) -      emitLoopInterleaveWarning( -          F->getContext(), *F, L->getStartLoc(), -          "failed explicitly specified loop interleaving"); -  } -} -  static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {    if (L.empty())      return V.push_back(&L); @@ -1441,6 +1606,7 @@ struct LoopVectorize : public FunctionPass {    DominatorTree *DT;    BlockFrequencyInfo *BFI;    TargetLibraryInfo *TLI; +  DemandedBits *DB;    AliasAnalysis *AA;    AssumptionCache *AC;    LoopAccessAnalysis *LAA; @@ -1450,16 +1616,17 @@ struct LoopVectorize : public FunctionPass {    BlockFrequency ColdEntryFreq;    bool runOnFunction(Function &F) override { -    SE = &getAnalysis<ScalarEvolution>(); +    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();      TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -    BFI = &getAnalysis<BlockFrequencyInfo>(); +    BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();      auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();      TLI = TLIP ? &TLIP->getTLI() : nullptr; -    AA = &getAnalysis<AliasAnalysis>(); +    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();      AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);      LAA = &getAnalysis<LoopAccessAnalysis>(); +    DB = &getAnalysis<DemandedBits>();      // Compute some weights outside of the loop over the loops. Compute this      // using a BranchProbability to re-use its scaling math. @@ -1562,26 +1729,8 @@ struct LoopVectorize : public FunctionPass {      // less verbose reporting vectorized loops and unvectorized loops that may      // benefit from vectorization, respectively. 
-    if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { -      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); -      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, -                                     L->getStartLoc(), Hints.emitRemark()); -      return false; -    } - -    if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { -      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); -      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, -                                     L->getStartLoc(), Hints.emitRemark()); -      return false; -    } - -    if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { -      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); -      emitOptimizationRemarkAnalysis( -          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), -          "loop not vectorized: vector width and interleave count are " -          "explicitly set to 1"); +    if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { +      DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");        return false;      } @@ -1595,15 +1744,19 @@ struct LoopVectorize : public FunctionPass {          DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");        else {          DEBUG(dbgs() << "\n"); -        emitOptimizationRemarkAnalysis( -            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), -            "vectorization is not beneficial and is not explicitly forced"); +        emitAnalysisDiag(F, L, Hints, VectorizationReport() +                                          << "vectorization is not beneficial " +                                             "and is not explicitly forced");          return false;        }      } +    PredicatedScalarEvolution PSE(*SE); +      // Check if it is legal to vectorize the loop. -    LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); +    LoopVectorizationRequirements Requirements; +    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, +                                  &Requirements, &Hints);      if (!LVL.canVectorize()) {        DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");        emitMissedWarning(F, L, Hints); @@ -1611,16 +1764,18 @@ struct LoopVectorize : public FunctionPass {      }      // Use the cost model. -    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); +    LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, +                                  &Hints); +    CM.collectValuesToIgnore();      // Check the function attributes to find out if this function should be      // optimized for size.      bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && -                      F->hasFnAttribute(Attribute::OptimizeForSize); +                      F->optForSize();      // Compute the weighted frequency of this loop being executed and see if it      // is less than 20% of the function entry baseline frequency. Note that we -    // always have a canonical loop here because we think we *can* vectoriez. +    // always have a canonical loop here because we think we *can* vectorize.      // FIXME: This is hidden behind a flag due to pervasive problems with      // exactly what block frequency models.      
if (LoopVectorizeWithBlockFrequency) { @@ -1630,16 +1785,17 @@ struct LoopVectorize : public FunctionPass {          OptForSize = true;      } -    // Check the function attributes to see if implicit floats are allowed.a +    // Check the function attributes to see if implicit floats are allowed.      // FIXME: This check doesn't seem possibly correct -- what if the loop is      // an integer loop and the vector instructions selected are purely integer      // vector instructions?      if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {        DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"              "attribute is used.\n"); -      emitOptimizationRemarkAnalysis( -          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), -          "loop not vectorized due to NoImplicitFloat attribute"); +      emitAnalysisDiag( +          F, L, Hints, +          VectorizationReport() +              << "loop not vectorized due to NoImplicitFloat attribute");        emitMissedWarning(F, L, Hints);        return false;      } @@ -1651,32 +1807,86 @@ struct LoopVectorize : public FunctionPass {      // Select the interleave count.      unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); -    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " -                 << DebugLocStr << '\n'); -    DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); +    // Get user interleave count. +    unsigned UserIC = Hints.getInterleave(); + +    // Identify the diagnostic messages that should be produced. +    std::string VecDiagMsg, IntDiagMsg; +    bool VectorizeLoop = true, InterleaveLoop = true; + +    if (Requirements.doesNotMeet(F, L, Hints)) { +      DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " +                      "requirements.\n"); +      emitMissedWarning(F, L, Hints); +      return false; +    }      if (VF.Width == 1) { -      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); +      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); +      VecDiagMsg = +          "the cost-model indicates that vectorization is not beneficial"; +      VectorizeLoop = false; +    } -      if (IC == 1) { -        emitOptimizationRemarkAnalysis( -            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), -            "not beneficial to vectorize and user disabled interleaving"); -        return false; -      } -      DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); +    if (IC == 1 && UserIC <= 1) { +      // Tell the user interleaving is not beneficial. +      DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); +      IntDiagMsg = +          "the cost-model indicates that interleaving is not beneficial"; +      InterleaveLoop = false; +      if (UserIC == 1) +        IntDiagMsg += +            " and is explicitly disabled or interleave count is set to 1"; +    } else if (IC > 1 && UserIC == 1) { +      // Tell the user interleaving is beneficial, but it explicitly disabled. +      DEBUG(dbgs() +            << "LV: Interleaving is beneficial but is explicitly disabled."); +      IntDiagMsg = "the cost-model indicates that interleaving is beneficial " +                   "but is explicitly disabled or interleave count is set to 1"; +      InterleaveLoop = false; +    } -      // Report the unrolling decision. 
-      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), -                             Twine("interleaved by " + Twine(IC) + -                                   " (vectorization not beneficial)")); +    // Override IC if user provided an interleave count. +    IC = UserIC > 0 ? UserIC : IC; + +    // Emit diagnostic messages, if any. +    const char *VAPassName = Hints.vectorizeAnalysisPassName(); +    if (!VectorizeLoop && !InterleaveLoop) { +      // Do not vectorize or interleaving the loop. +      emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, +                                     L->getStartLoc(), VecDiagMsg); +      emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, +                                     L->getStartLoc(), IntDiagMsg); +      return false; +    } else if (!VectorizeLoop && InterleaveLoop) { +      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); +      emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, +                                     L->getStartLoc(), VecDiagMsg); +    } else if (VectorizeLoop && !InterleaveLoop) { +      DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " +                   << DebugLocStr << '\n'); +      emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, +                                     L->getStartLoc(), IntDiagMsg); +    } else if (VectorizeLoop && InterleaveLoop) { +      DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " +                   << DebugLocStr << '\n'); +      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); +    } + +    if (!VectorizeLoop) { +      assert(IC > 1 && "interleave count should not be 1 or 0"); +      // If we decided that it is not legal to vectorize the loop then +      // interleave it. +      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); +      Unroller.vectorize(&LVL, CM.MinBWs); -      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); -      Unroller.vectorize(&LVL); +      emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), +                             Twine("interleaved loop (interleaved count: ") + +                                 Twine(IC) + ")");      } else {        // If we decided that it is *legal* to vectorize the loop then do it. -      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); -      LB.vectorize(&LVL); +      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); +      LB.vectorize(&LVL, CM.MinBWs);        ++LoopsVectorized;        // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1686,7 +1896,7 @@ struct LoopVectorize : public FunctionPass {          AddRuntimeUnrollDisableMetaData(L);        // Report the vectorization decision. 
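
The rewritten driver logic above makes the vectorize and interleave decisions independently, lets a user-supplied interleave count override the cost model, and only then picks which transform to run. Reduced to its core, with remark emission dropped (a sketch, not the commit's code):

    // Sketch only, not code from this commit.
    enum Action { DoNothing, InterleaveOnly, VectorizeAndInterleave };

    Action chooseActionSketch(unsigned VFWidth, unsigned IC, unsigned UserIC,
                              unsigned &FinalIC) {
      bool VectorizeLoop  = VFWidth != 1;            // cost model picked a width
      bool InterleaveLoop = !(IC == 1 && UserIC <= 1) &&
                            !(IC > 1 && UserIC == 1);
      FinalIC = UserIC > 0 ? UserIC : IC;            // user count wins if given
      if (!VectorizeLoop && !InterleaveLoop)
        return DoNothing;                            // emit remarks and bail out
      if (!VectorizeLoop)
        return InterleaveOnly;                       // InnerLoopUnroller path
      return VectorizeAndInterleave;                 // InnerLoopVectorizer path
    }
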
-      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), +      emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),                               Twine("vectorized loop (vectorization width: ") +                                   Twine(VF.Width) + ", interleaved count: " +                                   Twine(IC) + ")"); @@ -1703,16 +1913,19 @@ struct LoopVectorize : public FunctionPass {      AU.addRequired<AssumptionCacheTracker>();      AU.addRequiredID(LoopSimplifyID);      AU.addRequiredID(LCSSAID); -    AU.addRequired<BlockFrequencyInfo>(); +    AU.addRequired<BlockFrequencyInfoWrapperPass>();      AU.addRequired<DominatorTreeWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>(); -    AU.addRequired<ScalarEvolution>(); +    AU.addRequired<ScalarEvolutionWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>(); -    AU.addRequired<AliasAnalysis>(); +    AU.addRequired<AAResultsWrapperPass>();      AU.addRequired<LoopAccessAnalysis>(); +    AU.addRequired<DemandedBits>();      AU.addPreserved<LoopInfoWrapperPass>();      AU.addPreserved<DominatorTreeWrapperPass>(); -    AU.addPreserved<AliasAnalysis>(); +    AU.addPreserved<BasicAAWrapperPass>(); +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();    }  }; @@ -1773,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,  int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {    assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); +  auto *SE = PSE.getSE();    // Make sure that the pointer does not point to structs.    if (Ptr->getType()->getPointerElementType()->isAggregateType())      return 0; @@ -1780,11 +1994,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {    // If this value is a pointer induction variable we know it is consecutive.    PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);    if (Phi && Inductions.count(Phi)) { -    InductionInfo II = Inductions[Phi]; +    InductionDescriptor II = Inductions[Phi];      return II.getConsecutiveDirection();    } -  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); +  GetElementPtrInst *Gep = getGEPInstruction(Ptr);    if (!Gep)      return 0; @@ -1802,10 +2016,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {      // Make sure that all of the index operands are loop invariant.      for (unsigned i = 1; i < NumOperands; ++i) -      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) +      if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))          return 0; -    InductionInfo II = Inductions[Phi]; +    InductionDescriptor II = Inductions[Phi];      return II.getConsecutiveDirection();    } @@ -1815,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {    // operand.    for (unsigned i = 0; i != NumOperands; ++i)      if (i != InductionOperand && -        !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) +        !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))        return 0;    // We can emit wide load/stores only if the last non-zero index is the    // induction variable.    const SCEV *Last = nullptr;    if (!Strides.count(Gep)) -    Last = SE->getSCEV(Gep->getOperand(InductionOperand)); +    Last = PSE.getSCEV(Gep->getOperand(InductionOperand));    else {      // Because of the multiplication by a stride we can have a s/zext cast.      
// We are going to replace this stride by 1 so the cast is safe to ignore. @@ -1833,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {      //  %idxprom = zext i32 %mul to i64  << Safe cast.      //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom      // -    Last = replaceSymbolicStrideSCEV(SE, Strides, +    Last = replaceSymbolicStrideSCEV(PSE, Strides,                                       Gep->getOperand(InductionOperand), Gep);      if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))        Last = @@ -2177,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    VectorParts &Entry = WidenMap.get(Instr);    // Handle consecutive loads/stores. -  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); +  GetElementPtrInst *Gep = getGEPInstruction(Ptr);    if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {      setDebugLocFromInst(Builder, Gep);      Value *PtrOperand = Gep->getPointerOperand(); @@ -2191,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {      Ptr = Builder.Insert(Gep2);    } else if (Gep) {      setDebugLocFromInst(Builder, Gep); -    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), -                               OrigLoop) && "Base ptr must be invariant"); +    assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), +                                        OrigLoop) && +           "Base ptr must be invariant");      // The last index does not have to be the induction. It can be      // consecutive and be a function of the index. For example A[I+1]; @@ -2209,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {        if (i == InductionOperand ||            (GepOperandInst && OrigLoop->contains(GepOperandInst))) {          assert((i == InductionOperand || -               SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && +                PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), +                                             OrigLoop)) &&                 "Must be last index or loop invariant");          VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2237,14 +2453,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {      // We don't want to update the value in the map as it might be used in      // another expression. So don't use a reference type for "StoredVal".      VectorParts StoredVal = getVectorValue(SI->getValueOperand()); -     +      for (unsigned Part = 0; Part < UF; ++Part) {        // Calculate the pointer for the specific unroll-part.        Value *PartPtr =            Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));        if (Reverse) { -        // If we store to reverse consecutive memory locations then we need +        // If we store to reverse consecutive memory locations, then we need          // to reverse the order of elements in the stored value.          
StoredVal[Part] = reverseVector(StoredVal[Part]);          // If the address is consecutive but reversed, then the @@ -2298,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {    }  } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, +                                               bool IfPredicateStore) {    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");    // Holds vector parameters or scalars, in case of uniform vals.    SmallVector<VectorParts, 4> Params; @@ -2318,7 +2535,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic      // Try using previously calculated values.      Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); -    // If the src is an instruction that appeared earlier in the basic block +    // If the src is an instruction that appeared earlier in the basic block,      // then it should already be vectorized.      if (SrcInst && OrigLoop->contains(SrcInst)) {        assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); @@ -2343,19 +2560,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic    // Create a new entry in the WidenMap and initialize it to Undef or Null.    VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); -  Instruction *InsertPt = Builder.GetInsertPoint(); -  BasicBlock *IfBlock = Builder.GetInsertBlock(); -  BasicBlock *CondBlock = nullptr; -    VectorParts Cond; -  Loop *VectorLp = nullptr;    if (IfPredicateStore) {      assert(Instr->getParent()->getSinglePredecessor() &&             "Only support single predecessor blocks");      Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),                            Instr->getParent()); -    VectorLp = LI->getLoopFor(IfBlock); -    assert(VectorLp && "Must have a loop for this block");    }    // For each vector unroll 'part': @@ -2367,12 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic        Value *Cmp = nullptr;        if (IfPredicateStore) {          Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); -        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); -        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); -        LoopVectorBody.push_back(CondBlock); -        VectorLp->addBasicBlockToLoop(CondBlock, *LI); -        // Update Builder with newly created basic block. -        Builder.SetInsertPoint(InsertPt); +        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, +                                 ConstantInt::get(Cmp->getType(), 1));        }        Instruction *Cloned = Instr->clone(); @@ -2396,85 +2602,223 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic          VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,                                                         Builder.getInt32(Width));        // End if-block. 
-      if (IfPredicateStore) { -         BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); -         LoopVectorBody.push_back(NewIfBlock); -         VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); -         Builder.SetInsertPoint(InsertPt); -         ReplaceInstWithInst(IfBlock->getTerminator(), -                             BranchInst::Create(CondBlock, NewIfBlock, Cmp)); -         IfBlock = NewIfBlock; -      } +      if (IfPredicateStore) +        PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), +                                                  Cmp));      }    }  } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, -                                 Instruction *Loc) { -  if (FirstInst) -    return FirstInst; -  if (Instruction *I = dyn_cast<Instruction>(V)) -    return I->getParent() == Loc->getParent() ? I : nullptr; -  return nullptr; +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, +                                                      Value *End, Value *Step, +                                                      Instruction *DL) { +  BasicBlock *Header = L->getHeader(); +  BasicBlock *Latch = L->getLoopLatch(); +  // As we're just creating this loop, it's possible no latch exists +  // yet. If so, use the header as this will be a single block loop. +  if (!Latch) +    Latch = Header; + +  IRBuilder<> Builder(&*Header->getFirstInsertionPt()); +  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); +  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + +  Builder.SetInsertPoint(Latch->getTerminator()); +   +  // Create i+1 and fill the PHINode. +  Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); +  Induction->addIncoming(Start, L->getLoopPreheader()); +  Induction->addIncoming(Next, Latch); +  // Create the compare. +  Value *ICmp = Builder.CreateICmpEQ(Next, End); +  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); +   +  // Now we have two terminators. Remove the old one from the block. +  Latch->getTerminator()->eraseFromParent(); + +  return Induction;  } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { -  Instruction *tnullptr = nullptr; -  if (!Legal->mustCheckStrides()) -    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - -  IRBuilder<> ChkBuilder(Loc); - -  // Emit checks. -  Value *Check = nullptr; -  Instruction *FirstInst = nullptr; -  for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), -                                         SE = Legal->strides_end(); -       SI != SE; ++SI) { -    Value *Ptr = stripIntegerCast(*SI); -    Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), -                                       "stride.chk"); -    // Store the first instruction we create. -    FirstInst = getFirstInst(FirstInst, C, Loc); -    if (Check) -      Check = ChkBuilder.CreateOr(Check, C); -    else -      Check = C; -  } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { +  if (TripCount) +    return TripCount; -  // We have to do this trickery because the IRBuilder might fold the check to a -  // constant expression in which case there is no Instruction anchored in a -  // the block. 
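
createInductionVariable above builds the plain counting loop that the vector body runs under; written out as scalar C++ its shape is roughly the following (types and names illustrative):

    // Sketch only, not code from this commit.
    void canonicalLoopShapeSketch(long Start, long Step, long End) {
      long Index = Start;            // "index" PHI, incoming from the preheader
      do {
        // ... the vectorized loop body is emitted here ...
        Index += Step;               // "index.next", created in the latch
      } while (Index != End);        // icmp eq index.next, end -> exit block
    }
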
-  LLVMContext &Ctx = Loc->getContext(); -  Instruction *TheCheck = -      BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); -  ChkBuilder.Insert(TheCheck, "stride.not.one"); -  FirstInst = getFirstInst(FirstInst, TheCheck, Loc); +  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); +  // Find the loop boundaries. +  ScalarEvolution *SE = PSE.getSE(); +  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); +  assert(BackedgeTakenCount != SE->getCouldNotCompute() && +         "Invalid loop count"); -  return std::make_pair(FirstInst, TheCheck); +  Type *IdxTy = Legal->getWidestInductionType(); +   +  // The exit count might have the type of i64 while the phi is i32. This can +  // happen if we have an induction variable that is sign extended before the +  // compare. The only way that we get a backedge taken count is that the +  // induction variable was signed and as such will not overflow. In such a case +  // truncation is legal. +  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > +      IdxTy->getPrimitiveSizeInBits()) +    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); +  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); +   +  // Get the total trip count from the count by adding 1. +  const SCEV *ExitCount = SE->getAddExpr( +      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + +  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + +  // Expand the trip count and place the new instructions in the preheader. +  // Notice that the pre-header does not change, only the loop body. +  SCEVExpander Exp(*SE, DL, "induction"); + +  // Count holds the overall loop count (N). +  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), +                                L->getLoopPreheader()->getTerminator()); + +  if (TripCount->getType()->isPointerTy()) +    TripCount = +      CastInst::CreatePointerCast(TripCount, IdxTy, +                                  "exitcount.ptrcnt.to.int", +                                  L->getLoopPreheader()->getTerminator()); + +  return TripCount;  } +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { +  if (VectorTripCount) +    return VectorTripCount; +   +  Value *TC = getOrCreateTripCount(L); +  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); +   +  // Now we need to generate the expression for N - (N % VF), which is +  // the part that the vectorized body will execute. +  // The loop step is equal to the vectorization factor (num of SIMD elements) +  // times the unroll factor (num of SIMD instructions). +  Constant *Step = ConstantInt::get(TC->getType(), VF * UF); +  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); +  VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + +  return VectorTripCount; +} + +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, +                                                         BasicBlock *Bypass) { +  Value *Count = getOrCreateTripCount(L); +  BasicBlock *BB = L->getLoopPreheader(); +  IRBuilder<> Builder(BB->getTerminator()); + +  // Generate code to check that the loop's trip count that we computed by +  // adding one to the backedge-taken count will not overflow. 
+  Value *CheckMinIters = +    Builder.CreateICmpULT(Count, +                          ConstantInt::get(Count->getType(), VF * UF), +                          "min.iters.check"); +   +  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), +                                          "min.iters.checked"); +  if (L->getParentLoop()) +    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); +  ReplaceInstWithInst(BB->getTerminator(), +                      BranchInst::Create(Bypass, NewBB, CheckMinIters)); +  LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, +                                                     BasicBlock *Bypass) { +  Value *TC = getOrCreateVectorTripCount(L); +  BasicBlock *BB = L->getLoopPreheader(); +  IRBuilder<> Builder(BB->getTerminator()); +   +  // Now, compare the new count to zero. If it is zero skip the vector loop and +  // jump to the scalar loop. +  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), +                                    "cmp.zero"); + +  // Generate code to check that the loop's trip count that we computed by +  // adding one to the backedge-taken count will not overflow. +  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), +                                          "vector.ph"); +  if (L->getParentLoop()) +    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); +  ReplaceInstWithInst(BB->getTerminator(), +                      BranchInst::Create(Bypass, NewBB, Cmp)); +  LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +  BasicBlock *BB = L->getLoopPreheader(); + +  // Generate the code to check that the SCEV assumptions that we made. +  // We want the new basic block to start at the first instruction in a +  // sequence of instructions that form a check. +  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), +                   "scev.check"); +  Value *SCEVCheck = +      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + +  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) +    if (C->isZero()) +      return; + +  // Create a new block containing the stride check. +  BB->setName("vector.scevcheck"); +  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); +  if (L->getParentLoop()) +    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); +  ReplaceInstWithInst(BB->getTerminator(), +                      BranchInst::Create(Bypass, NewBB, SCEVCheck)); +  LoopBypassBlocks.push_back(BB); +  AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, +                                               BasicBlock *Bypass) { +  BasicBlock *BB = L->getLoopPreheader(); + +  // Generate the code that checks in runtime if arrays overlap. We put the +  // checks into a separate block to make the more common case of few elements +  // faster. +  Instruction *FirstCheckInst; +  Instruction *MemRuntimeCheck; +  std::tie(FirstCheckInst, MemRuntimeCheck) = +      Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); +  if (!MemRuntimeCheck) +    return; + +  // Create a new block containing the memory check. 
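
The trip-count plumbing above reduces to a little arithmetic: the scalar trip count is the backedge-taken count plus one, the vector loop covers the largest multiple of VF * UF iterations that fits, and the "min.iters.check" guard branches straight to the scalar loop when even one vector iteration is too many. A worked sketch with VF and UF as plain parameters (illustrative, not the commit's code):

    // Sketch only, not code from this commit.
    struct TripCountsSketch {
      unsigned long long Scalar;   // N: iterations of the original loop
      unsigned long long Vector;   // "n.vec" = N - N % (VF * UF)
      bool SkipVectorLoop;         // "min.iters.check": N < VF * UF
    };

    TripCountsSketch computeTripCounts(unsigned long long BackedgeTakenCount,
                                       unsigned VF, unsigned UF) {
      TripCountsSketch T;
      T.Scalar = BackedgeTakenCount + 1;      // trip count = BTC + 1
      unsigned long long Step = (unsigned long long)VF * UF;
      T.Vector = T.Scalar - T.Scalar % Step;  // drop the "n.mod.vf" remainder
      T.SkipVectorLoop = T.Scalar < Step;     // bypass to the scalar remainder
      return T;
    }

For example, with a backedge-taken count of 99, VF = 4 and UF = 2: N is 100, the vector loop executes 96 iterations, and the scalar remainder loop handles the final 4.
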
+  BB->setName("vector.memcheck"); +  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); +  if (L->getParentLoop()) +    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); +  ReplaceInstWithInst(BB->getTerminator(), +                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); +  LoopBypassBlocks.push_back(BB); +  AddedSafetyChecks = true; +} + +  void InnerLoopVectorizer::createEmptyLoop() {    /*     In this function we generate a new loop. The new loop will contain     the vectorized instructions while the old loop will continue to run the     scalar remainder. -       [ ] <-- Back-edge taken count overflow check. +       [ ] <-- loop iteration number check.      /   |     /    v    |    [ ] <-- vector loop bypass (may consist of multiple blocks).    |  /  |    | /   v    ||   [ ]     <-- vector pre header. -  ||    | -  ||    v -  ||   [  ] \ -  ||   [  ]_|   <-- vector loop. -  ||    | -  | \   v -  |   >[ ]   <--- middle-block. +  |/    | +  |     v +  |    [  ] \ +  |    [  ]_|   <-- vector loop. +  |     | +  |     v +  |   -[ ]   <--- middle-block.    |  /  |    | /   v    -|- >[ ]     <--- new preheader. @@ -2498,65 +2842,16 @@ void InnerLoopVectorizer::createEmptyLoop() {    // don't. One example is c++ iterators that often have multiple pointer    // induction variables. In the code below we also support a case where we    // don't have a single induction variable. +  // +  // We try to obtain an induction variable from the original loop as hard +  // as possible. However if we don't find one that: +  //   - is an integer +  //   - counts from zero, stepping by one +  //   - is the size of the widest induction variable type +  // then we create a new one.    OldInduction = Legal->getInduction();    Type *IdxTy = Legal->getWidestInductionType(); -  // Find the loop boundaries. -  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); -  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - -  // The exit count might have the type of i64 while the phi is i32. This can -  // happen if we have an induction variable that is sign extended before the -  // compare. The only way that we get a backedge taken count is that the -  // induction variable was signed and as such will not overflow. In such a case -  // truncation is legal. -  if (ExitCount->getType()->getPrimitiveSizeInBits() > -      IdxTy->getPrimitiveSizeInBits()) -    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - -  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); -  // Get the total trip count from the count by adding 1. -  ExitCount = SE->getAddExpr(BackedgeTakeCount, -                             SE->getConstant(BackedgeTakeCount->getType(), 1)); - -  const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - -  // Expand the trip count and place the new instructions in the preheader. -  // Notice that the pre-header does not change, only the loop body. -  SCEVExpander Exp(*SE, DL, "induction"); - -  // We need to test whether the backedge-taken count is uint##_max. Adding one -  // to it will cause overflow and an incorrect loop trip count in the vector -  // body. In case of overflow we want to directly jump to the scalar remainder -  // loop. 
-  Value *BackedgeCount = -      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), -                        VectorPH->getTerminator()); -  if (BackedgeCount->getType()->isPointerTy()) -    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, -                                                "backedge.ptrcnt.to.int", -                                                VectorPH->getTerminator()); -  Instruction *CheckBCOverflow = -      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, -                      Constant::getAllOnesValue(BackedgeCount->getType()), -                      "backedge.overflow", VectorPH->getTerminator()); - -  // The loop index does not have to start at Zero. Find the original start -  // value from the induction PHI node. If we don't have an induction variable -  // then we know that it starts at zero. -  Builder.SetInsertPoint(VectorPH->getTerminator()); -  Value *StartIdx = ExtendedIdx = -      OldInduction -          ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), -                               IdxTy) -          : ConstantInt::get(IdxTy, 0); - -  // Count holds the overall loop count (N). -  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), -                                   VectorPH->getTerminator()); - -  LoopBypassBlocks.push_back(VectorPH); -    // Split the single block loop into the two loop structure described above.    BasicBlock *VecBody =        VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2580,118 +2875,36 @@ void InnerLoopVectorizer::createEmptyLoop() {    }    Lp->addBasicBlockToLoop(VecBody, *LI); -  // Use this IR builder to create the loop instructions (Phi, Br, Cmp) -  // inside the loop. -  Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - -  // Generate the induction variable. -  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); -  Induction = Builder.CreatePHI(IdxTy, 2, "index"); -  // The loop step is equal to the vectorization factor (num of SIMD elements) -  // times the unroll factor (num of SIMD instructions). -  Constant *Step = ConstantInt::get(IdxTy, VF * UF); - -  // Generate code to check that the loop's trip count that we computed by -  // adding one to the backedge-taken count will not overflow. -  BasicBlock *NewVectorPH = -      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); -  if (ParentLoop) -    ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); -  ReplaceInstWithInst( -      VectorPH->getTerminator(), -      BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); -  VectorPH = NewVectorPH; - -  // This is the IR builder that we use to add all of the logic for bypassing -  // the new vector loop. -  IRBuilder<> BypassBuilder(VectorPH->getTerminator()); -  setDebugLocFromInst(BypassBuilder, -                      getDebugLocFromInstOrOperands(OldInduction)); - -  // We may need to extend the index in case there is a type mismatch. -  // We know that the count starts at zero and does not overflow. -  if (Count->getType() != IdxTy) { -    // The exit count can be of pointer type. Convert it to the correct -    // integer type. -    if (ExitCount->getType()->isPointerTy()) -      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); -    else -      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); -  } - -  // Add the start index to the loop count to get the new end index. 
-  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); +  // Find the loop boundaries. +  Value *Count = getOrCreateTripCount(Lp); -  // Now we need to generate the expression for N - (N % VF), which is -  // the part that the vectorized body will execute. -  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); -  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); -  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, -                                                     "end.idx.rnd.down"); +  Value *StartIdx = ConstantInt::get(IdxTy, 0); +  // We need to test whether the backedge-taken count is uint##_max. Adding one +  // to it will cause overflow and an incorrect loop trip count in the vector +  // body. In case of overflow we want to directly jump to the scalar remainder +  // loop. +  emitMinimumIterationCountCheck(Lp, ScalarPH);    // Now, compare the new count to zero. If it is zero skip the vector loop and    // jump to the scalar loop. -  Value *Cmp = -      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); -  NewVectorPH = -      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); -  if (ParentLoop) -    ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); -  LoopBypassBlocks.push_back(VectorPH); -  ReplaceInstWithInst(VectorPH->getTerminator(), -                      BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); -  VectorPH = NewVectorPH; - -  // Generate the code to check that the strides we assumed to be one are really -  // one. We want the new basic block to start at the first instruction in a -  // sequence of instructions that form a check. -  Instruction *StrideCheck; -  Instruction *FirstCheckInst; -  std::tie(FirstCheckInst, StrideCheck) = -      addStrideCheck(VectorPH->getTerminator()); -  if (StrideCheck) { -    AddedSafetyChecks = true; -    // Create a new block containing the stride check. -    VectorPH->setName("vector.stridecheck"); -    NewVectorPH = -        VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); -    if (ParentLoop) -      ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); -    LoopBypassBlocks.push_back(VectorPH); - -    // Replace the branch into the memory check block with a conditional branch -    // for the "few elements case". -    ReplaceInstWithInst( -        VectorPH->getTerminator(), -        BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - -    VectorPH = NewVectorPH; -  } +  emitVectorLoopEnteredCheck(Lp, ScalarPH); +  // Generate the code to check any assumptions that we've made for SCEV +  // expressions. +  emitSCEVChecks(Lp, ScalarPH);    // Generate the code that checks in runtime if arrays overlap. We put the    // checks into a separate block to make the more common case of few elements    // faster. -  Instruction *MemRuntimeCheck; -  std::tie(FirstCheckInst, MemRuntimeCheck) = -      Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); -  if (MemRuntimeCheck) { -    AddedSafetyChecks = true; -    // Create a new block containing the memory check. -    VectorPH->setName("vector.memcheck"); -    NewVectorPH = -        VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); -    if (ParentLoop) -      ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); -    LoopBypassBlocks.push_back(VectorPH); - -    // Replace the branch into the memory check block with a conditional branch -    // for the "few elements case". 
-    ReplaceInstWithInst( -        VectorPH->getTerminator(), -        BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - -    VectorPH = NewVectorPH; -  } +  emitMemRuntimeChecks(Lp, ScalarPH); +   +  // Generate the induction variable. +  // The loop step is equal to the vectorization factor (num of SIMD elements) +  // times the unroll factor (num of SIMD instructions). +  Value *CountRoundDown = getOrCreateVectorTripCount(Lp); +  Constant *Step = ConstantInt::get(IdxTy, VF * UF); +  Induction = +    createInductionVariable(Lp, StartIdx, CountRoundDown, Step, +                            getDebugLocFromInstOrOperands(OldInduction));    // We are going to resume the execution of the scalar loop.    // Go over all of the induction variables that we found and fix the @@ -2701,152 +2914,60 @@ void InnerLoopVectorizer::createEmptyLoop() {    // If we come from a bypass edge then we need to start from the original    // start value. -  // This variable saves the new starting index for the scalar loop. -  PHINode *ResumeIndex = nullptr; +  // This variable saves the new starting index for the scalar loop. It is used +  // to test if there are any tail iterations left once the vector loop has +  // completed.    LoopVectorizationLegality::InductionList::iterator I, E;    LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); -  // Set builder to point to last bypass block. -  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());    for (I = List->begin(), E = List->end(); I != E; ++I) {      PHINode *OrigPhi = I->first; -    LoopVectorizationLegality::InductionInfo II = I->second; - -    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); -    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", -                                         MiddleBlock->getTerminator()); -    // We might have extended the type of the induction variable but we need a -    // truncated version for the scalar loop. -    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? -      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", -                      MiddleBlock->getTerminator()) : nullptr; +    InductionDescriptor II = I->second;      // Create phi nodes to merge from the  backedge-taken check block. -    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", +    PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, +                                           "bc.resume.val",                                             ScalarPH->getTerminator()); -    BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - -    PHINode *BCTruncResumeVal = nullptr; +    Value *EndValue;      if (OrigPhi == OldInduction) { -      BCTruncResumeVal = -          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", -                          ScalarPH->getTerminator()); -      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); -    } - -    Value *EndValue = nullptr; -    switch (II.IK) { -    case LoopVectorizationLegality::IK_NoInduction: -      llvm_unreachable("Unknown induction"); -    case LoopVectorizationLegality::IK_IntInduction: { -      // Handle the integer induction counter. -      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - -      // We have the canonical induction variable. -      if (OrigPhi == OldInduction) { -        // Create a truncated version of the resume value for the scalar loop, -        // we might have promoted the type to a larger width. 
-        EndValue = -          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); -        // The new PHI merges the original incoming value, in case of a bypass, -        // or the value at the end of the vectorized loop. -        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) -          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); -        TruncResumeVal->addIncoming(EndValue, VecBody); - -        BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - -        // We know what the end value is. -        EndValue = IdxEndRoundDown; -        // We also know which PHI node holds it. -        ResumeIndex = ResumeVal; -        break; -      } - -      // Not the canonical induction variable - add the vector loop count to the -      // start value. -      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, -                                                   II.StartValue->getType(), -                                                   "cast.crd"); -      EndValue = II.transform(BypassBuilder, CRD); +      // We know what the end value is. +      EndValue = CountRoundDown; +    } else { +      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); +      Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, +                                       II.getStepValue()->getType(), +                                       "cast.crd"); +      EndValue = II.transform(B, CRD);        EndValue->setName("ind.end"); -      break;      } -    case LoopVectorizationLegality::IK_PtrInduction: { -      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, -                                                   II.StepValue->getType(), -                                                   "cast.crd"); -      EndValue = II.transform(BypassBuilder, CRD); -      EndValue->setName("ptr.ind.end"); -      break; -    } -    }// end of case      // The new PHI merges the original incoming value, in case of a bypass,      // or the value at the end of the vectorized loop. -    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { -      if (OrigPhi == OldInduction) -        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); -      else -        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); -    } -    ResumeVal->addIncoming(EndValue, VecBody); +    BCResumeVal->addIncoming(EndValue, MiddleBlock);      // Fix the scalar body counter (PHI node).      unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);      // The old induction's phi node in the scalar body needs the truncated      // value. -    if (OrigPhi == OldInduction) { -      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); -      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); -    } else { -      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); -      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); -    } +    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) +      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); +    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);    } -  // If we are generating a new induction variable then we also need to -  // generate the code that calculates the exit value. This value is not -  // simply the end of the counter because we may skip the vectorized body -  // in case of a runtime check. 
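
For an integer induction variable, the end value computed above (II.transform applied to the vector trip count) is just the affine formula start + step * vectorTripCount; pointer inductions go through a GEP instead. A small sketch under that assumption, with inductionEndValue and scalarResumeValue as illustrative names:

#include <cstdint>

// End value of an affine integer induction i = start + step * iteration after
// the vector body has run vectorTripCount iterations.
int64_t inductionEndValue(int64_t start, int64_t step, uint64_t vectorTripCount) {
  return start + step * static_cast<int64_t>(vectorTripCount);
}

// The bc.resume.val phi: resume the scalar loop from the end value when the
// vector body actually ran, or from the original start when a bypass fired.
int64_t scalarResumeValue(bool tookBypassEdge, int64_t start, int64_t endValue) {
  return tookBypassEdge ? start : endValue;
}
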
-  if (!OldInduction){ -    assert(!ResumeIndex && "Unexpected resume value found"); -    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", -                                  MiddleBlock->getTerminator()); -    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) -      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); -    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); -  } - -  // Make sure that we found the index where scalar loop needs to continue. -  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && -         "Invalid resume Index"); -    // Add a check in the middle block to see if we have completed    // all of the iterations in the first vector loop.    // If (N - N%VF) == N, then we *don't* need to run the remainder. -  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, -                                ResumeIndex, "cmp.n", +  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, +                                CountRoundDown, "cmp.n",                                  MiddleBlock->getTerminator());    ReplaceInstWithInst(MiddleBlock->getTerminator(),                        BranchInst::Create(ExitBlock, ScalarPH, CmpN)); -  // Create i+1 and fill the PHINode. -  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); -  Induction->addIncoming(StartIdx, VectorPH); -  Induction->addIncoming(NextIdx, VecBody); -  // Create the compare. -  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); -  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - -  // Now we have two terminators. Remove the old one from the block. -  VecBody->getTerminator()->eraseFromParent(); -    // Get ready to start creating new instructions into the vectorized body. -  Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); +  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());    // Save the state. -  LoopVectorPreHeader = VectorPH; +  LoopVectorPreHeader = Lp->getLoopPreheader();    LoopScalarPreHeader = ScalarPH;    LoopMiddleBlock = MiddleBlock;    LoopExitBlock = ExitBlock; @@ -2899,7 +3020,7 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) {    for (unsigned i = 0, e = BBs.size(); i != e; ++i) {      BasicBlock *BB = BBs[i];      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { -      Instruction *In = I++; +      Instruction *In = &*I++;        if (!CSEDenseMapInfo::canHandle(In))          continue; @@ -3021,6 +3142,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);  } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { +  IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); +  IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); +  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { +  IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); +  IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); +  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { +  // For every instruction `I` in MinBWs, truncate the operands, create a +  // truncated version of `I` and reextend its result. InstCombine runs +  // later and will remove any ext/trunc pairs. 
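
A scalar C++ model of the shrink-and-reextend rewrite described above may help; wideAdd and shrunkenAdd are illustrative names and an 8-bit minimum width is assumed:

#include <cstdint>

// Performing the operation in the narrow type and zero-extending the result
// back preserves the value whenever only the low bits are ever demanded,
// which is exactly what the MinBWs analysis has established.
uint32_t wideAdd(uint32_t a, uint32_t b) { return a + b; }

uint32_t shrunkenAdd(uint32_t a, uint32_t b) {
  uint8_t na = static_cast<uint8_t>(a);       // trunc of operand 0
  uint8_t nb = static_cast<uint8_t>(b);       // trunc of operand 1
  uint8_t nr = static_cast<uint8_t>(na + nb); // the narrow operation
  return static_cast<uint32_t>(nr);           // zext back to the original type
}
// For every a and b, (wideAdd(a, b) & 0xff) == shrunkenAdd(a, b); InstCombine
// later removes the trunc/zext pairs this rewrite introduces.
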
+  // +  for (auto &KV : MinBWs) { +    VectorParts &Parts = WidenMap.get(KV.first); +    for (Value *&I : Parts) { +      if (I->use_empty()) +        continue; +      Type *OriginalTy = I->getType(); +      Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), +                                                 KV.second); +      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, +                                          OriginalTy->getVectorNumElements()); +      if (TruncatedTy == OriginalTy) +        continue; + +      IRBuilder<> B(cast<Instruction>(I)); +      auto ShrinkOperand = [&](Value *V) -> Value* { +        if (auto *ZI = dyn_cast<ZExtInst>(V)) +          if (ZI->getSrcTy() == TruncatedTy) +            return ZI->getOperand(0); +        return B.CreateZExtOrTrunc(V, TruncatedTy); +      }; + +      // The actual instruction modification depends on the instruction type, +      // unfortunately. +      Value *NewI = nullptr; +      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { +        NewI = B.CreateBinOp(BO->getOpcode(), +                             ShrinkOperand(BO->getOperand(0)), +                             ShrinkOperand(BO->getOperand(1))); +        cast<BinaryOperator>(NewI)->copyIRFlags(I); +      } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) { +        NewI = B.CreateICmp(CI->getPredicate(), +                            ShrinkOperand(CI->getOperand(0)), +                            ShrinkOperand(CI->getOperand(1))); +      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { +        NewI = B.CreateSelect(SI->getCondition(), +                              ShrinkOperand(SI->getTrueValue()), +                              ShrinkOperand(SI->getFalseValue())); +      } else if (CastInst *CI = dyn_cast<CastInst>(I)) { +        switch (CI->getOpcode()) { +        default: llvm_unreachable("Unhandled cast!"); +        case Instruction::Trunc: +          NewI = ShrinkOperand(CI->getOperand(0)); +          break; +        case Instruction::SExt: +          NewI = B.CreateSExtOrTrunc(CI->getOperand(0), +                                     smallestIntegerVectorType(OriginalTy, +                                                               TruncatedTy)); +          break; +        case Instruction::ZExt: +          NewI = B.CreateZExtOrTrunc(CI->getOperand(0), +                                     smallestIntegerVectorType(OriginalTy, +                                                               TruncatedTy)); +          break; +        } +      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { +        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); +        auto *O0 = +          B.CreateZExtOrTrunc(SI->getOperand(0), +                              VectorType::get(ScalarTruncatedTy, Elements0)); +        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); +        auto *O1 = +          B.CreateZExtOrTrunc(SI->getOperand(1), +                              VectorType::get(ScalarTruncatedTy, Elements1)); + +        NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); +      } else if (isa<LoadInst>(I)) { +        // Don't do anything with the operands, just extend the result. +        continue; +      } else { +        llvm_unreachable("Unhandled instruction type!"); +      } + +      // Lastly, extend the result. 
+      NewI->takeName(cast<Instruction>(I)); +      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); +      I->replaceAllUsesWith(Res); +      cast<Instruction>(I)->eraseFromParent(); +      I = Res; +    } +  } + +  // We'll have created a bunch of ZExts that are now parentless. Clean up. +  for (auto &KV : MinBWs) { +    VectorParts &Parts = WidenMap.get(KV.first); +    for (Value *&I : Parts) { +      ZExtInst *Inst = dyn_cast<ZExtInst>(I); +      if (Inst && Inst->use_empty()) { +        Value *NewI = Inst->getOperand(0); +        Inst->eraseFromParent(); +        I = NewI; +      } +    } +  } +} +  void InnerLoopVectorizer::vectorizeLoop() {    //===------------------------------------------------===//    // @@ -3051,6 +3283,11 @@ void InnerLoopVectorizer::vectorizeLoop() {         be = DFS.endRPO(); bb != be; ++bb)      vectorizeBlockInLoop(*bb, &RdxPHIsToFix); +  // Insert truncates and extends for any truncated instructions as hints to +  // InstCombine. +  if (VF > 1) +    truncateToMinimalBitwidths(); +      // At this point every instruction in the original loop is widened to    // a vector form. We are almost done. Now, we need to fix the PHI nodes    // that we vectorized. The PHI nodes are currently empty because we did @@ -3066,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() {      assert(RdxPhi && "Unable to recover vectorized PHI");      // Find the reduction variable descriptor. -    assert(Legal->getReductionVars()->count(RdxPhi) && +    assert(Legal->isReductionVariable(RdxPhi) &&             "Unable to find the reduction variable");      RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3141,21 +3378,33 @@ void InnerLoopVectorizer::vectorizeLoop() {      // the PHIs and the values we are going to write.      // This allows us to write both PHINodes and the extractelement      // instructions. -    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); +    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); -    VectorParts RdxParts; +    VectorParts RdxParts = getVectorValue(LoopExitInst);      setDebugLocFromInst(Builder, LoopExitInst); -    for (unsigned part = 0; part < UF; ++part) { -      // This PHINode contains the vectorized reduction variable, or -      // the initial value vector, if we bypass the vector loop. -      VectorParts &RdxExitVal = getVectorValue(LoopExitInst); -      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); -      Value *StartVal = (part == 0) ? VectorStart : Identity; -      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) -        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); -      NewPhi->addIncoming(RdxExitVal[part], -                          LoopVectorBody.back()); -      RdxParts.push_back(NewPhi); + +    // If the vector reduction can be performed in a smaller type, we truncate +    // then extend the loop exit value to enable InstCombine to evaluate the +    // entire expression in the smaller type. +    if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { +      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); +      Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); +      for (unsigned part = 0; part < UF; ++part) { +        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); +        Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) +                                          : Builder.CreateZExt(Trunc, VecTy); +        for (Value::user_iterator UI = RdxParts[part]->user_begin(); +             UI != RdxParts[part]->user_end();) +          if (*UI != Trunc) { +            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); +            RdxParts[part] = Extnd; +          } else { +            ++UI; +          } +      } +      Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); +      for (unsigned part = 0; part < UF; ++part) +        RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);      }      // Reduce all of the unrolled parts into a single vector. @@ -3208,13 +3457,22 @@ void InnerLoopVectorizer::vectorizeLoop() {        // The result is in the first element of the vector.        ReducedPartRdx = Builder.CreateExtractElement(TmpVec,                                                      Builder.getInt32(0)); + +      // If the reduction can be performed in a smaller type, we need to extend +      // the reduction to the wider type before we branch to the original loop. +      if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) +        ReducedPartRdx = +            RdxDesc.isSigned() +                ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) +                : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());      }      // Create a phi node that merges control-flow from the backedge-taken check      // block and the middle block.      PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",                                            LoopScalarPreHeader->getTerminator()); -    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); +    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) +      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);      BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);      // Now, we need to fix the users of the reduction variable @@ -3252,6 +3510,20 @@ void InnerLoopVectorizer::vectorizeLoop() {    fixLCSSAPHIs(); +  // Make sure DomTree is updated. +  updateAnalysis(); +   +  // Predicate any stores. +  for (auto KV : PredicatedStores) { +    BasicBlock::iterator I(KV.first); +    auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); +    auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, +                                        /*BranchWeights=*/nullptr, DT); +    I->moveBefore(T); +    I->getParent()->setName("pred.store.if"); +    BB->setName("pred.store.continue"); +  } +  DEBUG(DT->verifyDomTree());    // Remove redundant induction instructions.    cse(LoopVectorBody);  } @@ -3326,18 +3598,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {    return BlockMask;  } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, -                                              InnerLoopVectorizer::VectorParts &Entry, -                                              unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( +    Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, +    unsigned VF, PhiVector *PV) {    PHINode* P = cast<PHINode>(PN);    // Handle reduction variables: -  if (Legal->getReductionVars()->count(P)) { +  if (Legal->isReductionVariable(P)) {      for (unsigned part = 0; part < UF; ++part) {        // This is phase one of vectorizing PHIs.        Type *VecTy = (VF == 1) ? 
PN->getType() :        VectorType::get(PN->getType(), VF); -      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", -                                    LoopVectorBody.back()-> getFirstInsertionPt()); +      Entry[part] = PHINode::Create( +          VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());      }      PV->push_back(P);      return; @@ -3385,53 +3657,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,    assert(Legal->getInductionVars()->count(P) &&           "Not an induction variable"); -  LoopVectorizationLegality::InductionInfo II = -  Legal->getInductionVars()->lookup(P); +  InductionDescriptor II = Legal->getInductionVars()->lookup(P);    // FIXME: The newly created binary instructions should contain nsw/nuw flags,    // which can be found from the original scalar operations. -  switch (II.IK) { -    case LoopVectorizationLegality::IK_NoInduction: +  switch (II.getKind()) { +    case InductionDescriptor::IK_NoInduction:        llvm_unreachable("Unknown induction"); -    case LoopVectorizationLegality::IK_IntInduction: { -      assert(P->getType() == II.StartValue->getType() && "Types must match"); -      Type *PhiTy = P->getType(); -      Value *Broadcasted; -      if (P == OldInduction) { -        // Handle the canonical induction variable. We might have had to -        // extend the type. -        Broadcasted = Builder.CreateTrunc(Induction, PhiTy); -      } else { -        // Handle other induction variables that are now based on the -        // canonical one. -        Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, -                                                 "normalized.idx"); -        NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); -        Broadcasted = II.transform(Builder, NormalizedIdx); -        Broadcasted->setName("offset.idx"); +    case InductionDescriptor::IK_IntInduction: { +      assert(P->getType() == II.getStartValue()->getType() && +             "Types must match"); +      // Handle other induction variables that are now based on the +      // canonical one. +      Value *V = Induction; +      if (P != OldInduction) { +        V = Builder.CreateSExtOrTrunc(Induction, P->getType()); +        V = II.transform(Builder, V); +        V->setName("offset.idx");        } -      Broadcasted = getBroadcastInstrs(Broadcasted); +      Value *Broadcasted = getBroadcastInstrs(V);        // After broadcasting the induction variable we need to make the vector        // consecutive by adding 0, 1, 2, etc.        for (unsigned part = 0; part < UF; ++part) -        Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); +        Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());        return;      } -    case LoopVectorizationLegality::IK_PtrInduction: +    case InductionDescriptor::IK_PtrInduction:        // Handle the pointer induction variable case.        assert(P->getType()->isPointerTy() && "Unexpected type.");        // This is the normalized GEP that starts counting at zero. -      Value *NormalizedIdx = -          Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); -      NormalizedIdx = -          Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); +      Value *PtrInd = Induction; +      PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());        // This is the vector of results. Notice that we don't generate        // vector geps because scalar geps result in better code.        
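
The loop that follows computes one scalar index per lane rather than a vector GEP; a compact sketch of that indexing, with laneIndices as an illustrative name:

#include <array>
#include <cstddef>

// Lane i of unroll part `part` addresses element ptrInd + part * VF + i; the
// pass feeds each of these scalar indices through II.transform to obtain a
// scalar GEP and then inserts the results into a vector.
template <unsigned VF>
std::array<std::size_t, VF> laneIndices(std::size_t ptrInd, unsigned part) {
  std::array<std::size_t, VF> idx{};
  for (unsigned i = 0; i < VF; ++i)
    idx[i] = ptrInd + part * VF + i;
  return idx;
}
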
for (unsigned part = 0; part < UF; ++part) {          if (VF == 1) {            int EltIndex = part; -          Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); -          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); +          Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); +          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);            Value *SclrGep = II.transform(Builder, GlobalIdx);            SclrGep->setName("next.gep");            Entry[part] = SclrGep; @@ -3441,8 +3704,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));          for (unsigned int i = 0; i < VF; ++i) {            int EltIndex = i + part * VF; -          Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); -          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); +          Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); +          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);            Value *SclrGep = II.transform(Builder, GlobalIdx);            SclrGep->setName("next.gep");            VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3458,7 +3721,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,  void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {    // For each instruction in the old loop.    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { -    VectorParts &Entry = WidenMap.get(it); +    VectorParts &Entry = WidenMap.get(&*it); +      switch (it->getOpcode()) {      case Instruction::Br:        // Nothing to do for PHIs and BR, since we already took care of the @@ -3466,7 +3730,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        continue;      case Instruction::PHI: {        // Vectorize PHINodes. -      widenPHIInstruction(it, Entry, UF, VF, PV); +      widenPHIInstruction(&*it, Entry, UF, VF, PV);        continue;      }// End of PHI. @@ -3504,16 +3768,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {          Entry[Part] = V;        } -      propagateMetadata(Entry, it); +      propagateMetadata(Entry, &*it);        break;      }      case Instruction::Select: {        // Widen selects.        // If the selector is loop invariant we can create a select        // instruction with a scalar condition. Otherwise, use vector-select. -      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), -                                               OrigLoop); -      setDebugLocFromInst(Builder, it); +      auto *SE = PSE.getSE(); +      bool InvariantCond = +          SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); +      setDebugLocFromInst(Builder, &*it);        // The condition can be loop invariant  but still defined inside the        // loop. This means that we can't just use the original 'cond' value. @@ -3522,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        VectorParts &Cond = getVectorValue(it->getOperand(0));        VectorParts &Op0  = getVectorValue(it->getOperand(1));        VectorParts &Op1  = getVectorValue(it->getOperand(2)); - +              Value *ScalarCond = (VF == 1) ? 
Cond[0] :          Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3533,7 +3798,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {            Op1[Part]);        } -      propagateMetadata(Entry, it); +      propagateMetadata(Entry, &*it);        break;      } @@ -3542,25 +3807,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        // Widen compares. Generate vector compares.        bool FCmp = (it->getOpcode() == Instruction::FCmp);        CmpInst *Cmp = dyn_cast<CmpInst>(it); -      setDebugLocFromInst(Builder, it); +      setDebugLocFromInst(Builder, &*it);        VectorParts &A = getVectorValue(it->getOperand(0));        VectorParts &B = getVectorValue(it->getOperand(1));        for (unsigned Part = 0; Part < UF; ++Part) {          Value *C = nullptr; -        if (FCmp) +        if (FCmp) {            C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); -        else +          cast<FCmpInst>(C)->copyFastMathFlags(&*it); +        } else {            C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); +        }          Entry[Part] = C;        } -      propagateMetadata(Entry, it); +      propagateMetadata(Entry, &*it);        break;      }      case Instruction::Store:      case Instruction::Load: -      vectorizeMemoryInstruction(it); +      vectorizeMemoryInstruction(&*it);          break;      case Instruction::ZExt:      case Instruction::SExt: @@ -3575,7 +3842,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {      case Instruction::FPTrunc:      case Instruction::BitCast: {        CastInst *CI = dyn_cast<CastInst>(it); -      setDebugLocFromInst(Builder, it); +      setDebugLocFromInst(Builder, &*it);        /// Optimize the special case where the source is the induction        /// variable. Notice that we can only optimize the 'trunc' case        /// because: a. FP conversions lose precision, b. sext/zext may wrap, @@ -3585,13 +3852,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {          Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,                                                 CI->getType());          Value *Broadcasted = getBroadcastInstrs(ScalarCast); -        LoopVectorizationLegality::InductionInfo II = +        InductionDescriptor II =              Legal->getInductionVars()->lookup(OldInduction); -        Constant *Step = -            ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); +        Constant *Step = ConstantInt::getSigned( +            CI->getType(), II.getStepValue()->getSExtValue());          for (unsigned Part = 0; Part < UF; ++Part)            Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); -        propagateMetadata(Entry, it); +        propagateMetadata(Entry, &*it);          break;        }        /// Vectorize casts. @@ -3601,7 +3868,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        VectorParts &A = getVectorValue(it->getOperand(0));        for (unsigned Part = 0; Part < UF; ++Part)          Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); -      propagateMetadata(Entry, it); +      propagateMetadata(Entry, &*it);        break;      } @@ -3609,7 +3876,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        // Ignore dbg intrinsics.        
if (isa<DbgInfoIntrinsic>(it))          break; -      setDebugLocFromInst(Builder, it); +      setDebugLocFromInst(Builder, &*it);        Module *M = BB->getParent()->getParent();        CallInst *CI = cast<CallInst>(it); @@ -3625,7 +3892,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        if (ID &&            (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||             ID == Intrinsic::lifetime_start)) { -        scalarizeInstruction(it); +        scalarizeInstruction(&*it);          break;        }        // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3636,7 +3903,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {        bool UseVectorIntrinsic =            ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;        if (!UseVectorIntrinsic && NeedToScalarize) { -        scalarizeInstruction(it); +        scalarizeInstruction(&*it);          break;        } @@ -3677,13 +3944,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {          Entry[Part] = Builder.CreateCall(VectorF, Args);        } -      propagateMetadata(Entry, it); +      propagateMetadata(Entry, &*it);        break;      }      default:        // All other instructions are unsupported. Scalarize them. -      scalarizeInstruction(it); +      scalarizeInstruction(&*it);        break;      }// end of switch.    }// end of for_each instr. @@ -3691,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {  void InnerLoopVectorizer::updateAnalysis() {    // Forget the original basic block. -  SE->forgetLoop(OrigLoop); +  PSE.getSE()->forgetLoop(OrigLoop);    // Update the dominator tree information.    assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -3701,19 +3968,12 @@ void InnerLoopVectorizer::updateAnalysis() {      DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);    DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); -  // Due to if predication of stores we might create a sequence of "if(pred) -  // a[i] = ...;  " blocks. -  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { -    if (i == 0) -      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); -    else if (isPredicatedBlock(i)) { -      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); -    } else { -      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); -    } -  } +  // We don't predicate stores by this point, so the vector body should be a +  // single loop. +  assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); +  DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); -  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); +  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());    DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);    DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);    DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -3850,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() {    }    // ScalarEvolution needs to be able to find the exit count. 
-  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); -  if (ExitCount == SE->getCouldNotCompute()) { -    emitAnalysis(VectorizationReport() << -                 "could not determine number of loop iterations"); +  const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); +  if (ExitCount == PSE.getSE()->getCouldNotCompute()) { +    emitAnalysis(VectorizationReport() +                 << "could not determine number of loop iterations");      DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");      return false;    } @@ -3879,10 +4139,28 @@ bool LoopVectorizationLegality::canVectorize() {                         : "")                 << "!\n"); +  bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + +  // If an override option has been passed in for interleaved accesses, use it. +  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) +    UseInterleaved = EnableInterleavedMemAccesses; +    // Analyze interleaved memory accesses. -  if (EnableInterleavedMemAccesses) +  if (UseInterleaved)      InterleaveInfo.analyzeInterleaving(Strides); +  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; +  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) +    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + +  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { +    emitAnalysis(VectorizationReport() +                 << "Too many SCEV assumptions need to be made and checked " +                 << "at runtime"); +    DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); +    return false; +  } +    // Okay! We can vectorize. At this point we don't have any other mem analysis    // which may limit our maximum vectorization factor, so just return true with    // no restrictions. @@ -3929,7 +4207,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,  }  bool LoopVectorizationLegality::canVectorizeInstrs() { -  BasicBlock *PreHeader = TheLoop->getLoopPreheader();    BasicBlock *Header = TheLoop->getHeader();    // Look for the attribute signaling the absence of NaNs. @@ -3953,7 +4230,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {          if (!PhiTy->isIntegerTy() &&              !PhiTy->isFloatingPointTy() &&              !PhiTy->isPointerTy()) { -          emitAnalysis(VectorizationReport(it) +          emitAnalysis(VectorizationReport(&*it)                         << "loop control flow is not understood by vectorizer");            DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");            return false; @@ -3965,9 +4242,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {          if (*bb != Header) {            // Check that this instruction has no outside users or is an            // identified reduction value with an outside user. -          if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) +          if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))              continue; -          emitAnalysis(VectorizationReport(it) << +          emitAnalysis(VectorizationReport(&*it) <<                         "value could not be identified as "                         "an induction or reduction variable");            return false; @@ -3975,19 +4252,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {          // We only allow if-converted PHIs with exactly two incoming values.          
if (Phi->getNumIncomingValues() != 2) { -          emitAnalysis(VectorizationReport(it) +          emitAnalysis(VectorizationReport(&*it)                         << "control flow not understood by vectorizer");            DEBUG(dbgs() << "LV: Found an invalid PHI.\n");            return false;          } -        // This is the value coming from the preheader. -        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); -        ConstantInt *StepValue = nullptr; -        // Check if this is an induction variable. -        InductionKind IK = isInductionVariable(Phi, StepValue); - -        if (IK_NoInduction != IK) { +        InductionDescriptor ID; +        if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { +          Inductions[Phi] = ID;            // Get the widest type.            if (!WidestIndTy)              WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -3995,21 +4268,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {              WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);            // Int inductions are special because we only allow one IV. -          if (IK == IK_IntInduction && StepValue->isOne()) { +          if (ID.getKind() == InductionDescriptor::IK_IntInduction && +              ID.getStepValue()->isOne() && +              isa<Constant>(ID.getStartValue()) && +                cast<Constant>(ID.getStartValue())->isNullValue()) {              // Use the phi node with the widest type as induction. Use the last              // one if there are multiple (no good reason for doing this other -            // than it is expedient). +            // than it is expedient). We've checked that it begins at zero and +            // steps by one, so this is a canonical induction variable.              if (!Induction || PhiTy == WidestIndTy)                Induction = Phi;            }            DEBUG(dbgs() << "LV: Found an induction variable.\n"); -          Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);            // Until we explicitly handle the case of an induction variable with            // an outside loop user we have to give up vectorizing this loop. 
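
The tightened test above reduces to a simple predicate on the induction descriptor; a sketch with an illustrative IVInfo struct standing in for InductionDescriptor:

// A phi can serve as the canonical induction variable only if it is an
// integer IV that starts at zero and steps by one; otherwise the vectorizer
// creates a fresh canonical IV of the widest induction type.
struct IVInfo {
  bool isInteger;
  long long start;
  long long step;
};

bool isCanonicalIVCandidate(const IVInfo &iv) {
  return iv.isInteger && iv.step == 1 && iv.start == 0;
}
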
-          if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { -            emitAnalysis(VectorizationReport(it) << +          if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { +            emitAnalysis(VectorizationReport(&*it) <<                           "use of induction value outside of the "                           "loop is not handled by vectorizer");              return false; @@ -4020,11 +4296,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {          if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,                                                   Reductions[Phi])) { +          if (Reductions[Phi].hasUnsafeAlgebra()) +            Requirements->addUnsafeAlgebraInst( +                Reductions[Phi].getUnsafeAlgebraInst());            AllowedExit.insert(Reductions[Phi].getLoopExitInstr());            continue;          } -        emitAnalysis(VectorizationReport(it) << +        emitAnalysis(VectorizationReport(&*it) <<                       "value that could not be identified as "                       "reduction is used outside the loop");          DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4039,8 +4318,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {        if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&            !(CI->getCalledFunction() && TLI &&              TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { -        emitAnalysis(VectorizationReport(it) << -                     "call instruction cannot be vectorized"); +        emitAnalysis(VectorizationReport(&*it) +                     << "call instruction cannot be vectorized");          DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");          return false;        } @@ -4049,8 +4328,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {        // second argument is the same (i.e. loop invariant)        if (CI &&            hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { -        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { -          emitAnalysis(VectorizationReport(it) +        auto *SE = PSE.getSE(); +        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { +          emitAnalysis(VectorizationReport(&*it)                         << "intrinsic instruction cannot be vectorized");            DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");            return false; @@ -4061,7 +4341,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {        // Also, we can't vectorize extractelement instructions.        if ((!VectorType::isValidElementType(it->getType()) &&             !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { -        emitAnalysis(VectorizationReport(it) +        emitAnalysis(VectorizationReport(&*it)                       << "instruction return type cannot be vectorized");          DEBUG(dbgs() << "LV: Found unvectorizable type.\n");          return false; @@ -4085,8 +4365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {        // Reduction instructions are allowed to have exit users.        // All other instructions must not have external users. 
-      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { -        emitAnalysis(VectorizationReport(it) << +      if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { +        emitAnalysis(VectorizationReport(&*it) <<                       "value cannot be used outside the loop");          return false;        } @@ -4104,6 +4384,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {      }    } +  // Now we know the widest induction type, check if our found induction +  // is the same size. If it's not, unset it here and InnerLoopVectorizer +  // will create another. +  if (Induction && WidestIndTy != Induction->getType()) +    Induction = nullptr; +    return true;  } @@ -4116,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {    else      return; -  Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); +  Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);    if (!Stride)      return; @@ -4142,7 +4428,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {         BE = TheLoop->block_end(); B != BE; ++B)      for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();           I != IE; ++I) -      if (I->getType()->isPointerTy() && isConsecutivePtr(I)) +      if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))          Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());    while (!Worklist.empty()) { @@ -4179,30 +4465,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {      return false;    } -  if (LAI->getNumRuntimePointerChecks() > -      VectorizerParams::RuntimeMemoryCheckThreshold) { -    emitAnalysis(VectorizationReport() -                 << LAI->getNumRuntimePointerChecks() << " exceeds limit of " -                 << VectorizerParams::RuntimeMemoryCheckThreshold -                 << " dependent memory operations checked at runtime"); -    DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); -    return false; -  } -  return true; -} +  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); +  PSE.addPredicate(LAI->PSE.getUnionPredicate()); -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, -                                               ConstantInt *&StepValue) { -  if (!isInductionPHI(Phi, SE, StepValue)) -    return IK_NoInduction; - -  Type *PhiTy = Phi->getType(); -  // Found an Integer induction variable. -  if (PhiTy->isIntegerTy()) -    return IK_IntInduction; -  // Found an Pointer induction variable. -  return IK_PtrInduction; +  return true;  }  bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4256,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,        if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||            !isSinglePredecessor) { -        // Build a masked store if it is legal for the target, otherwise scalarize -        // the block. +        // Build a masked store if it is legal for the target, otherwise +        // scalarize the block.          bool isLegalMaskedOp =            isLegalMaskedStore(SI->getValueOperand()->getType(),                               SI->getPointerOperand()); @@ -4315,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(      StoreInst *SI = dyn_cast<StoreInst>(I);      Value *Ptr = LI ? 
LI->getPointerOperand() : SI->getPointerOperand(); -    int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); +    int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);      // The factor of the corresponding interleave group.      unsigned Factor = std::abs(Stride); @@ -4324,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses(      if (Factor < 2 || Factor > MaxInterleaveGroupFactor)        continue; -    const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); +    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);      PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());      unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4411,12 +4677,12 @@ void InterleavedAccessInfo::analyzeInterleaving(          continue;        // Calculate the distance and prepare for the rule 3. -      const SCEVConstant *DistToA = -          dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); +      const SCEVConstant *DistToA = dyn_cast<SCEVConstant>( +          PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));        if (!DistToA)          continue; -      int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); +      int DistanceToA = DistToA->getAPInt().getSExtValue();        // Skip if the distance is not multiple of size as they are not in the        // same group. @@ -4454,8 +4720,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {      emitAnalysis(VectorizationReport() <<                   "runtime pointer checks needed. Enable vectorization of this "                   "loop with '#pragma clang loop vectorize(enable)' when " -                 "compiling with -Os"); -    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); +                 "compiling with -Os/-Oz"); +    DEBUG(dbgs() << +          "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");      return Factor;    } @@ -4467,10 +4734,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {    }    // Find the trip count. -  unsigned TC = SE->getSmallConstantTripCount(TheLoop); +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); -  unsigned WidestType = getWidestType(); +  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); +  unsigned SmallestType, WidestType; +  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();    unsigned WidestRegister = TTI.getRegisterBitWidth(true);    unsigned MaxSafeDepDist = -1U;    if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4478,7 +4747,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {    WidestRegister = ((WidestRegister < MaxSafeDepDist) ?                      WidestRegister : MaxSafeDepDist);    unsigned MaxVectorSize = WidestRegister / WidestType; -  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + +  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " +               << WidestType << " bits.\n");    DEBUG(dbgs() << "LV: The Widest register is: "            << WidestRegister << " bits.\n"); @@ -4491,6 +4762,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {           " into one vector!");    unsigned VF = MaxVectorSize; +  if (MaximizeBandwidth && !OptForSize) { +    // Collect all viable vectorization factors. 
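
The selection implemented in the following lines amounts to taking the largest candidate VF whose estimated register pressure still fits the target; a sketch with illustrative names (RegUsage, pickWidestFittingVF):

#include <vector>

struct RegUsage { unsigned MaxLocalUsers; };

// Walk the candidate VFs from widest to narrowest and keep the first one
// whose peak number of simultaneously live values fits the register file;
// vfs and usage are assumed to be parallel arrays.
unsigned pickWidestFittingVF(const std::vector<unsigned> &vfs,
                             const std::vector<RegUsage> &usage,
                             unsigned targetNumRegisters,
                             unsigned fallbackVF) {
  for (int i = static_cast<int>(usage.size()) - 1; i >= 0; --i)
    if (usage[i].MaxLocalUsers <= targetNumRegisters)
      return vfs[static_cast<unsigned>(i)];
  return fallbackVF;
}
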
+    SmallVector<unsigned, 8> VFs; +    unsigned NewMaxVectorSize = WidestRegister / SmallestType; +    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) +      VFs.push_back(VS); + +    // For each VF calculate its register usage. +    auto RUs = calculateRegisterUsage(VFs); + +    // Select the largest VF which doesn't require more registers than existing +    // ones. +    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); +    for (int i = RUs.size() - 1; i >= 0; --i) { +      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { +        VF = VFs[i]; +        break; +      } +    } +  }    // If we optimize the program for size, avoid creating the tail loop.    if (OptForSize) { @@ -4499,7 +4790,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {        emitAnalysis          (VectorizationReport() <<           "unable to calculate the loop count due to complex control flow"); -      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); +      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");        return Factor;      } @@ -4515,8 +4806,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {                     "cannot optimize for size and vectorize at the "                     "same time. Enable vectorization of this loop "                     "with '#pragma clang loop vectorize(enable)' " -                   "when compiling with -Os"); -      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); +                   "when compiling with -Os/-Oz"); +      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");        return Factor;      }    } @@ -4566,7 +4857,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {    return Factor;  } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair<unsigned, unsigned> +LoopVectorizationCostModel::getSmallestAndWidestTypes() { +  unsigned MinWidth = -1U;    unsigned MaxWidth = 8;    const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4579,18 +4872,22 @@ unsigned LoopVectorizationCostModel::getWidestType() {      for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {        Type *T = it->getType(); -      // Ignore ephemeral values. -      if (EphValues.count(it)) +      // Skip ignored values. +      if (ValuesToIgnore.count(&*it))          continue;        // Only examine Loads, Stores and PHINodes.        if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))          continue; -      // Examine PHI nodes that are reduction variables. -      if (PHINode *PN = dyn_cast<PHINode>(it)) -        if (!Legal->getReductionVars()->count(PN)) +      // Examine PHI nodes that are reduction variables. Update the type to +      // account for the recurrence type. +      if (PHINode *PN = dyn_cast<PHINode>(it)) { +        if (!Legal->isReductionVariable(PN))            continue; +        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; +        T = RdxDesc.getRecurrenceType(); +      }        // Examine the stored values.        if (StoreInst *ST = dyn_cast<StoreInst>(it)) @@ -4599,15 +4896,17 @@ unsigned LoopVectorizationCostModel::getWidestType() {        // Ignore loaded pointer types and stored pointer types that are not        // consecutive. However, we do want to take consecutive stores/loads of        // pointer vectors into account. 
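The MaximizeBandwidth path introduced above picks the vectorization factor by enumerating candidate VFs from WidestRegister/WidestType up to WidestRegister/SmallestType and keeping the largest one whose estimated register usage still fits the target's vector register file. A minimal standalone sketch of that selection loop, with the per-VF usage numbers supplied directly instead of computed from IR (the types and values below are illustrative, not LLVM's):

#include <cstdio>
#include <vector>

// Simplified stand-in for LoopVectorizationCostModel::RegisterUsage.
struct RegisterUsage {
  unsigned MaxLocalUsers; // peak number of live vector registers
};

// Pick the widest candidate VF whose register pressure still fits the
// target, mirroring the MaximizeBandwidth loop in the patch.
unsigned selectVF(unsigned MaxVectorSize, unsigned TargetNumRegisters,
                  const std::vector<unsigned> &VFs,
                  const std::vector<RegisterUsage> &RUs) {
  unsigned VF = MaxVectorSize; // default: the widest type fills a register
  for (int i = (int)RUs.size() - 1; i >= 0; --i) {
    if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
      VF = VFs[i];
      break;
    }
  }
  return VF;
}

int main() {
  unsigned SmallestType = 8, WidestType = 32, WidestRegister = 128;
  unsigned MaxVectorSize = WidestRegister / WidestType;      // 4
  unsigned NewMaxVectorSize = WidestRegister / SmallestType; // 16

  // Candidate VFs: MaxVectorSize, 2*MaxVectorSize, ... up to NewMaxVectorSize.
  std::vector<unsigned> VFs;
  for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
    VFs.push_back(VS); // {4, 8, 16}

  // Pretend per-VF register-usage estimates (what calculateRegisterUsage
  // would return for this loop on this target).
  std::vector<RegisterUsage> RUs = {{6}, {11}, {20}};
  unsigned VF = selectVF(MaxVectorSize, /*TargetNumRegisters=*/16, VFs, RUs);
  std::printf("selected VF = %u\n", VF); // 8: VF=16 would need 20 registers
  return 0;
}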
-      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) +      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))          continue; +      MinWidth = std::min(MinWidth, +                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));        MaxWidth = std::max(MaxWidth,                            (unsigned)DL.getTypeSizeInBits(T->getScalarType()));      }    } -  return MaxWidth; +  return {MinWidth, MaxWidth};  }  unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4628,11 +4927,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    // 3. We don't interleave if we think that we will spill registers to memory    // due to the increased register pressure. -  // Use the user preference, unless 'auto' is selected. -  int UserUF = Hints->getInterleave(); -  if (UserUF != 0) -    return UserUF; -    // When we optimize for size, we don't interleave.    if (OptForSize)      return 1; @@ -4642,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,      return 1;    // Do not interleave loops with a relatively small trip count. -  unsigned TC = SE->getSmallConstantTripCount(TheLoop); +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);    if (TC > 1 && TC < TinyTripCountInterleaveThreshold)      return 1; @@ -4658,7 +4952,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,        TargetNumRegisters = ForceTargetNumVectorRegs;    } -  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); +  RegisterUsage R = calculateRegisterUsage({VF})[0];    // We divide by these constants so assume that we have at least one    // instruction that uses at least one register.    R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4756,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    }    // Interleave if this is a large loop (small loops are already dealt with by -  // this -  // point) that could benefit from interleaving. +  // this point) that could benefit from interleaving.    bool HasReductions = (Legal->getReductionVars()->size() > 0);    if (TTI.enableAggressiveInterleaving(HasReductions)) {      DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4768,8 +5061,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    return 1;  } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> +LoopVectorizationCostModel::calculateRegisterUsage( +    const SmallVector<unsigned, 8> &VFs) {    // This function calculates the register usage by measuring the highest number    // of values that are alive at a single location. Obviously, this is a very    // rough estimation. We scan the loop in a topological order in order and @@ -4790,8 +5084,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() {    LoopBlocksDFS DFS(TheLoop);    DFS.perform(LI); -  RegisterUsage R; -  R.NumInstructions = 0; +  RegisterUsage RU; +  RU.NumInstructions = 0;    // Each 'key' in the map opens a new interval. 
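getSmallestAndWidestTypes replaces the old getWidestType and tracks both ends of the range in a single pass over the loop's loads, stores and reduction PHIs. A reduced sketch of the same min/max scan, with instructions represented only by their scalar bit widths (the input vector is a stand-in, not IR):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// One pass over the scalar bit widths that matter to the cost model,
// analogous to LoopVectorizationCostModel::getSmallestAndWidestTypes().
std::pair<unsigned, unsigned>
getSmallestAndWidestTypes(const std::vector<unsigned> &BitWidths) {
  unsigned MinWidth = -1U; // start at "infinity", as in the patch
  unsigned MaxWidth = 8;   // never report anything narrower than a byte
  for (unsigned W : BitWidths) {
    MinWidth = std::min(MinWidth, W);
    MaxWidth = std::max(MaxWidth, W);
  }
  return {MinWidth, MaxWidth};
}

int main() {
  // e.g. an i8 load, an i16 store and an i32 reduction PHI
  auto Range = getSmallestAndWidestTypes({8, 16, 32});
  std::printf("smallest = %u, widest = %u\n", Range.first, Range.second);
  return 0; // prints: smallest = 8, widest = 32
}

The smallest width is what lets MaximizeBandwidth consider factors wider than WidestRegister/WidestType, while MinBWs from computeMinimumValueSizes feeds the shrunk types into getInstructionCost.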
The values    // of the map are the index of the 'last seen' usage of the @@ -4810,15 +5104,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() {    unsigned Index = 0;    for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),         be = DFS.endRPO(); bb != be; ++bb) { -    R.NumInstructions += (*bb)->size(); -    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; -         ++it) { -      Instruction *I = it; -      IdxToInstr[Index++] = I; +    RU.NumInstructions += (*bb)->size(); +    for (Instruction &I : **bb) { +      IdxToInstr[Index++] = &I;        // Save the end location of each USE. -      for (unsigned i = 0; i < I->getNumOperands(); ++i) { -        Value *U = I->getOperand(i); +      for (unsigned i = 0; i < I.getNumOperands(); ++i) { +        Value *U = I.getOperand(i);          Instruction *Instr = dyn_cast<Instruction>(U);          // Ignore non-instruction values such as arguments, constants, etc. @@ -4847,42 +5139,85 @@ LoopVectorizationCostModel::calculateRegisterUsage() {      TransposeEnds[it->second].push_back(it->first);    SmallSet<Instruction*, 8> OpenIntervals; -  unsigned MaxUsage = 0; +  // Get the size of the widest register. +  unsigned MaxSafeDepDist = -1U; +  if (Legal->getMaxSafeDepDistBytes() != -1U) +    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; +  unsigned WidestRegister = +      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); +  const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + +  SmallVector<RegisterUsage, 8> RUs(VFs.size()); +  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);    DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + +  // A lambda that gets the register usage for the given type and VF. +  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { +    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); +    return std::max<unsigned>(1, VF * TypeSize / WidestRegister); +  }; +    for (unsigned int i = 0; i < Index; ++i) {      Instruction *I = IdxToInstr[i];      // Ignore instructions that are never used within the loop.      if (!Ends.count(I)) continue; -    // Ignore ephemeral values. -    if (EphValues.count(I)) -      continue; -      // Remove all of the instructions that end at this location.      InstrList &List = TransposeEnds[i]; -    for (unsigned int j=0, e = List.size(); j < e; ++j) +    for (unsigned int j = 0, e = List.size(); j < e; ++j)        OpenIntervals.erase(List[j]); -    // Count the number of live interals. -    MaxUsage = std::max(MaxUsage, OpenIntervals.size()); +    // Skip ignored values. +    if (ValuesToIgnore.count(I)) +      continue; -    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << -          OpenIntervals.size() << '\n'); +    // For each VF find the maximum usage of registers. +    for (unsigned j = 0, e = VFs.size(); j < e; ++j) { +      if (VFs[j] == 1) { +        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); +        continue; +      } + +      // Count the number of live intervals. +      unsigned RegUsage = 0; +      for (auto Inst : OpenIntervals) { +        // Skip ignored values for VF > 1. +        if (VecValuesToIgnore.count(Inst)) +          continue; +        RegUsage += GetRegUsage(Inst->getType(), VFs[j]); +      } +      MaxUsages[j] = std::max(MaxUsages[j], RegUsage); +    } + +    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " +                 << OpenIntervals.size() << '\n');      // Add the current instruction to the list of open intervals.      
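calculateRegisterUsage now takes a list of candidate VFs and returns one RegisterUsage record per VF; a live value is charged max(1, VF * TypeSize / WidestRegister) registers, and the pressure at a program point is the sum over the open intervals. A standalone sketch of that estimate (types are represented by their scalar bit width only; the interval bookkeeping is elided):

#include <algorithm>
#include <cstdio>
#include <vector>

// Registers needed to hold a <VF x Ty> value when vector registers are
// WidestRegister bits wide (the GetRegUsage lambda from the patch).
unsigned getRegUsage(unsigned TypeSizeBits, unsigned VF,
                     unsigned WidestRegister) {
  return std::max<unsigned>(1, VF * TypeSizeBits / WidestRegister);
}

// Sum the usage of all simultaneously live values ("open intervals") to
// get the local register pressure for one candidate VF.
unsigned pressureForVF(const std::vector<unsigned> &OpenIntervalTypeSizes,
                       unsigned VF, unsigned WidestRegister) {
  unsigned RegUsage = 0;
  for (unsigned SizeBits : OpenIntervalTypeSizes)
    RegUsage += getRegUsage(SizeBits, VF, WidestRegister);
  return RegUsage;
}

int main() {
  // Three live values of type i32, i32 and i8, 128-bit vector registers.
  std::vector<unsigned> Open = {32, 32, 8};
  for (unsigned VF : {4u, 8u, 16u})
    std::printf("VF=%2u -> %u vector registers\n", VF,
                pressureForVF(Open, VF, 128)); // 3, 5 and 9 registers
  return 0;
}

For VF == 1 the patch keeps the old behaviour and simply counts the open intervals, which is why MaxUsages[j] is special-cased in the loop above.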
OpenIntervals.insert(I);    } -  unsigned Invariant = LoopInvariants.size(); -  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); -  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); -  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); +  for (unsigned i = 0, e = VFs.size(); i < e; ++i) { +    unsigned Invariant = 0; +    if (VFs[i] == 1) +      Invariant = LoopInvariants.size(); +    else { +      for (auto Inst : LoopInvariants) +        Invariant += GetRegUsage(Inst->getType(), VFs[i]); +    } + +    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] <<  '\n'); +    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); +    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); +    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); -  R.LoopInvariantRegs = Invariant; -  R.MaxLocalUsers = MaxUsage; -  return R; +    RU.LoopInvariantRegs = Invariant; +    RU.MaxLocalUsers = MaxUsages[i]; +    RUs[i] = RU; +  } + +  return RUs;  }  unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { @@ -4900,11 +5235,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {        if (isa<DbgInfoIntrinsic>(it))          continue; -      // Ignore ephemeral values. -      if (EphValues.count(it)) +      // Skip ignored values. +      if (ValuesToIgnore.count(&*it))          continue; -      unsigned C = getInstructionCost(it, VF); +      unsigned C = getInstructionCost(&*it, VF);        // Check if we should override the cost.        if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -4969,7 +5304,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,    if (!C)      return true; -  const APInt &APStepVal = C->getValue()->getValue(); +  const APInt &APStepVal = C->getAPInt();    // Huge step value - give up.    if (APStepVal.getBitWidth() > 64) @@ -4981,9 +5316,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr,  }  static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { -  if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) -    return true; -  return false; +  return Legal->hasStride(I->getOperand(0)) || +         Legal->hasStride(I->getOperand(1));  }  unsigned @@ -4994,7 +5328,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {      VF = 1;    Type *RetTy = I->getType(); +  if (VF > 1 && MinBWs.count(I)) +    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);    Type *VectorTy = ToVectorTy(RetTy, VF); +  auto SE = PSE.getSE();    // TODO: We need to estimate the cost of intrinsic calls.    
switch (I->getOpcode()) { @@ -5076,6 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {    case Instruction::ICmp:    case Instruction::FCmp: {      Type *ValTy = I->getOperand(0)->getType(); +    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); +    auto It = MinBWs.find(Op0AsInstruction); +    if (VF > 1 && It != MinBWs.end()) +      ValTy = IntegerType::get(ValTy->getContext(), It->second);      VectorTy = ToVectorTy(ValTy, VF);      return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);    } @@ -5199,8 +5540,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {          Legal->isInductionVariable(I->getOperand(0)))        return TTI.getCastInstrCost(I->getOpcode(), I->getType(),                                    I->getOperand(0)->getType()); - -    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); +     +    Type *SrcScalarTy = I->getOperand(0)->getType(); +    Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); +    if (VF > 1 && MinBWs.count(I)) { +      // This cast is going to be shrunk. This may remove the cast or it might +      // turn it into slightly different cast. For example, if MinBW == 16, +      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". +      // +      // Calculate the modified src and dest types. +      Type *MinVecTy = VectorTy; +      if (I->getOpcode() == Instruction::Trunc) { +        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); +        VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), +                                            MinVecTy); +      } else if (I->getOpcode() == Instruction::ZExt || +                 I->getOpcode() == Instruction::SExt) { +        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); +        VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), +                                             MinVecTy); +      } +    } +          return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);    }    case Instruction::Call: { @@ -5240,15 +5601,18 @@ char LoopVectorize::ID = 0;  static const char lv_name[] = "Loop Vectorization";  INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LCSSA)  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits)  INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)  namespace llvm { @@ -5269,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {    return false;  } +void LoopVectorizationCostModel::collectValuesToIgnore() { +  // Ignore ephemeral values. +  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + +  // Ignore type-promoting instructions we identified during reduction +  // detection. 
+  for (auto &Reduction : *Legal->getReductionVars()) { +    RecurrenceDescriptor &RedDes = Reduction.second; +    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); +    VecValuesToIgnore.insert(Casts.begin(), Casts.end()); +  } + +  // Ignore induction phis that are only used in either GetElementPtr or ICmp +  // instruction to exit loop. Induction variables usually have large types and +  // can have big impact when estimating register usage. +  // This is for when VF > 1. +  for (auto &Induction : *Legal->getInductionVars()) { +    auto *PN = Induction.first; +    auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + +    // Check that the PHI is only used by the induction increment (UpdateV) or +    // by GEPs. Then check that UpdateV is only used by a compare instruction or +    // the loop header PHI. +    // FIXME: Need precise def-use analysis to determine if this instruction +    // variable will be vectorized. +    if (std::all_of(PN->user_begin(), PN->user_end(), +                    [&](const User *U) -> bool { +                      return U == UpdateV || isa<GetElementPtrInst>(U); +                    }) && +        std::all_of(UpdateV->user_begin(), UpdateV->user_end(), +                    [&](const User *U) -> bool { +                      return U == PN || isa<ICmpInst>(U); +                    })) { +      VecValuesToIgnore.insert(PN); +      VecValuesToIgnore.insert(UpdateV); +    } +  } + +  // Ignore instructions that will not be vectorized. +  // This is for when VF > 1. +  for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; +       ++bb) { +    for (auto &Inst : **bb) { +      switch (Inst.getOpcode()) { +      case Instruction::GetElementPtr: { +        // Ignore GEP if its last operand is an induction variable so that it is +        // a consecutive load/store and won't be vectorized as scatter/gather +        // pattern. + +        GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); +        unsigned NumOperands = Gep->getNumOperands(); +        unsigned InductionOperand = getGEPInductionOperand(Gep); +        bool GepToIgnore = true; + +        // Check that all of the gep indices are uniform except for the +        // induction operand. +        for (unsigned i = 0; i != NumOperands; ++i) { +          if (i != InductionOperand && +              !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), +                                            TheLoop)) { +            GepToIgnore = false; +            break; +          } +        } + +        if (GepToIgnore) +          VecValuesToIgnore.insert(&Inst); +        break; +      } +      } +    } +  } +}  void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,                                               bool IfPredicateStore) { @@ -5316,19 +5753,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,    // Create a new entry in the WidenMap and initialize it to Undef or Null.    
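collectValuesToIgnore drops an induction PHI and its increment from the VF > 1 register-usage estimate when the PHI only feeds its increment or address computations and the increment only feeds the PHI or the exit compare, since such a variable is not expected to end up in a vector register. A toy version of that use-pattern test (FakeValue and its flags are invented for this example; the real code walks llvm::User lists with isa<>):

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy stand-in for a Value with a user list; the flags say what kind of
// instruction each user is.
struct FakeValue {
  const char *Name;
  std::vector<const FakeValue *> Users;
  bool IsGEP;
  bool IsICmp;
};

// Mirrors the two std::all_of checks in collectValuesToIgnore().
bool canIgnoreInduction(const FakeValue &Phi, const FakeValue &Update) {
  bool PhiOnlyFeedsGEPs =
      std::all_of(Phi.Users.begin(), Phi.Users.end(),
                  [&](const FakeValue *U) { return U == &Update || U->IsGEP; });
  bool UpdateOnlyFeedsCmp =
      std::all_of(Update.Users.begin(), Update.Users.end(),
                  [&](const FakeValue *U) { return U == &Phi || U->IsICmp; });
  return PhiOnlyFeedsGEPs && UpdateOnlyFeedsCmp;
}

int main() {
  FakeValue Gep{"gep", {}, true, false};
  FakeValue Cmp{"cmp", {}, false, true};
  FakeValue Phi{"iv", {}, false, false};
  FakeValue Inc{"iv.next", {}, false, false};
  Phi.Users = {&Inc, &Gep}; // the IV is used by its increment and a GEP
  Inc.Users = {&Phi, &Cmp}; // the increment feeds the PHI and the exit compare
  std::printf("ignore induction: %s\n",
              canIgnoreInduction(Phi, Inc) ? "yes" : "no"); // yes
  return 0;
}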
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); -  Instruction *InsertPt = Builder.GetInsertPoint(); -  BasicBlock *IfBlock = Builder.GetInsertBlock(); -  BasicBlock *CondBlock = nullptr; -    VectorParts Cond; -  Loop *VectorLp = nullptr;    if (IfPredicateStore) {      assert(Instr->getParent()->getSinglePredecessor() &&             "Only support single predecessor blocks");      Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),                            Instr->getParent()); -    VectorLp = LI->getLoopFor(IfBlock); -    assert(VectorLp && "Must have a loop for this block");    }    // For each vector unroll 'part': @@ -5343,11 +5773,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,              Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],                                 ConstantInt::get(Cond[Part]->getType(), 1)); -      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); -      LoopVectorBody.push_back(CondBlock); -      VectorLp->addBasicBlockToLoop(CondBlock, *LI); -      // Update Builder with newly created basic block. -      Builder.SetInsertPoint(InsertPt);      }      Instruction *Cloned = Instr->clone(); @@ -5367,16 +5792,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,        if (!IsVoidRetTy)          VecResults[Part] = Cloned; -    // End if-block. -      if (IfPredicateStore) { -        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); -        LoopVectorBody.push_back(NewIfBlock); -        VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); -        Builder.SetInsertPoint(InsertPt); -        ReplaceInstWithInst(IfBlock->getTerminator(), -                            BranchInst::Create(CondBlock, NewIfBlock, Cmp)); -        IfBlock = NewIfBlock; -      } +      // End if-block. +      if (IfPredicateStore) +        PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), +                                                  Cmp));    }  } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b180c976c233..9ed44d1e0cb8 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22,6 +22,7 @@  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CodeMetrics.h"  #include "llvm/Analysis/LoopInfo.h" @@ -61,7 +62,7 @@ static cl::opt<int>                                "number "));  static cl::opt<bool> -ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, +ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,                     cl::desc("Attempt to vectorize horizontal reductions"));  static cl::opt<bool> ShouldStartVectorizeHorAtStore( @@ -73,6 +74,14 @@ static cl::opt<int>  MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,      cl::desc("Attempt to vectorize for this register size in bits")); +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. 
+static cl::opt<int> +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, +    cl::desc("Limit the size of the SLP scheduling region per block")); +  namespace {  // FIXME: Set this via cl::opt to allow overriding. @@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10;  // This limit is useful for very large basic blocks.  static const unsigned MaxMemDepDistance = 160; +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; +  /// \brief Predicate for the element types that the SLP vectorizer supports.  ///  /// The most important thing to filter here are types which are invalid in LLVM @@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) {  /// of an alternate sequence which can later be merged as  /// a ShuffleVector instruction.  static bool canCombineAsAltInst(unsigned Op) { -  if (Op == Instruction::FAdd || Op == Instruction::FSub || -      Op == Instruction::Sub || Op == Instruction::Add) -    return true; -  return false; +  return Op == Instruction::FAdd || Op == Instruction::FSub || +         Op == Instruction::Sub || Op == Instruction::Add;  } -/// \returns ShuffleVector instruction if intructions in \p VL have +/// \returns ShuffleVector instruction if instructions in \p VL have  ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.  /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)  static unsigned isAltInst(ArrayRef<Value *> VL) { @@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {        case LLVMContext::MD_fpmath:          MD = MDNode::getMostGenericFPMath(MD, IMD);          break; +      case LLVMContext::MD_nontemporal: +        MD = MDNode::intersect(MD, IMD); +        break;        }      }      I->setMetadata(Kind, MD); @@ -393,7 +407,7 @@ public:    /// \brief Perform LICM and CSE on the newly generated gather sequences.    void optimizeGatherSequence(); -  /// \returns true if it is benefitial to reverse the vector order. +  /// \returns true if it is beneficial to reverse the vector order.    bool shouldReorder() const {      return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;    } @@ -441,7 +455,7 @@ private:    /// \returns a vector from a collection of scalars in \p VL.    Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); -  /// \returns whether the VectorizableTree is fully vectoriable and will +  /// \returns whether the VectorizableTree is fully vectorizable and will    /// be beneficial even the tree height is tiny.    bool isFullyVectorizableTinyTree(); @@ -506,7 +520,7 @@ private:    /// This POD struct describes one external user in the vectorized tree.    struct ExternalUser {      ExternalUser (Value *S, llvm::User *U, int L) : -      Scalar(S), User(U), Lane(L){}; +      Scalar(S), User(U), Lane(L){}      // Which scalar in our function.      Value *Scalar;      // Which user that uses the scalar. @@ -717,6 +731,8 @@ private:          : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),            ScheduleStart(nullptr), ScheduleEnd(nullptr),            FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), +          ScheduleRegionSize(0), +          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),            // Make sure that the initial SchedulingRegionID is greater than the            // initial SchedulingRegionID in ScheduleData (which is 0).            
SchedulingRegionID(1) {} @@ -728,6 +744,13 @@ private:        FirstLoadStoreInRegion = nullptr;        LastLoadStoreInRegion = nullptr; +      // Reduce the maximum schedule region size by the size of the +      // previous scheduling run. +      ScheduleRegionSizeLimit -= ScheduleRegionSize; +      if (ScheduleRegionSizeLimit < MinScheduleRegionSize) +        ScheduleRegionSizeLimit = MinScheduleRegionSize; +      ScheduleRegionSize = 0; +        // Make a new scheduling region, i.e. all existing ScheduleData is not        // in the new region yet.        ++SchedulingRegionID; @@ -804,7 +827,8 @@ private:      void cancelScheduling(ArrayRef<Value *> VL);      /// Extends the scheduling region so that V is inside the region. -    void extendSchedulingRegion(Value *V); +    /// \returns true if the region size is within the limit. +    bool extendSchedulingRegion(Value *V);      /// Initialize the ScheduleData structures for new instructions in the      /// scheduling region. @@ -858,6 +882,12 @@ private:      /// (can be null).      ScheduleData *LastLoadStoreInRegion; +    /// The current size of the scheduling region. +    int ScheduleRegionSize; +     +    /// The maximum size allowed for the scheduling region. +    int ScheduleRegionSizeLimit; +      /// The ID of the scheduling region. For a new vectorization iteration this      /// is incremented which "removes" all ScheduleData from the region.      int SchedulingRegionID; @@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {    if (!BS.tryScheduleBundle(VL, this)) {      DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); -    BS.cancelScheduling(VL); +    assert((!BS.getScheduleData(VL[0]) || +            !BS.getScheduleData(VL[0])->isPartOfBundle()) && +           "tryScheduleBundle should cancelScheduling on failure");      newTreeEntry(VL, false);      return;    } @@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {        return;      }      case Instruction::Load: { +      // Check that a vectorized load would load the same memory as a scalar +      // load. +      // For example we don't want vectorize loads that are smaller than 8 bit. +      // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats +      // loading/storing it as an i8 struct. If we vectorize loads/stores from +      // such a struct we read/write packed bits disagreeing with the +      // unvectorized version. +      const DataLayout &DL = F->getParent()->getDataLayout(); +      Type *ScalarTy = VL[0]->getType(); + +      if (DL.getTypeSizeInBits(ScalarTy) != +          DL.getTypeAllocSizeInBits(ScalarTy)) { +        BS.cancelScheduling(VL); +        newTreeEntry(VL, false); +        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); +        return; +      }        // Check if the loads are consecutive or of we need to swizzle them.        
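The new load check rejects bundles whose scalar type occupies fewer bits than its allocation size (the i2 example in the comment), because a wide vector load of such elements would touch different bytes than the original scalar loads. A toy illustration of the size-vs-alloc-size comparison, with the alloc size approximated as the bit width rounded up to whole bytes (the real values come from DataLayout and also reflect alignment):

#include <cstdio>

// Rough stand-ins for DataLayout::getTypeSizeInBits and
// DataLayout::getTypeAllocSizeInBits for small integer types: the alloc
// size is the bit width rounded up to a whole number of bytes.
unsigned typeSizeInBits(unsigned BitWidth) { return BitWidth; }
unsigned typeAllocSizeInBits(unsigned BitWidth) {
  return ((BitWidth + 7) / 8) * 8;
}

// A load bundle is only turned into a vector load when both sizes agree.
bool vectorLoadReadsSameBytes(unsigned BitWidth) {
  return typeSizeInBits(BitWidth) == typeAllocSizeInBits(BitWidth);
}

int main() {
  for (unsigned BW : {2u, 8u, 32u})
    std::printf("i%-2u: %s\n", BW,
                vectorLoadReadsSameBytes(BW) ? "sizes match, can vectorize"
                                             : "sizes differ, gather instead");
  return 0; // i2 differs (2 vs 8 bits); i8 and i32 match
}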
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {          LoadInst *L = cast<LoadInst>(VL[i]); @@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {            DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");            return;          } -        const DataLayout &DL = F->getParent()->getDataLayout(); +          if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {            if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {              ++NumLoadsWantToChangeOrder; @@ -1690,7 +1739,8 @@ int BoUpSLP::getSpillCost() {      }          // Now find the sequence of instructions between PrevInst and Inst. -    BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); +    BasicBlock::reverse_iterator InstIt(Inst->getIterator()), +        PrevInstIt(PrevInst->getIterator());      --PrevInstIt;      while (InstIt != PrevInstIt) {        if (PrevInstIt == PrevInst->getParent()->rend()) { @@ -1890,106 +1940,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,    }  } +// Return true if I should be commuted before adding it's left and right +// operands to the arrays Left and Right. +// +// The vectorizer is trying to either have all elements one side being +// instruction with the same opcode to enable further vectorization, or having +// a splat to lower the vectorizing cost. +static bool shouldReorderOperands(int i, Instruction &I, +                                  SmallVectorImpl<Value *> &Left, +                                  SmallVectorImpl<Value *> &Right, +                                  bool AllSameOpcodeLeft, +                                  bool AllSameOpcodeRight, bool SplatLeft, +                                  bool SplatRight) { +  Value *VLeft = I.getOperand(0); +  Value *VRight = I.getOperand(1); +  // If we have "SplatRight", try to see if commuting is needed to preserve it. +  if (SplatRight) { +    if (VRight == Right[i - 1]) +      // Preserve SplatRight +      return false; +    if (VLeft == Right[i - 1]) { +      // Commuting would preserve SplatRight, but we don't want to break +      // SplatLeft either, i.e. preserve the original order if possible. +      // (FIXME: why do we care?) +      if (SplatLeft && VLeft == Left[i - 1]) +        return false; +      return true; +    } +  } +  // Symmetrically handle Right side. +  if (SplatLeft) { +    if (VLeft == Left[i - 1]) +      // Preserve SplatLeft +      return false; +    if (VRight == Left[i - 1]) +      return true; +  } + +  Instruction *ILeft = dyn_cast<Instruction>(VLeft); +  Instruction *IRight = dyn_cast<Instruction>(VRight); + +  // If we have "AllSameOpcodeRight", try to see if the left operands preserves +  // it and not the right, in this case we want to commute. +  if (AllSameOpcodeRight) { +    unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); +    if (IRight && RightPrevOpcode == IRight->getOpcode()) +      // Do not commute, a match on the right preserves AllSameOpcodeRight +      return false; +    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { +      // We have a match and may want to commute, but first check if there is +      // not also a match on the existing operands on the Left to preserve +      // AllSameOpcodeLeft, i.e. preserve the original order if possible. +      // (FIXME: why do we care?) 
+      if (AllSameOpcodeLeft && ILeft && +          cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) +        return false; +      return true; +    } +  } +  // Symmetrically handle Left side. +  if (AllSameOpcodeLeft) { +    unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); +    if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) +      return false; +    if (IRight && LeftPrevOpcode == IRight->getOpcode()) +      return true; +  } +  return false; +} +  void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,                                               SmallVectorImpl<Value *> &Left,                                               SmallVectorImpl<Value *> &Right) { -  SmallVector<Value *, 16> OrigLeft, OrigRight; - -  bool AllSameOpcodeLeft = true; -  bool AllSameOpcodeRight = true; -  for (unsigned i = 0, e = VL.size(); i != e; ++i) { -    Instruction *I = cast<Instruction>(VL[i]); -    Value *VLeft = I->getOperand(0); -    Value *VRight = I->getOperand(1); - -    OrigLeft.push_back(VLeft); -    OrigRight.push_back(VRight); - -    Instruction *ILeft = dyn_cast<Instruction>(VLeft); -    Instruction *IRight = dyn_cast<Instruction>(VRight); - -    // Check whether all operands on one side have the same opcode. In this case -    // we want to preserve the original order and not make things worse by -    // reordering. -    if (i && AllSameOpcodeLeft && ILeft) { -      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { -        if (PLeft->getOpcode() != ILeft->getOpcode()) -          AllSameOpcodeLeft = false; -      } else -        AllSameOpcodeLeft = false; -    } -    if (i && AllSameOpcodeRight && IRight) { -      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { -        if (PRight->getOpcode() != IRight->getOpcode()) -          AllSameOpcodeRight = false; -      } else -        AllSameOpcodeRight = false; -    } - -    // Sort two opcodes. In the code below we try to preserve the ability to use -    // broadcast of values instead of individual inserts. -    // vl1 = load -    // vl2 = phi -    // vr1 = load -    // vr2 = vr2 -    //    = vl1 x vr1 -    //    = vl2 x vr2 -    // If we just sorted according to opcode we would leave the first line in -    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). -    //    = vl1 x vr1 -    //    = vr2 x vl2 -    // Because vr2 and vr1 are from the same load we loose the opportunity of a -    // broadcast for the packed right side in the backend: we have [vr1, vl2] -    // instead of [vr1, vr2=vr1]. -    if (ILeft && IRight) { -      if (!i && ILeft->getOpcode() > IRight->getOpcode()) { -        Left.push_back(IRight); -        Right.push_back(ILeft); -      } else if (i && ILeft->getOpcode() > IRight->getOpcode() && -                 Right[i - 1] != IRight) { -        // Try not to destroy a broad cast for no apparent benefit. -        Left.push_back(IRight); -        Right.push_back(ILeft); -      } else if (i && ILeft->getOpcode() == IRight->getOpcode() && -                 Right[i - 1] == ILeft) { -        // Try preserve broadcasts. -        Left.push_back(IRight); -        Right.push_back(ILeft); -      } else if (i && ILeft->getOpcode() == IRight->getOpcode() && -                 Left[i - 1] == IRight) { -        // Try preserve broadcasts. 
-        Left.push_back(IRight); -        Right.push_back(ILeft); -      } else { -        Left.push_back(ILeft); -        Right.push_back(IRight); -      } -      continue; -    } -    // One opcode, put the instruction on the right. -    if (ILeft) { -      Left.push_back(VRight); -      Right.push_back(ILeft); -      continue; -    } +  if (VL.size()) { +    // Peel the first iteration out of the loop since there's nothing +    // interesting to do anyway and it simplifies the checks in the loop. +    auto VLeft = cast<Instruction>(VL[0])->getOperand(0); +    auto VRight = cast<Instruction>(VL[0])->getOperand(1); +    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) +      // Favor having instruction to the right. FIXME: why? +      std::swap(VLeft, VRight);      Left.push_back(VLeft);      Right.push_back(VRight);    } -  bool LeftBroadcast = isSplat(Left); -  bool RightBroadcast = isSplat(Right); - -  // If operands end up being broadcast return this operand order. -  if (LeftBroadcast || RightBroadcast) -    return; +  // Keep track if we have instructions with all the same opcode on one side. +  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); +  bool AllSameOpcodeRight = isa<Instruction>(Right[0]); +  // Keep track if we have one side with all the same value (broadcast). +  bool SplatLeft = true; +  bool SplatRight = true; -  // Don't reorder if the operands where good to begin. -  if (AllSameOpcodeRight || AllSameOpcodeLeft) { -    Left = OrigLeft; -    Right = OrigRight; +  for (unsigned i = 1, e = VL.size(); i != e; ++i) { +    Instruction *I = cast<Instruction>(VL[i]); +    assert(I->isCommutative() && "Can only process commutative instruction"); +    // Commute to favor either a splat or maximizing having the same opcodes on +    // one side. +    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, +                              AllSameOpcodeRight, SplatLeft, SplatRight)) { +      Left.push_back(I->getOperand(1)); +      Right.push_back(I->getOperand(0)); +    } else { +      Left.push_back(I->getOperand(0)); +      Right.push_back(I->getOperand(1)); +    } +    // Update Splat* and AllSameOpcode* after the insertion. +    SplatRight = SplatRight && (Right[i - 1] == Right[i]); +    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); +    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && +                        (cast<Instruction>(Left[i - 1])->getOpcode() == +                         cast<Instruction>(Left[i])->getOpcode()); +    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && +                         (cast<Instruction>(Right[i - 1])->getOpcode() == +                          cast<Instruction>(Right[i])->getOpcode());    } +  // If one operand end up being broadcast, return this operand order. 
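The rewritten reorderInputsAccordingToOpcode decides lane by lane whether to commute a commutative instruction so that one operand vector stays a splat, or so that one side keeps a single opcode, instead of reconstructing OrigLeft/OrigRight afterwards. A reduced sketch of the splat-preserving half of shouldReorderOperands, with plain integers in place of Values (the same-opcode rules are omitted here):

#include <cstdio>
#include <utility>
#include <vector>

// Should lane i of (VLeft, VRight) be commuted so that the operands already
// placed in Right stay a splat (and symmetrically for Left)?
bool shouldReorderOperands(size_t i, int VLeft, int VRight,
                           const std::vector<int> &Left,
                           const std::vector<int> &Right,
                           bool SplatLeft, bool SplatRight) {
  if (SplatRight) {
    if (VRight == Right[i - 1])
      return false; // already preserves the right splat
    if (VLeft == Right[i - 1])
      // Commuting keeps the right splat; refuse only if that would break an
      // equally good left splat, i.e. keep the original order when possible.
      return !(SplatLeft && VLeft == Left[i - 1]);
  }
  if (SplatLeft) {
    if (VLeft == Left[i - 1])
      return false;
    if (VRight == Left[i - 1])
      return true;
  }
  return false;
}

int main() {
  // Operand pairs of a commutative op: (a, x) and (x, b). Commuting lane 1
  // makes the right-hand operand vector the splat {x, x}.
  std::vector<std::pair<int, int>> VL = {{1, 7}, {7, 2}};
  std::vector<int> Left = {VL[0].first}, Right = {VL[0].second};
  bool SplatLeft = true, SplatRight = true;
  for (size_t i = 1; i < VL.size(); ++i) {
    int L = VL[i].first, R = VL[i].second;
    if (shouldReorderOperands(i, L, R, Left, Right, SplatLeft, SplatRight))
      std::swap(L, R);
    Left.push_back(L);
    Right.push_back(R);
    SplatLeft = SplatLeft && Left[i] == Left[i - 1];
    SplatRight = SplatRight && Right[i] == Right[i - 1];
  }
  std::printf("Right operands form a splat: %s\n", SplatRight ? "yes" : "no");
  return 0; // yes
}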
+  if (SplatRight || SplatLeft) +    return; +    const DataLayout &DL = F->getParent()->getDataLayout();    // Finally check if we can get longer vectorizable chain by reordering @@ -2030,7 +2100,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,  void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {    Instruction *VL0 = cast<Instruction>(VL[0]); -  BasicBlock::iterator NextInst = VL0; +  BasicBlock::iterator NextInst(VL0);    ++NextInst;    Builder.SetInsertPoint(VL0->getParent(), NextInst);    Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); @@ -2487,7 +2557,7 @@ Value *BoUpSLP::vectorizeTree() {      scheduleBlock(BSIter.second.get());    } -  Builder.SetInsertPoint(F->getEntryBlock().begin()); +  Builder.SetInsertPoint(&F->getEntryBlock().front());    vectorizeTree(&VectorizableTree[0]);    DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2532,7 +2602,7 @@ Value *BoUpSLP::vectorizeTree() {          User->replaceUsesOfWith(Scalar, Ex);       }      } else { -      Builder.SetInsertPoint(F->getEntryBlock().begin()); +      Builder.SetInsertPoint(&F->getEntryBlock().front());        Value *Ex = Builder.CreateExtractElement(Vec, Lane);        CSEBlocks.insert(&F->getEntryBlock());        User->replaceUsesOfWith(Scalar, Ex); @@ -2641,7 +2711,7 @@ void BoUpSLP::optimizeGatherSequence() {      BasicBlock *BB = (*I)->getBlock();      // For all instructions in blocks containing gather sequences:      for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { -      Instruction *In = it++; +      Instruction *In = &*it++;        if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))          continue; @@ -2681,8 +2751,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,    ScheduleData *Bundle = nullptr;    bool ReSchedule = false;    DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n"); + +  // Make sure that the scheduling region contains all +  // instructions of the bundle. 
+  for (Value *V : VL) { +    if (!extendSchedulingRegion(V)) +      return false; +  } +    for (Value *V : VL) { -    extendSchedulingRegion(V);      ScheduleData *BundleMember = getScheduleData(V);      assert(BundleMember &&             "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2743,7 +2820,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,        schedule(pickedSD, ReadyInsts);      }    } -  return Bundle->isReady(); +  if (!Bundle->isReady()) { +    cancelScheduling(VL); +    return false; +  } +  return true;  }  void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { @@ -2772,9 +2853,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {    }  } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {    if (getScheduleData(V)) -    return; +    return true;    Instruction *I = dyn_cast<Instruction>(V);    assert(I && "bundle member must be an instruction");    assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); @@ -2785,21 +2866,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {      ScheduleEnd = I->getNextNode();      assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");      DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n"); -    return; +    return true;    }    // Search up and down at the same time, because we don't know if the new    // instruction is above or below the existing scheduling region. -  BasicBlock::reverse_iterator UpIter(ScheduleStart); +  BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());    BasicBlock::reverse_iterator UpperEnd = BB->rend();    BasicBlock::iterator DownIter(ScheduleEnd);    BasicBlock::iterator LowerEnd = BB->end();    for (;;) { +    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { +      DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n"); +      return false; +    } +      if (UpIter != UpperEnd) {        if (&*UpIter == I) {          initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);          ScheduleStart = I;          DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n"); -        return; +        return true;        }        UpIter++;      } @@ -2810,13 +2896,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {          ScheduleEnd = I->getNextNode();          assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");          DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n"); -        return; +        return true;        }        DownIter++;      }      assert((UpIter != UpperEnd || DownIter != LowerEnd) &&             "instruction not found in block");    } +  return true;  }  void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, @@ -2896,8 +2983,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,              }            } else {              // I'm not sure if this can ever happen. But we need to be safe. -            // This lets the instruction/bundle never be scheduled and eventally -            // disable vectorization. +            // This lets the instruction/bundle never be scheduled and +            // eventually disable vectorization.              
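tryScheduleBundle now grows the scheduling region for every bundle member before doing anything else, and extendSchedulingRegion refuses to grow once the new per-block budget is spent. A small sketch of just that bookkeeping (the constants mirror the slp-schedule-budget defaults; all actual scheduling detail is simplified away):

#include <cstdio>

// Defaults from the patch: per-block budget, and the floor that keeps small
// regions schedulable even after the budget is exhausted.
static const int ScheduleRegionSizeBudget = 100000;
static const int MinScheduleRegionSize = 16;

struct BlockScheduling {
  int ScheduleRegionSize = 0;
  int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

  // One search step in extendSchedulingRegion(): growing costs one unit,
  // and growth past the current limit fails the whole bundle.
  bool growRegionByOne() {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      std::printf("SLP: exceeded schedule region size limit\n");
      return false;
    }
    return true;
  }

  // Starting a new region: shrink the remaining budget by what the previous
  // region used, but never below the minimum region size.
  void resetForNewRegion() {
    ScheduleRegionSizeLimit -= ScheduleRegionSize;
    if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
      ScheduleRegionSizeLimit = MinScheduleRegionSize;
    ScheduleRegionSize = 0;
  }
};

int main() {
  BlockScheduling BS;
  for (int i = 0; i < 99990; ++i) // pretend the first region was huge
    BS.growRegionByOne();
  BS.resetForNewRegion();
  std::printf("remaining limit = %d\n", BS.ScheduleRegionSizeLimit); // 16
  return 0;
}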
BundleMember->Dependencies++;              BundleMember->incrementUnscheduledDeps(1);            } @@ -3003,7 +3090,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {    };    std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; -  // Ensure that all depencency data is updated and fill the ready-list with +  // Ensure that all dependency data is updated and fill the ready-list with    // initial instructions.    int Idx = 0;    int NumToSchedule = 0; @@ -3035,7 +3122,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {        Instruction *pickedInst = BundleMember->Inst;        if (LastScheduledInst->getNextNode() != pickedInst) {          BS->BB->getInstList().remove(pickedInst); -        BS->BB->getInstList().insert(LastScheduledInst, pickedInst); +        BS->BB->getInstList().insert(LastScheduledInst->getIterator(), +                                     pickedInst);        }        LastScheduledInst = pickedInst;        BundleMember = BundleMember->NextInBundle; @@ -3074,11 +3162,11 @@ struct SLPVectorizer : public FunctionPass {      if (skipOptnoneFunction(F))        return false; -    SE = &getAnalysis<ScalarEvolution>(); +    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();      TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);      auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();      TLI = TLIP ? &TLIP->getTLI() : nullptr; -    AA = &getAnalysis<AliasAnalysis>(); +    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();      AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -3139,13 +3227,15 @@ struct SLPVectorizer : public FunctionPass {    void getAnalysisUsage(AnalysisUsage &AU) const override {      FunctionPass::getAnalysisUsage(AU);      AU.addRequired<AssumptionCacheTracker>(); -    AU.addRequired<ScalarEvolution>(); -    AU.addRequired<AliasAnalysis>(); +    AU.addRequired<ScalarEvolutionWrapperPass>(); +    AU.addRequired<AAResultsWrapperPass>();      AU.addRequired<TargetTransformInfoWrapperPass>();      AU.addRequired<LoopInfoWrapperPass>();      AU.addRequired<DominatorTreeWrapperPass>();      AU.addPreserved<LoopInfoWrapperPass>();      AU.addPreserved<DominatorTreeWrapperPass>(); +    AU.addPreserved<AAResultsWrapperPass>(); +    AU.addPreserved<GlobalsAAWrapperPass>();      AU.setPreservesCFG();    } @@ -3260,15 +3350,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,    // Do a quadratic search on all of the given stores and find    // all of the pairs of stores that follow each other. +  SmallVector<unsigned, 16> IndexQueue;    for (unsigned i = 0, e = Stores.size(); i < e; ++i) { -    for (unsigned j = 0; j < e; ++j) { -      if (i == j) -        continue; -      const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); -      if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { -        Tails.insert(Stores[j]); +    const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); +    IndexQueue.clear(); +    // If a store has multiple consecutive store candidates, search Stores +    // array according to the sequence: from i+1 to e, then from i-1 to 0. +    // This is because usually pairing with immediate succeeding or preceding +    // candidate create the best chance to find slp vectorization opportunity. 
+    unsigned j = 0; +    for (j = i + 1; j < e; ++j) +      IndexQueue.push_back(j); +    for (j = i; j > 0; --j) +      IndexQueue.push_back(j - 1); + +    for (auto &k : IndexQueue) { +      if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { +        Tails.insert(Stores[k]);          Heads.insert(Stores[i]); -        ConsecutiveChain[Stores[i]] = Stores[j]; +        ConsecutiveChain[Stores[i]] = Stores[k]; +        break;        }      }    } @@ -3428,7 +3529,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,          unsigned VecIdx = 0;          for (auto &V : BuildVectorSlice) {            IRBuilder<true, NoFolder> Builder( -              ++BasicBlock::iterator(InsertAfter)); +              InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));            InsertElementInst *IE = cast<InsertElementInst>(V);            Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(                VectorizedRoot, Builder.getInt32(VecIdx++))); @@ -3552,16 +3653,17 @@ class HorizontalReduction {    unsigned ReductionOpcode;    /// The opcode of the values we perform a reduction on.    unsigned ReducedValueOpcode; -  /// The width of one full horizontal reduction operation. -  unsigned ReduxWidth;    /// Should we model this reduction as a pairwise reduction tree or a tree that    /// splits the vector in halves and adds those halves.    bool IsPairwiseReduction;  public: +  /// The width of one full horizontal reduction operation. +  unsigned ReduxWidth; +    HorizontalReduction()      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), -    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} +    ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}    /// \brief Try to find a reduction tree.    bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -3607,11 +3709,11 @@ public:        return false;      // Post order traverse the reduction tree starting at B. We only handle true -    // trees containing only binary operators. -    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; +    // trees containing only binary operators or selects. +    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;      Stack.push_back(std::make_pair(B, 0));      while (!Stack.empty()) { -      BinaryOperator *TreeN = Stack.back().first; +      Instruction *TreeN = Stack.back().first;        unsigned EdgeToVist = Stack.back().second++;        bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; @@ -3647,9 +3749,10 @@ public:        // Visit left or right.        Value *NextV = TreeN->getOperand(EdgeToVist); -      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV); -      if (Next) -        Stack.push_back(std::make_pair(Next, 0)); +      // We currently only allow BinaryOperator's and SelectInst's as reduction +      // values in our tree. +      if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV)) +        Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));        else if (NextV != Phi)          return false;      } @@ -3717,9 +3820,12 @@ public:      return VectorizedTree != nullptr;    } -private: +  unsigned numReductionValues() const { +    return ReducedVals.size(); +  } -  /// \brief Calcuate the cost of a reduction. +private: +  /// \brief Calculate the cost of a reduction.    
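vectorizeStores used to try every other store j as a partner for store i; it now probes candidates in the order i+1 ... e-1 and then i-1 ... 0 and stops at the first consecutive match, since the immediately following or preceding store is the most likely partner. A sketch of that index ordering and early exit (the consecutiveness test is a stand-in lambda over byte offsets, not LoopAccessAnalysis' isConsecutiveAccess):

#include <cstdio>
#include <functional>
#include <vector>

// Probe order used in vectorizeStores(): stores after i (nearest first),
// then stores before i (nearest first); return the first accepted index.
int findConsecutivePartner(unsigned i, unsigned e,
                           const std::function<bool(unsigned, unsigned)> &IsConsecutive) {
  std::vector<unsigned> IndexQueue;
  for (unsigned j = i + 1; j < e; ++j)
    IndexQueue.push_back(j);
  for (unsigned j = i; j > 0; --j)
    IndexQueue.push_back(j - 1);

  for (unsigned k : IndexQueue)
    if (IsConsecutive(i, k))
      return (int)k;
  return -1;
}

int main() {
  // Stores at these byte offsets; store k follows store i when it starts
  // exactly 4 bytes later (illustrative i32 stores).
  std::vector<unsigned> Offset = {0, 8, 4, 16};
  auto IsConsecutive = [&](unsigned i, unsigned k) {
    return Offset[k] == Offset[i] + 4;
  };
  int Partner =
      findConsecutivePartner(0, (unsigned)Offset.size(), IsConsecutive);
  std::printf("partner of store 0: %d\n", Partner); // 2
  return 0;
}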
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {      Type *ScalarTy = FirstReducedVal->getType();      Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -3825,6 +3931,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) {    return V->getType() < V2->getType();  } +/// \brief Try and get a reduction value from a phi node. +/// +/// Given a phi node \p P in a block \p ParentBB, consider possible reductions +/// if they come from either \p ParentBB or a containing loop latch. +/// +/// \returns A candidate reduction value if possible, or \code nullptr \endcode +/// if not possible. +static Value *getReductionValue(const DominatorTree *DT, PHINode *P, +                                BasicBlock *ParentBB, LoopInfo *LI) { +  // There are situations where the reduction value is not dominated by the +  // reduction phi. Vectorizing such cases has been reported to cause +  // miscompiles. See PR25787. +  auto DominatedReduxValue = [&](Value *R) { +    return ( +        dyn_cast<Instruction>(R) && +        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent())); +  }; + +  Value *Rdx = nullptr; + +  // Return the incoming value if it comes from the same BB as the phi node. +  if (P->getIncomingBlock(0) == ParentBB) { +    Rdx = P->getIncomingValue(0); +  } else if (P->getIncomingBlock(1) == ParentBB) { +    Rdx = P->getIncomingValue(1); +  } + +  if (Rdx && DominatedReduxValue(Rdx)) +    return Rdx; + +  // Otherwise, check whether we have a loop latch to look at. +  Loop *BBL = LI->getLoopFor(ParentBB); +  if (!BBL) +    return nullptr; +  BasicBlock *BBLatch = BBL->getLoopLatch(); +  if (!BBLatch) +    return nullptr; + +  // There is a loop latch, return the incoming value if it comes from +  // that. This reduction pattern occassionaly turns up. +  if (P->getIncomingBlock(0) == BBLatch) { +    Rdx = P->getIncomingValue(0); +  } else if (P->getIncomingBlock(1) == BBLatch) { +    Rdx = P->getIncomingValue(1); +  } + +  if (Rdx && DominatedReduxValue(Rdx)) +    return Rdx; + +  return nullptr; +} + +/// \brief Attempt to reduce a horizontal reduction. +/// If it is legal to match a horizontal reduction feeding +/// the phi node P with reduction operators BI, then check if it +/// can be done. +/// \returns true if a horizontal reduction was matched and reduced. +/// \returns false if a horizontal reduction was not matched. +static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, +                                        BoUpSLP &R, TargetTransformInfo *TTI) { +  if (!ShouldVectorizeHor) +    return false; + +  HorizontalReduction HorRdx; +  if (!HorRdx.matchAssociativeReduction(P, BI)) +    return false; + +  // If there is a sufficient number of reduction values, reduce +  // to a nearby power-of-2. Can safely generate oversized +  // vectors and rely on the backend to split them to legal sizes. +  HorRdx.ReduxWidth = +    std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + +  return HorRdx.tryToReduce(R, TTI); +} +  bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {    bool Changed = false;    SmallVector<Value *, 4> Incoming; @@ -3881,7 +4063,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {      // We may go through BB multiple times so skip the one we have checked. 
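canMatchHorizontalReduction sizes the reduction vector as max(4, PowerOf2Floor(number of reduced values)), deliberately generating oversized vectors and relying on the backend to legalize them. A quick sketch of that width computation (powerOf2Floor is reimplemented here for the example rather than taken from llvm/Support/MathExtras.h):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Largest power of two <= N (0 for N == 0), standing in for
// llvm::PowerOf2Floor.
uint64_t powerOf2Floor(uint64_t N) {
  if (N == 0)
    return 0;
  uint64_t P = 1;
  while (P * 2 != 0 && P * 2 <= N)
    P *= 2;
  return P;
}

// Reduction width used by canMatchHorizontalReduction(): at least 4 lanes,
// otherwise the nearest power of two below the number of reduced values.
uint64_t reductionWidth(uint64_t NumReductionValues) {
  return std::max<uint64_t>(4, powerOf2Floor(NumReductionValues));
}

int main() {
  for (uint64_t N : {3, 8, 13, 70})
    std::printf("%d values -> ReduxWidth %d\n", (int)N, (int)reductionWidth(N));
  return 0; // 3 -> 4, 8 -> 8, 13 -> 8, 70 -> 64
}

getReductionValue, added in the same hunk, additionally allows the reduction value to come from the loop latch rather than the phi's own block, but only when it is dominated by the reduction phi (the PR25787 guard).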
-    if (!VisitedInstrs.insert(it).second) +    if (!VisitedInstrs.insert(&*it).second)        continue;      if (isa<DbgInfoIntrinsic>(it)) @@ -3892,20 +4074,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {        // Check that the PHI is a reduction PHI.        if (P->getNumIncomingValues() != 2)          return Changed; -      Value *Rdx = -          (P->getIncomingBlock(0) == BB -               ? (P->getIncomingValue(0)) -               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) -                                               : nullptr)); + +      Value *Rdx = getReductionValue(DT, P, BB, LI); +        // Check if this is a Binary Operator.        BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);        if (!BI)          continue;        // Try to match and vectorize a horizontal reduction. -      HorizontalReduction HorRdx; -      if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && -          HorRdx.tryToReduce(R, TTI)) { +      if (canMatchHorizontalReduction(P, BI, R, TTI)) {          Changed = true;          it = BB->begin();          e = BB->end(); @@ -3928,15 +4106,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {        continue;      } -    // Try to vectorize horizontal reductions feeding into a store.      if (ShouldStartVectorizeHorAtStore)        if (StoreInst *SI = dyn_cast<StoreInst>(it))          if (BinaryOperator *BinOp =                  dyn_cast<BinaryOperator>(SI->getValueOperand())) { -          HorizontalReduction HorRdx; -          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && -                HorRdx.tryToReduce(R, TTI)) || -               tryToVectorize(BinOp, R))) { +          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) || +              tryToVectorize(BinOp, R)) {              Changed = true;              it = BB->begin();              e = BB->end(); @@ -4037,10 +4212,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {  char SLPVectorizer::ID = 0;  static const char lv_name[] = "SLP Vectorizer";  INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  INITIALIZE_PASS_DEPENDENCY(LoopSimplify)  INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)  | 
