author     Dimitry Andric <dim@FreeBSD.org>    2017-06-16 21:03:24 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-16 21:03:24 +0000
commit     7c7aba6e5fef47a01a136be655b0a92cfd7090f6
tree       99ec531924f6078534b100ab9d7696abce848099    /lib/Transforms
parent     7ab83427af0f77b59941ceba41d509d7d097b065
Diffstat (limited to 'lib/Transforms')
25 files changed, 1342 insertions, 766 deletions
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index 1b111de061576..d94aa5da85601 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -95,6 +95,17 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
     }
   }
 
+  NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+  if (CfiFunctionsMD) {
+    for (auto Func : CfiFunctionsMD->operands()) {
+      assert(Func->getNumOperands() >= 2);
+      for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+        if (ConstantInt *TypeId =
+                extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+          TypeIds.insert(TypeId->getZExtValue());
+    }
+  }
+
   LLVMContext &Ctx = M.getContext();
   Constant *C = M.getOrInsertFunction(
       "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
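For reference, each "cfi.functions" operand has the shape {name, linkage, type[, type...]}. A minimal sketch of emitting one such entry, using the same APIs that ThinLTOBitcodeWriter.cpp uses later in this commit (the helper name is hypothetical; the usual LLVM IR headers are assumed):

  void addCfiFunctionEntry(Module &M, StringRef Name, MDNode *TypeId) {
    LLVMContext &Ctx = M.getContext();
    SmallVector<Metadata *, 4> Elts;
    Elts.push_back(MDString::get(Ctx, Name));        // operand 0: function name
    Elts.push_back(ConstantAsMetadata::get(          // operand 1: CfiFunctionLinkage
        ConstantInt::get(Type::getInt8Ty(Ctx), CFL_Definition)));
    Elts.push_back(TypeId);                          // operands 2..N: !type nodes
    M.getOrInsertNamedMetadata("cfi.functions")
        ->addOperand(MDTuple::get(Ctx, Elts));
  }

The loop added above simply walks operands 2..N of every entry and collects the numeric type ids.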
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index c0dfeede05c5a..ad89e40661c67 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -523,40 +523,47 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
       if (!Callee || Callee->isDeclaration())
         continue;
 
-      // If this call site is dead and it is to a readonly function, we should
-      // just delete the call instead of trying to inline it, regardless of
-      // size. This happens because IPSCCP propagates the result out of the
-      // call and then we're left with the dead call.
-      if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {
-        DEBUG(dbgs() << "    -> Deleting dead call: " << *CS.getInstruction()
-                     << "\n");
-        // Update the call graph by deleting the edge from Callee to Caller.
-        CG[Caller]->removeCallEdgeFor(CS);
-        CS.getInstruction()->eraseFromParent();
-        ++NumCallsDeleted;
-      } else {
+      Instruction *Instr = CS.getInstruction();
+
+      bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI);
+
+      int InlineHistoryID;
+      if (!IsTriviallyDead) {
         // If this call site was obtained by inlining another function, verify
         // that the include path for the function did not include the callee
         // itself. If so, we'd be recursively inlining the same function,
         // which would provide the same callsites, which would cause us to
         // infinitely inline.
-        int InlineHistoryID = CallSites[CSi].second;
+        InlineHistoryID = CallSites[CSi].second;
         if (InlineHistoryID != -1 &&
             InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
           continue;
+      }
 
+      // FIXME for new PM: because of the old PM we currently generate ORE and
+      // in turn BFI on demand. With the new PM, the ORE dependency should
+      // just become a regular analysis dependency.
+      OptimizationRemarkEmitter ORE(Caller);
+
+      // If the policy determines that we should inline this function,
+      // delete the call instead.
+      if (!shouldInline(CS, GetInlineCost, ORE))
+        continue;
+
+      // If this call site is dead and it is to a readonly function, we should
+      // just delete the call instead of trying to inline it, regardless of
+      // size. This happens because IPSCCP propagates the result out of the
+      // call and then we're left with the dead call.
+      if (IsTriviallyDead) {
+        DEBUG(dbgs() << "    -> Deleting dead call: " << *Instr << "\n");
+        // Update the call graph by deleting the edge from Callee to Caller.
+        CG[Caller]->removeCallEdgeFor(CS);
+        Instr->eraseFromParent();
+        ++NumCallsDeleted;
+      } else {
         // Get DebugLoc to report. CS will be invalid after Inliner.
-        DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+        DebugLoc DLoc = Instr->getDebugLoc();
         BasicBlock *Block = CS.getParent();
 
-        // FIXME for new PM: because of the old PM we currently generate ORE and
-        // in turn BFI on demand. With the new PM, the ORE dependency should
-        // just become a regular analysis dependency.
-        OptimizationRemarkEmitter ORE(Caller);
-
-        // If the policy determines that we should inline this function,
-        // try to do so.
-        if (!shouldInline(CS, GetInlineCost, ORE))
-          continue;
         // Attempt to inline the function.
         using namespace ore;
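The "dead call to a readonly function" case that this hunk reorders typically arises from IPSCCP. An illustration (hypothetical source, not from the patch):

  __attribute__((const)) static int answer(int X) { return X * 2; }

  int user() {
    int R = answer(21); // IPSCCP proves R == 42 and rewrites R's uses,
    return R;           // leaving the readonly call trivially dead.
  }

After this change, the inline-cost policy is consulted before such dead calls are deleted, so the remark emitter sees every candidate call site.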
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 90896d285f5af..b406c22c69d7a 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -206,17 +207,26 @@ struct ByteArrayInfo {
 class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
   GlobalObject *GO;
   size_t NTypes;
+  // For functions: true if this is a definition (either in the merged module
+  // or in one of the thinlto modules).
+  bool IsDefinition;
+  // For functions: true if this function is either defined or used in a
+  // thinlto module and its jumptable entry needs to be exported to thinlto
+  // backends.
+  bool IsExported;
 
   friend TrailingObjects;
   size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
 
 public:
   static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+                                  bool IsDefinition, bool IsExported,
                                   ArrayRef<MDNode *> Types) {
     auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
         totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
     GTM->GO = GO;
     GTM->NTypes = Types.size();
+    GTM->IsDefinition = IsDefinition;
+    GTM->IsExported = IsExported;
     std::uninitialized_copy(Types.begin(), Types.end(),
                             GTM->getTrailingObjects<MDNode *>());
     return GTM;
@@ -224,6 +234,12 @@ public:
   GlobalObject *getGlobal() const {
     return GO;
   }
+  bool isDefinition() const {
+    return IsDefinition;
+  }
+  bool isExported() const {
+    return IsExported;
+  }
   ArrayRef<MDNode *> types() const {
     return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
   }
@@ -294,6 +310,7 @@ class LowerTypeTestsModule {
   void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
   TypeIdLowering importTypeId(StringRef TypeId);
   void importTypeTest(CallInst *CI);
+  void importFunction(Function *F, bool isDefinition);
 
   BitSetInfo
   buildBitSet(Metadata *TypeId,
@@ -820,6 +837,41 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
   CI->eraseFromParent();
 }
 
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isDefinition describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
+  assert(F->getType()->getAddressSpace() == 0);
+
+  // Declaration of a local function - nothing to do.
+  if (F->isDeclarationForLinker() && isDefinition)
+    return;
+
+  GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+  std::string Name = F->getName();
+  Function *FDecl;
+
+  if (F->isDeclarationForLinker() && !isDefinition) {
+    // Declaration of an external function.
+    FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+                             Name + ".cfi_jt", &M);
+    FDecl->setVisibility(GlobalValue::HiddenVisibility);
+  } else {
+    // Definition.
+    assert(isDefinition);
+    F->setName(Name + ".cfi");
+    F->setLinkage(GlobalValue::ExternalLinkage);
+    F->setVisibility(GlobalValue::HiddenVisibility);
+    FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+                             Name, &M);
+    FDecl->setVisibility(Visibility);
+  }
+
+  if (F->isWeakForLinker())
+    replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+  else
+    F->replaceAllUsesWith(FDecl);
+}
+
 void LowerTypeTestsModule::lowerTypeTestCalls(
     ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
     const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
@@ -1143,7 +1195,6 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
   // arithmetic that we normally use for globals.
 
   // FIXME: find a better way to represent the jumptable in the IR.
-  assert(!Functions.empty());
 
   // Build a simple layout based on the regular layout of jump tables.
@@ -1167,6 +1218,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
   // references to the original functions with references to the aliases.
   for (unsigned I = 0; I != Functions.size(); ++I) {
     Function *F = cast<Function>(Functions[I]->getGlobal());
+    bool IsDefinition = Functions[I]->isDefinition();
 
     Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
         ConstantExpr::getInBoundsGetElementPtr(
@@ -1174,7 +1226,18 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
             ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
                                  ConstantInt::get(IntPtrTy, I)}),
         F->getType());
-    if (F->isDeclarationForLinker()) {
+    if (Functions[I]->isExported()) {
+      if (IsDefinition) {
+        ExportSummary->cfiFunctionDefs().insert(F->getName());
+      } else {
+        GlobalAlias *JtAlias = GlobalAlias::create(
+            F->getValueType(), 0, GlobalValue::ExternalLinkage,
+            F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+        JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+        ExportSummary->cfiFunctionDecls().insert(F->getName());
+      }
+    }
+    if (!IsDefinition) {
       if (F->isWeakForLinker())
         replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
       else
@@ -1182,9 +1245,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     } else {
       assert(F->getType()->getAddressSpace() == 0);
 
-      GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0,
-                                                F->getLinkage(), "",
-                                                CombinedGlobalElemPtr, &M);
+      GlobalAlias *FAlias = GlobalAlias::create(
+          F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
       FAlias->setVisibility(F->getVisibility());
       FAlias->takeName(F);
       if (FAlias->hasName())
@@ -1353,15 +1415,37 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
 bool LowerTypeTestsModule::lower() {
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
-  if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary)
+  if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
+      !ImportSummary)
     return false;
 
   if (ImportSummary) {
-    for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
-         UI != UE;) {
-      auto *CI = cast<CallInst>((*UI++).getUser());
-      importTypeTest(CI);
+    if (TypeTestFunc) {
+      for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+           UI != UE;) {
+        auto *CI = cast<CallInst>((*UI++).getUser());
+        importTypeTest(CI);
+      }
+    }
+
+    SmallVector<Function *, 8> Defs;
+    SmallVector<Function *, 8> Decls;
+    for (auto &F : M) {
+      // CFI functions are either external, or promoted. A local function may
+      // have the same name, but it's not the one we are looking for.
+      if (F.hasLocalLinkage())
+        continue;
+      if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+        Defs.push_back(&F);
+      else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+        Decls.push_back(&F);
     }
 
+    for (auto F : Defs)
+      importFunction(F, /*isDefinition*/ true);
+    for (auto F : Decls)
+      importFunction(F, /*isDefinition*/ false);
+
     return true;
   }
@@ -1387,6 +1471,58 @@ bool LowerTypeTestsModule::lower() {
   llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
   unsigned I = 0;
   SmallVector<MDNode *, 2> Types;
+
+  struct ExportedFunctionInfo {
+    CfiFunctionLinkage Linkage;
+    MDNode *FuncMD; // {name, linkage, type[, type...]}
+  };
+  DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
+  if (ExportSummary) {
+    NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+    if (CfiFunctionsMD) {
+      for (auto FuncMD : CfiFunctionsMD->operands()) {
+        assert(FuncMD->getNumOperands() >= 2);
+        StringRef FunctionName =
+            cast<MDString>(FuncMD->getOperand(0))->getString();
+        if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
+                GlobalValue::dropLLVMManglingEscape(FunctionName))))
+          continue;
+        CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+            cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+                ->getValue()
+                ->getUniqueInteger()
+                .getZExtValue());
+        auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+        if (!P.second && P.first->second.Linkage != CFL_Definition)
+          P.first->second = {Linkage, FuncMD};
+      }
+
+      for (const auto &P : ExportedFunctions) {
+        StringRef FunctionName = P.first;
+        CfiFunctionLinkage Linkage = P.second.Linkage;
+        MDNode *FuncMD = P.second.FuncMD;
+        Function *F = M.getFunction(FunctionName);
+        if (!F)
+          F = Function::Create(
+              FunctionType::get(Type::getVoidTy(M.getContext()), false),
+              GlobalVariable::ExternalLinkage, FunctionName, &M);
+
+        if (Linkage == CFL_Definition)
+          F->eraseMetadata(LLVMContext::MD_type);
+
+        if (F->isDeclaration()) {
+          if (Linkage == CFL_WeakDeclaration)
+            F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+          SmallVector<MDNode *, 2> Types;
+          for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+            F->addMetadata(LLVMContext::MD_type,
+                           *cast<MDNode>(FuncMD->getOperand(I).get()));
+        }
+      }
+    }
+  }
+
   for (GlobalObject &GO : M.global_objects()) {
     if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
       continue;
@@ -1396,7 +1532,15 @@ bool LowerTypeTestsModule::lower() {
     if (Types.empty())
       continue;
 
-    auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
+    bool IsDefinition = !GO.isDeclarationForLinker();
+    bool IsExported = false;
+    if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
+      IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
+      IsExported = true;
+    }
+
+    auto *GTM =
+        GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
    for (MDNode *Type : Types) {
      verifyTypeMDNode(&GO, Type);
      auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
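To summarize the symbol scheme above: for an imported definition f, the real body is renamed to f.cfi and the original name is rebound to the jump table entry, while an imported declaration is redirected to the hidden f.cfi_jt alias exported by the full-LTO module. A condensed restatement of the definition path of importFunction (the helper name is hypothetical; the weak-linkage case is omitted):

  void renameCfiDefinition(Module &M, Function *F) {
    std::string Name = F->getName();             // e.g. "f"
    F->setName(Name + ".cfi");                   // body now lives in "f.cfi"
    F->setLinkage(GlobalValue::ExternalLinkage);
    F->setVisibility(GlobalValue::HiddenVisibility);
    // "f" becomes a declaration that resolves to the jump table entry
    // produced by the full-LTO module:
    Function *FDecl = Function::Create(F->getFunctionType(),
                                       GlobalValue::ExternalLinkage, Name, &M);
    F->replaceAllUsesWith(FDecl);
  }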
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index ea805efc66b79..8840435af6421 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -103,6 +103,35 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
+  // This class speculatively clones the function to be partially inlined.
+  // At the end of partial inlining, the remaining callsites to the cloned
+  // function that are not partially inlined will be fixed up to reference
+  // the original function, and the cloned function will be erased.
+  struct FunctionCloner {
+    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+    ~FunctionCloner();
+
+    // Prepare for function outlining: making sure there is only
+    // one incoming edge from the extracted/outlined region to
+    // the return block.
+    void NormalizeReturnBlock();
+
+    // Do function outlining:
+    Function *doFunctionOutlining();
+
+    Function *OrigFunc = nullptr;
+    Function *ClonedFunc = nullptr;
+    Function *OutlinedFunc = nullptr;
+    BasicBlock *OutliningCallBB = nullptr;
+    // ClonedFunc is inlined in one of its callers after function
+    // outlining.
+    bool IsFunctionInlined = false;
+    // The cost of the region to be outlined.
+    int OutlinedRegionCost = 0;
+    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+  };
+
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -114,27 +143,18 @@ private:
   // The result is no larger than 1 and is represented using BP.
   // (Note that the outlined region's 'head' block can only have incoming
   // edges from the guarding entry blocks).
-  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
-                                                   FunctionOutliningInfo *OI,
-                                                   Function *DuplicateFunction,
-                                                   BlockFrequencyInfo *BFI,
-                                                   BasicBlock *OutliningCallBB);
+  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
 
   // Return true if the callee of CS should be partially inlined with
   // profit.
-  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
-                           BlockFrequencyInfo *CalleeBFI,
-                           BasicBlock *OutliningCallBB,
-                           int OutliningCallOverhead,
+  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+                           BlockFrequency WeightedOutliningRcost,
                            OptimizationRemarkEmitter &ORE);
 
   // Try to inline DuplicateFunction (cloned from F with a call to
   // the OutlinedFunction) into its callers. Return true
   // if there is any successful inlining.
-  bool tryPartialInline(Function *DuplicateFunction,
-                        Function *F, /*original function */
-                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
-                        BlockFrequencyInfo *CalleeBFI);
+  bool tryPartialInline(FunctionCloner &Cloner);
 
   // Compute the mapping from use site of DuplicationFunction to the enclosing
   // BB's profile count.
@@ -146,7 +166,7 @@ private:
                NumPartialInlining >= MaxNumPartialInlining);
   }
 
-  CallSite getCallSite(User *U) {
+  static CallSite getCallSite(User *U) {
     CallSite CS;
     if (CallInst *CI = dyn_cast<CallInst>(U))
       CS = CallSite(CI);
@@ -157,7 +177,7 @@ private:
     return CS;
   }
 
-  CallSite getOneCallSiteTo(Function *F) {
+  static CallSite getOneCallSiteTo(Function *F) {
     User *User = *F->user_begin();
     return getCallSite(User);
   }
@@ -171,20 +191,15 @@ private:
   // Returns the costs associated with function outlining:
   // - The first value is the non-weighted runtime cost for making the call
-  //   to the outlined function 'OutlinedFunction', including the additional
-  //   setup cost in the outlined function itself;
+  //   to the outlined function, including the additional setup cost in the
+  //   outlined function itself;
   // - The second value is the estimated size of the new call sequence in
-  //   basic block 'OutliningCallBB';
-  // - The third value is the estimated size of the original code from
-  //   function 'F' that is extracted into the outlined function.
-  std::tuple<int, int, int>
-  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
-                        Function *OutlinedFunction,
-                        BasicBlock *OutliningCallBB);
+  //   basic block Cloner.OutliningCallBB;
+  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
 
   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
   // approximate both the size and runtime cost (Note that in the current
   // inline cost analysis, there is no clear distinction there either).
-  int computeBBInlineCost(BasicBlock *BB);
+  static int computeBBInlineCost(BasicBlock *BB);
 
   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
@@ -396,19 +411,19 @@ static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
   return false;
 }
 
-BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
-    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
-    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+BranchProbability
+PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
   auto EntryFreq =
-      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
-  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+  auto OutliningCallFreq =
+      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
 
   auto OutlineRegionRelFreq =
       BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                               EntryFreq.getFrequency());
 
-  if (hasProfileData(F, OI))
+  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
     return OutlineRegionRelFreq;
 
   // When profile data is not available, we need to be conservative in
@@ -433,15 +448,17 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
 }
 
 bool PartialInlinerImpl::shouldPartialInline(
-    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
-    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
-    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
+    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+    OptimizationRemarkEmitter &ORE) {
+  using namespace ore;
   if (SkipCostAnalysis)
     return true;
 
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
+  assert(Callee == Cloner.ClonedFunc);
+
   Function *Caller = CS.getCaller();
   auto &CalleeTTI = (*GetTTI)(*Callee);
   InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
@@ -449,14 +466,14 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", F)
+             << NV("Callee", Cloner.OrigFunc)
              << " should always be fully inlined, not partially");
     return false;
   }
 
   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", F) << " not partially inlined into "
+             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
@@ -464,29 +481,25 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   if (!IC) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", F) << " not partially inlined into "
+             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
   }
   const DataLayout &DL = Caller->getParent()->getDataLayout();
+
   // The savings of eliminating the call:
   int NonWeightedSavings = getCallsiteCost(CS, DL);
   BlockFrequency NormWeightedSavings(NonWeightedSavings);
 
-  auto RelativeFreq =
-      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
-  auto NormWeightedRcost =
-      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
-
   // Weighted saving is smaller than weighted cost, return false
-  if (NormWeightedSavings < NormWeightedRcost) {
+  if (NormWeightedSavings < WeightedOutliningRcost) {
     ORE.emit(
         OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
-        << NV("Callee", F) << " not partially inlined into "
+        << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
         << NV("Caller", Caller) << " runtime overhead (overhead="
-        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
        << ", savings="
       << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
      << " of making the outlined call is too high");
 
     return false;
   }
@@ -495,7 +508,7 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", F) << " can be partially inlined into "
+           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
          << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
@@ -551,50 +564,32 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
   return InlineCost;
 }
 
-std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
-    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
-    BasicBlock *OutliningCallBB) {
-  // First compute the cost of the outlined region 'OI' in the original
-  // function 'F'.
-  // FIXME: The code extractor (outliner) can now do code sinking/hoisting
-  // to reduce outlining cost. The hoisted/sunk code currently do not
-  // incur any runtime cost so it is still OK to compare the outlined
-  // function cost with the outlined region in the original function.
-  // If this ever changes, we will need to introduce new extractor api
-  // to pass the information.
-  int OutlinedRegionCost = 0;
-  for (BasicBlock &BB : *F) {
-    if (&BB != OI->ReturnBlock &&
-        // Assuming Entry set is small -- do a linear search here:
-        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
-            OI->Entries.end()) {
-      OutlinedRegionCost += computeBBInlineCost(&BB);
-    }
-  }
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
   // Now compute the cost of the call sequence to the outlined function
   // 'OutlinedFunction' in BB 'OutliningCallBB':
-  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
 
   // Now compute the cost of the extracted/outlined function itself:
   int OutlinedFunctionCost = 0;
-  for (BasicBlock &BB : *OutlinedFunction) {
+  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
     OutlinedFunctionCost += computeBBInlineCost(&BB);
   }
 
-  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
          "Outlined function cost should be no less than the outlined region");
   // The code extractor introduces a new root and exit stub blocks with
   // additional unconditional branches. Those branches will be eliminated
   // later with bb layout. The cost should be adjusted accordingly:
   OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
 
-  int OutliningRuntimeOverhead = OutliningFuncCallCost +
-                                 (OutlinedFunctionCost - OutlinedRegionCost) +
-                                 ExtraOutliningPenalty;
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost +
+      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+      ExtraOutliningPenalty;
 
-  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
-                         OutlinedRegionCost);
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
 }
 
 // Create the callsite to profile count map which is
@@ -641,42 +636,30 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
   }
 }
 
-Function *PartialInlinerImpl::unswitchFunction(Function *F) {
-
-  if (F->hasAddressTaken())
-    return nullptr;
-
-  // Let inliner handle it
-  if (F->hasFnAttribute(Attribute::AlwaysInline))
-    return nullptr;
-
-  if (F->hasFnAttribute(Attribute::NoInline))
-    return nullptr;
-
-  if (PSI->isFunctionEntryCold(F))
-    return nullptr;
-
-  if (F->user_begin() == F->user_end())
-    return nullptr;
-
-  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
-
-  if (!OI)
-    return nullptr;
+PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
+                                                   FunctionOutliningInfo *OI)
+    : OrigFunc(F) {
+  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
 
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
-  Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
-  DenseSet<BasicBlock *> NewEntries;
+  ClonedFunc = CloneFunction(F, VMap);
+
+  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   for (BasicBlock *BB : OI->Entries) {
-    NewEntries.insert(cast<BasicBlock>(VMap[BB]));
+    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+  }
+  for (BasicBlock *E : OI->ReturnBlockPreds) {
+    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+    ClonedOI->ReturnBlockPreds.push_back(NewE);
   }
-
   // Go ahead and update all uses to the duplicate, so that we can just
   // use the inliner functionality when we're done hacking.
-  F->replaceAllUsesWith(DuplicateFunction);
+  F->replaceAllUsesWith(ClonedFunc);
+}
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
 
   auto getFirstPHI = [](BasicBlock *BB) {
     BasicBlock::iterator I = BB->begin();
@@ -692,14 +675,19 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
     }
     return FirstPhi;
   };
+
   // Special hackery is needed with PHI nodes that have inputs from more than
   // one extracted block. For simplicity, just split the PHIs into a two-level
   // sequence of PHIs, some of which will go in the extracted region, and some
   // of which will go outside.
-  BasicBlock *PreReturn = NewReturnBlock;
+  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+    return;
+
   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
     Value *CommonValue = PN->getIncomingValue(0);
     if (all_of(PN->incoming_values(),
@@ -708,143 +696,185 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
     return nullptr;
   };
 
-  if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
-
-    NewReturnBlock = NewReturnBlock->splitBasicBlock(
-        NewReturnBlock->getFirstNonPHI()->getIterator());
-    BasicBlock::iterator I = PreReturn->begin();
-    Instruction *Ins = &NewReturnBlock->front();
-    SmallVector<Instruction *, 4> DeadPhis;
-    while (I != PreReturn->end()) {
-      PHINode *OldPhi = dyn_cast<PHINode>(I);
-      if (!OldPhi)
-        break;
-
-      PHINode *RetPhi =
-          PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
-      OldPhi->replaceAllUsesWith(RetPhi);
-      Ins = NewReturnBlock->getFirstNonPHI();
+  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+  BasicBlock::iterator I = PreReturn->begin();
+  Instruction *Ins = &ClonedOI->ReturnBlock->front();
+  SmallVector<Instruction *, 4> DeadPhis;
+  while (I != PreReturn->end()) {
+    PHINode *OldPhi = dyn_cast<PHINode>(I);
+    if (!OldPhi)
+      break;
 
-      RetPhi->addIncoming(&*I, PreReturn);
-      for (BasicBlock *E : OI->ReturnBlockPreds) {
-        BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
-        RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
-        OldPhi->removeIncomingValue(NewE);
-      }
+    PHINode *RetPhi =
+        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+    OldPhi->replaceAllUsesWith(RetPhi);
+    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
 
-      // After incoming values splitting, the old phi may become trivial.
-      // Keeping the trivial phi can introduce definition inside the outline
-      // region which is live-out, causing necessary overhead (load, store
-      // arg passing etc).
-      if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
-        OldPhi->replaceAllUsesWith(OldPhiVal);
-        DeadPhis.push_back(OldPhi);
-      }
-
-      ++I;
+    RetPhi->addIncoming(&*I, PreReturn);
+    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+      OldPhi->removeIncomingValue(E);
     }
+
+    // After incoming values splitting, the old phi may become trivial.
+    // Keeping the trivial phi can introduce definition inside the outline
+    // region which is live-out, causing necessary overhead (load, store
+    // arg passing etc).
+    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+      OldPhi->replaceAllUsesWith(OldPhiVal);
+      DeadPhis.push_back(OldPhi);
+    }
+    ++I;
+  }
   for (auto *DP : DeadPhis)
     DP->eraseFromParent();
 
-    for (auto E : OI->ReturnBlockPreds) {
-      BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
-      NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
+  for (auto E : ClonedOI->ReturnBlockPreds) {
+    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
   }
-  }
+}
 
+Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
   // Returns true if the block is to be partial inlined into the caller
   // (i.e. not to be extracted to the out of line function)
-  auto ToBeInlined = [&](BasicBlock *BB) {
-    return BB == NewReturnBlock || NewEntries.count(BB);
+  auto ToBeInlined = [&, this](BasicBlock *BB) {
+    return BB == ClonedOI->ReturnBlock ||
+           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+            ClonedOI->Entries.end());
   };
 
+  // Gather up the blocks that we're going to extract.
   std::vector<BasicBlock *> ToExtract;
-  ToExtract.push_back(NewNonReturnBlock);
-  for (BasicBlock &BB : *DuplicateFunction)
-    if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
+  ToExtract.push_back(ClonedOI->NonReturnBlock);
+  OutlinedRegionCost +=
+      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+  for (BasicBlock &BB : *ClonedFunc)
+    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
       ToExtract.push_back(&BB);
+      // FIXME: the code extractor may hoist/sink more code
+      // into the outlined function which may make the outlining
+      // overhead (the difference of the outlined function cost
+      // and OutliningRegionCost) look larger.
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
 
   // The CodeExtractor needs a dominator tree.
   DominatorTree DT;
-  DT.recalculate(*DuplicateFunction);
+  DT.recalculate(*ClonedFunc);
 
   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
   LoopInfo LI(DT);
-  BranchProbabilityInfo BPI(*DuplicateFunction, LI);
-  BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
+  BranchProbabilityInfo BPI(*ClonedFunc, LI);
+  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
 
   // Extract the body of the if.
-  Function *OutlinedFunction =
-      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
-          .extractCodeRegion();
+  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                                ClonedFuncBFI.get(), &BPI)
+                     .extractCodeRegion();
+
+  if (OutlinedFunc) {
+    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+                          .getInstruction()
+                          ->getParent();
+    assert(OutliningCallBB->getParent() == ClonedFunc);
+  }
 
-  bool AnyInline =
-      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+  return OutlinedFunc;
+}
 
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
   // Ditch the duplicate, since we're done with it, and rewrite all remaining
   // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
+  ClonedFunc->replaceAllUsesWith(OrigFunc);
+  ClonedFunc->eraseFromParent();
+  if (!IsFunctionInlined) {
+    // Remove the function that is speculatively created if there is no
+    // reference.
+    if (OutlinedFunc)
+      OutlinedFunc->eraseFromParent();
+  }
+}
+
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+
+  if (F->hasAddressTaken())
+    return nullptr;
+
+  // Let inliner handle it
+  if (F->hasFnAttribute(Attribute::AlwaysInline))
+    return nullptr;
+
+  if (F->hasFnAttribute(Attribute::NoInline))
+    return nullptr;
+
+  if (PSI->isFunctionEntryCold(F))
+    return nullptr;
+
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+  if (!OI)
+    return nullptr;
+
+  FunctionCloner Cloner(F, OI.get());
+  Cloner.NormalizeReturnBlock();
+  Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+  bool AnyInline = tryPartialInline(Cloner);
 
   if (AnyInline)
     return OutlinedFunction;
 
-  // Remove the function that is speculatively created:
-  if (OutlinedFunction)
-    OutlinedFunction->eraseFromParent();
-
   return nullptr;
 }
 
-bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
-                                          Function *F,
-                                          FunctionOutliningInfo *OI,
-                                          Function *OutlinedFunction,
-                                          BlockFrequencyInfo *CalleeBFI) {
-  if (OutlinedFunction == nullptr)
-    return false;
-
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
   int NonWeightedRcost;
   int SizeCost;
-  int OutlinedRegionSizeCost;
 
-  auto OutliningCallBB =
-      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+  if (Cloner.OutlinedFunc == nullptr)
+    return false;
+
+  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
 
-  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
-      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
 
   // The call sequence to the outlined function is larger than the original
   // outlined region size, it does not increase the chances of inlining
-  // 'F' with outlining (The inliner uses the size increase to model the
-  // the cost of inlining a callee).
-  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
-    OptimizationRemarkEmitter ORE(F);
+  // the function with outlining (The inliner uses the size increase to
+  // model the cost of inlining a callee).
+  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
     DebugLoc DLoc;
     BasicBlock *Block;
-    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                         DLoc, Block)
-             << ore::NV("Function", F)
+             << ore::NV("Function", Cloner.OrigFunc)
              << " not partially inlined into callers (Original Size = "
-             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
            << ", Size of call sequence to outlined function = "
           << ore::NV("NewSize", SizeCost) << ")");
    return false;
   }
 
-  assert(F->user_begin() == F->user_end() &&
+  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
          "F's users should all be replaced!");
-  std::vector<User *> Users(DuplicateFunction->user_begin(),
-                            DuplicateFunction->user_end());
+
+  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+                            Cloner.ClonedFunc->user_end());
 
   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
-  if (F->getEntryCount())
-    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+  if (Cloner.OrigFunc->getEntryCount())
+    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
 
-  auto CalleeEntryCount = F->getEntryCount();
+  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
   uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
   bool AnyInline = false;
   for (User *User : Users) {
     CallSite CS = getCallSite(User);
@@ -854,13 +884,12 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
 
     OptimizationRemarkEmitter ORE(CS.getCaller());
 
-    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
-                             NonWeightedRcost, ORE))
+    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;
 
    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
-        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));
 
    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
@@ -878,8 +907,11 @@ bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
     NumPartialInlined++;
   }
 
-  if (AnyInline && CalleeEntryCount)
-    F->setEntryCount(CalleeEntryCountV);
+  if (AnyInline) {
+    Cloner.IsFunctionInlined = true;
+    if (CalleeEntryCount)
+      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+  }
 
   return AnyInline;
 }
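At the source level, the transformation this machinery implements looks roughly like the following (hypothetical function; the pass works on IR, and the outlined callee is produced by the CodeExtractor):

  struct Request;

  void handle(Request *R) {
    if (!R)
      return;   // cheap guarded early return: this test is what gets
                // inlined into callers
    // Everything below is the cold region: it is extracted into a separate
    // outlined function, and the inlined guard calls it on the slow path.
    // ... expensive processing ...
  }

The FunctionCloner keeps the clone, the outlined function, and their costs together so that the cost queries above no longer need five loose parameters.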
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 16fba32e98056..4bc64ab698ff9 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -141,6 +141,10 @@ static cl::opt<int> PreInlineThreshold(
     cl::desc("Control the amount of inlining in pre-instrumentation inliner "
              "(default = 75)"));
 
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+    "enable-earlycse-memssa", cl::init(false), cl::Hidden,
+    cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = off)"));
+
 static cl::opt<bool> EnableGVNHoist(
     "enable-gvn-hoist", cl::init(false), cl::Hidden,
     cl::desc("Enable the GVN hoisting pass (default = off)"));
@@ -308,7 +312,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   // Start of function pass.
   // Break up aggregate allocas, using SSAUpdater.
   MPM.add(createSROAPass());
-  MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+  MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies
   if (EnableGVNHoist)
     MPM.add(createGVNHoistPass());
   if (EnableGVNSink) {
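A hedged usage note: the new flag only changes which EarlyCSE variant the legacy pipeline builder schedules, so (assuming the standard opt driver) an invocation like

  opt -O2 -enable-earlycse-memssa input.ll -S -o output.ll

exercises the MemorySSA-backed EarlyCSE in place of the default one.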
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index a7bcc7cc55325..802f470ffe1fb 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -32,7 +32,8 @@ namespace {
 
 // Promote each local-linkage entity defined by ExportM and used by ImportM by
 // changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+                      SetVector<GlobalValue *> &PromoteExtra) {
   DenseMap<const Comdat *, Comdat *> RenamedComdats;
   for (auto &ExportGV : ExportM.global_values()) {
     if (!ExportGV.hasLocalLinkage())
@@ -40,7 +41,7 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
 
     auto Name = ExportGV.getName();
     GlobalValue *ImportGV = ImportM.getNamedValue(Name);
-    if (!ImportGV || ImportGV->use_empty())
+    if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
       continue;
 
     std::string NewName = (Name + ModuleId).str();
@@ -53,8 +54,10 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
     ExportGV.setLinkage(GlobalValue::ExternalLinkage);
     ExportGV.setVisibility(GlobalValue::HiddenVisibility);
 
-    ImportGV->setName(NewName);
-    ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+    if (ImportGV) {
+      ImportGV->setName(NewName);
+      ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+    }
   }
 
   if (!RenamedComdats.empty())
@@ -296,6 +299,11 @@ void splitAndWriteThinLTOBitcode(
       F.setComdat(nullptr);
     }
 
+  SetVector<GlobalValue *> CfiFunctions;
+  for (auto &F : M)
+    if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+      CfiFunctions.insert(&F);
+
   // Remove all globals with type metadata, globals with comdats that live in
   // MergedM, and aliases pointing to such globals from the thin LTO module.
   filterModule(&M, [&](const GlobalValue *GV) {
@@ -308,11 +316,39 @@ void splitAndWriteThinLTOBitcode(
     return true;
   });
 
-  promoteInternals(*MergedM, M, ModuleId);
-  promoteInternals(M, *MergedM, ModuleId);
+  promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+  promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+  SmallVector<MDNode *, 8> CfiFunctionMDs;
+  for (auto V : CfiFunctions) {
+    Function &F = *cast<Function>(V);
+    SmallVector<MDNode *, 2> Types;
+    F.getMetadata(LLVMContext::MD_type, Types);
+
+    auto &Ctx = MergedM->getContext();
+    SmallVector<Metadata *, 4> Elts;
+    Elts.push_back(MDString::get(Ctx, F.getName()));
+    CfiFunctionLinkage Linkage;
+    if (!F.isDeclarationForLinker())
+      Linkage = CFL_Definition;
+    else if (F.isWeakForLinker())
+      Linkage = CFL_WeakDeclaration;
+    else
+      Linkage = CFL_Declaration;
+    Elts.push_back(ConstantAsMetadata::get(
+        llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+    for (auto Type : Types)
+      Elts.push_back(Type);
+    CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+  }
 
-  simplifyExternals(*MergedM);
+  if (!CfiFunctionMDs.empty()) {
+    NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+    for (auto MD : CfiFunctionMDs)
+      NMD->addOperand(MD);
+  }
 
+  simplifyExternals(*MergedM);
 
   // FIXME: Try to re-use BSI and PFI from the original module here.
   ProfileSummaryInfo PSI(M);
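The linkage operand recorded for each "cfi.functions" entry is derived solely from how the function appears in the thin module. Restated as a standalone helper (same logic as the classification chain above; the helper name is hypothetical):

  static CfiFunctionLinkage classifyForCfi(const Function &F) {
    if (!F.isDeclarationForLinker())
      return CFL_Definition;        // the body is present in this module
    if (F.isWeakForLinker())
      return CFL_WeakDeclaration;   // may legitimately remain unresolved
    return CFL_Declaration;
  }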
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4fe3225a21722..a881bda5ba98d 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -763,8 +763,54 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
   return nullptr;
 }
 
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+                                                   bool JoinedByAnd,
+                                                   Instruction &CxtI) {
+  ICmpInst::Predicate Pred = LHS->getPredicate();
+  if (Pred != RHS->getPredicate())
+    return nullptr;
+  if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+    return nullptr;
+  if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+    return nullptr;
+
+  // TODO support vector splats
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero())
+    return nullptr;
+
+  Value *A, *B, *C, *D;
+  if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+      match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+    if (A == D || B == D)
+      std::swap(C, D);
+    if (B == C)
+      std::swap(A, B);
+
+    if (A == C &&
+        isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
+        isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+      Value *Mask = Builder->CreateOr(B, D);
+      Value *Masked = Builder->CreateAnd(A, Mask);
+      auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+      return Builder->CreateICmp(NewPred, Masked, Mask);
+    }
+  }
+
+  return nullptr;
+}
+
 /// Fold (icmp)&(icmp) if possible.
-Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+                                    Instruction &CxtI) {
+  // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+  // if K1 and K2 are a one-bit mask.
+  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+    return V;
+
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
   // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
@@ -1127,8 +1173,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
     ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
     ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
     if (ICmp0 && ICmp1) {
-      Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1)
-                                                : foldOrOfICmps(ICmp0, ICmp1, &I);
+      Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+                                                : foldOrOfICmps(ICmp0, ICmp1, I);
       if (Res)
         return CastInst::Create(CastOpcode, Res, DestTy);
       return nullptr;
@@ -1426,7 +1472,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
     ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
     if (LHS && RHS)
-      if (Value *Res = foldAndOfICmps(LHS, RHS))
+      if (Value *Res = foldAndOfICmps(LHS, RHS, I))
         return replaceInstUsesWith(I, Res);
 
     // TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -1434,18 +1480,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
     Value *X, *Y;
     if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldAndOfICmps(LHS, Cmp))
+        if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldAndOfICmps(LHS, Cmp))
+        if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
     }
     if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldAndOfICmps(Cmp, RHS))
+        if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldAndOfICmps(Cmp, RHS))
+        if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
          return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
     }
   }
@@ -1591,41 +1637,16 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
 
 /// Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
-                                   Instruction *CxtI) {
-  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
+                                   Instruction &CxtI) {
   // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
   // if K1 and K2 are a one-bit mask.
-  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+    return V;
 
-  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero() &&
-      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
-
-    BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
-    BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
-    if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
-        LAnd->getOpcode() == Instruction::And &&
-        RAnd->getOpcode() == Instruction::And) {
-
-      Value *Mask = nullptr;
-      Value *Masked = nullptr;
-      if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
-          isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) &&
-          isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) {
-        Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
-        Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
-      } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
-                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) &&
-                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) {
-        Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
-        Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
-      }
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
-      if (Masked)
-        return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
-    }
-  }
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
 
   // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
   //                   -->  (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
@@ -2117,12 +2138,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   }
 
   // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+  // FIXME: The two hasOneUse calls here are the same call, maybe we were
+  // supposed to check Op1->operand(0)?
   if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
     if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
       if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
        return BinaryOperator::CreateOr(Op0, C);
 
   // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+  // FIXME: The two hasOneUse calls here are the same call, maybe we were
+  // supposed to check Op0->operand(0)?
   if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
     if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
       if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
@@ -2194,7 +2219,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
     ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
     if (LHS && RHS)
-      if (Value *Res = foldOrOfICmps(LHS, RHS, &I))
+      if (Value *Res = foldOrOfICmps(LHS, RHS, I))
        return replaceInstUsesWith(I, Res);
 
     // TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -2202,18 +2227,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     Value *X, *Y;
     if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+        if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
+        if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
     }
     if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
       if (auto *Cmp = dyn_cast<ICmpInst>(X))
-        if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+        if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
       if (auto *Cmp = dyn_cast<ICmpInst>(Y))
-        if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
+        if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
         return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
     }
   }
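Restating the unified fold on plain integers: when K1 and K2 are single-bit masks, "at least one of the bits is clear" is the same condition as "not all of the bits are set". A quick C++ self-check (uint32_t and the power-of-two precondition on K1/K2 are assumptions of the example, not of the pass):

  #include <cstdint>

  bool orOfIsZero(uint32_t A, uint32_t K1, uint32_t K2) {
    return (A & K1) == 0 || (A & K2) == 0;   // original form
  }

  bool foldedForm(uint32_t A, uint32_t K1, uint32_t K2) {
    uint32_t Mask = K1 | K2;
    return (A & Mask) != Mask;               // folded form
  }

  // orOfIsZero(A, 4, 16) == foldedForm(A, 4, 16) for every A; the JoinedByAnd
  // variant is the same identity with both sides negated.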
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d29ed49eca0b0..c0830a5d21124 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -94,75 +94,80 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
   return ConstantVector::get(BoolVec);
 }
 
-Instruction *
-InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
+    ElementUnorderedAtomicMemCpyInst *AMI) {
   // Try to unfold this intrinsic into sequence of explicit atomic loads and
   // stores.
   // First check that number of elements is compile time constant.
-  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
-  if (!NumElementsCI)
+  auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
+  if (!LengthCI)
     return nullptr;
 
   // Check that there are not too many elements.
-  uint64_t NumElements = NumElementsCI->getZExtValue();
+  uint64_t LengthInBytes = LengthCI->getZExtValue();
+  uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
+  uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
   if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
     return nullptr;
 
-  // Don't unfold into illegal integers
-  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
-  if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
-    return nullptr;
+  // Only expand if there are elements to copy.
+  if (NumElements > 0) {
+    // Don't unfold into illegal integers
+    uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
+    if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+      return nullptr;
 
-  // Cast source and destination to the correct type. Intrinsic input arguments
-  // are usually represented as i8*.
-  // Often operands will be explicitly casted to i8* and we can just strip
-  // those casts instead of inserting new ones. However it's easier to rely on
-  // other InstCombine rules which will cover trivial cases anyway.
-  Value *Src = AMI->getRawSource();
-  Value *Dst = AMI->getRawDest();
-  Type *ElementPointerType = Type::getIntNPtrTy(
-      AMI->getContext(), ElementSizeInBytes,
-      Src->getType()->getPointerAddressSpace());
-
-  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
-                                                "memcpy_unfold.src_casted");
-  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
-                                                "memcpy_unfold.dst_casted");
-
-  for (uint64_t i = 0; i < NumElements; ++i) {
-    // Get current element addresses
-    ConstantInt *ElementIdxCI =
-        ConstantInt::get(AMI->getContext(), APInt(64, i));
-    Value *SrcElementAddr =
-        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
-    Value *DstElementAddr =
-        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
-    // Load from the source. Transfer alignment information and mark load as
-    // unordered atomic.
-    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
-    Load->setOrdering(AtomicOrdering::Unordered);
-    // We know alignment of the first element. It is also guaranteed by the
-    // verifier that element size is less than or equal to the first element
-    // alignment and both of these values are powers of two.
-    // This means that all subsequent accesses are at least element size
-    // aligned.
-    // TODO: We can infer better alignment but there is no evidence that this
-    // will matter.
-    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
-                              : AMI->getElementSizeInBytes());
-    Load->setDebugLoc(AMI->getDebugLoc());
-
-    // Store loaded value via unordered atomic store.
-    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
-    Store->setOrdering(AtomicOrdering::Unordered);
-    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
-                               : AMI->getElementSizeInBytes());
-    Store->setDebugLoc(AMI->getDebugLoc());
+    // Cast source and destination to the correct type. Intrinsic input
+    // arguments are usually represented as i8*. Often operands will be
+    // explicitly casted to i8* and we can just strip those casts instead of
+    // inserting new ones. However it's easier to rely on other InstCombine
+    // rules which will cover trivial cases anyway.
+    Value *Src = AMI->getRawSource();
+    Value *Dst = AMI->getRawDest();
+    Type *ElementPointerType =
+        Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+                           Src->getType()->getPointerAddressSpace());
+
+    Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                  "memcpy_unfold.src_casted");
+    Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                  "memcpy_unfold.dst_casted");
+
+    for (uint64_t i = 0; i < NumElements; ++i) {
+      // Get current element addresses
+      ConstantInt *ElementIdxCI =
+          ConstantInt::get(AMI->getContext(), APInt(64, i));
+      Value *SrcElementAddr =
+          Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+      Value *DstElementAddr =
+          Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+      // Load from the source. Transfer alignment information and mark load as
+      // unordered atomic.
+      LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+      Load->setOrdering(AtomicOrdering::Unordered);
+      // We know alignment of the first element. It is also guaranteed by the
+      // verifier that element size is less than or equal to the first element
+      // alignment and both of these values are powers of two. This means that
+      // all subsequent accesses are at least element size aligned.
+      // TODO: We can infer better alignment but there is no evidence that this
+      // will matter.
+      Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
                                 : ElementSizeInBytes);
+      Load->setDebugLoc(AMI->getDebugLoc());
+
+      // Store loaded value via unordered atomic store.
+      StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+      Store->setOrdering(AtomicOrdering::Unordered);
+      Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
                                  : ElementSizeInBytes);
+      Store->setDebugLoc(AMI->getDebugLoc());
+    }
   }
 
   // Set the number of elements of the copy to 0, it will be deleted on the
   // next iteration.
-  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  AMI->setLength(Constant::getNullValue(LengthCI->getType()));
 
   return AMI;
 }
@@ -1888,12 +1893,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (Changed) return II;
   }
 
-  if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
-    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+  if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
+    if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
       if (C->isNullValue())
         return eraseInstFromFunction(*AMI);
 
-    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+    if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
       return I;
  }
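What the unfold produces is a NumElements-long sequence of per-element atomic loads and stores. As a C++ analogy (C++ has no exact counterpart to LLVM's 'unordered' ordering; relaxed is the nearest spelling):

  #include <atomic>
  #include <cstdint>

  void unfoldedCopy(std::atomic<uint32_t> *Dst,
                    const std::atomic<uint32_t> *Src, uint64_t NumElements) {
    for (uint64_t I = 0; I != NumElements; ++I) {
      // One atomic load and one atomic store per element; no element is
      // ever torn, but there is no ordering between elements.
      uint32_t V = Src[I].load(std::memory_order_relaxed);
      Dst[I].store(V, std::memory_order_relaxed);
    }
  }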
+ // TODO: We can infer better alignment but there is no evidence that this + // will matter. + Load->setAlignment(i == 0 ? AMI->getParamAlignment(1) + : ElementSizeInBytes); + Load->setDebugLoc(AMI->getDebugLoc()); + + // Store loaded value via unordered atomic store. + StoreInst *Store = Builder->CreateStore(Load, DstElementAddr); + Store->setOrdering(AtomicOrdering::Unordered); + Store->setAlignment(i == 0 ? AMI->getParamAlignment(0) + : ElementSizeInBytes); + Store->setDebugLoc(AMI->getDebugLoc()); + } } // Set the number of elements of the copy to 0, it will be deleted on the // next iteration. - AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType())); + AMI->setLength(Constant::getNullValue(LengthCI->getType())); return AMI; } @@ -1888,12 +1893,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } - if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) { - if (Constant *C = dyn_cast<Constant>(AMI->getNumElements())) + if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) { + if (Constant *C = dyn_cast<Constant>(AMI->getLength())) if (C->isNullValue()) return eraseInstFromFunction(*AMI); - if (Instruction *I = SimplifyElementAtomicMemCpy(AMI)) + if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI)) return I; } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index fd0a64a5bbb56..1a7db146df426 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -447,12 +447,14 @@ private: Instruction::CastOps isEliminableCastPair(const CastInst *CI1, const CastInst *CI2); - Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS); - Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI); + Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS); Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, + bool JoinedByAnd, Instruction &CxtI); public: /// \brief Inserts an instruction \p New before instruction \p Old /// @@ -724,7 +726,8 @@ private: Instruction *MatchBSwap(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); - Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI); + Instruction * + SimplifyElementUnorderedAtomicMemCpy(ElementUnorderedAtomicMemCpyInst *AMI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 3f2ddcacce2b8..8cec865c6422a 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -682,11 +682,11 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask)); } - if (match(Op0, m_SExt(m_Value(X)))) { + if (match(Op0, m_SExt(m_Value(X))) && + (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { // Are we moving the sign bit to the low bit and widening with high zeros? 
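
The hunk below adds a second fold for the case where the shift amount equals the extension amount: lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN. A standalone check of that identity for M = 8, N = 32 (plain C++, not LLVM code; it assumes arithmetic right shift for signed values, which every mainstream compiler implements and C++20 guarantees):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    int8_t X = static_cast<int8_t>(V);
    uint32_t Sext = static_cast<uint32_t>(static_cast<int32_t>(X)); // sext to i32
    uint32_t LShr = Sext >> (32 - 8);                               // lshr by N-M
    // NewShAmt = min(N - M, M - 1) = min(24, 7) = 7 in this instance.
    uint32_t Folded = static_cast<uint8_t>(X >> 7);                 // zext(ashr X, 7)
    assert(LShr == Folded);
  }
  return 0;
}
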
unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits(); - if (ShAmt == BitWidth - 1 && - (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) { + if (ShAmt == BitWidth - 1) { // lshr (sext i1 X to iN), N-1 --> zext X to iN if (SrcTyBitWidth == 1) return new ZExtInst(X, Ty); @@ -698,7 +698,13 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { } } - // TODO: Convert to ashr+zext if the shift equals the extension amount. + // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN + if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) { + // The new shift amount can't be more than the narrow source type. + unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1); + Value *AShr = Builder->CreateAShr(X, NewShAmt); + return new ZExtInst(AShr, Ty); + } } if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) { diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 7ff69b9eb7f42..f2806e278e6e1 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_library(LLVMInstrumentation Instrumentation.cpp InstrProfiling.cpp PGOInstrumentation.cpp + PGOMemOPSizeOpt.cpp SanitizerCoverage.cpp ThreadSanitizer.cpp EfficiencySanitizer.cpp diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 96027bc3d0a91..0d308810009d5 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -56,8 +56,6 @@ using namespace llvm; STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions."); STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites."); -STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); -STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); // Command line option to disable indirect-call promotion with the default as // false. This is for debug purpose. @@ -111,44 +109,6 @@ static cl::opt<bool> ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden, cl::desc("Dump IR after transformation happens")); -// The minimum call count to optimize memory intrinsic calls. -static cl::opt<unsigned> - MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, - cl::init(1000), - cl::desc("The minimum count to optimize memory " - "intrinsic calls")); - -// Command line option to disable memory intrinsic optimization. The default is -// false. This is for debug purpose. -static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), - cl::Hidden, cl::desc("Disable optimize")); - -// The percent threshold to optimize memory intrinsic calls. -static cl::opt<unsigned> - MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), - cl::Hidden, cl::ZeroOrMore, - cl::desc("The percentage threshold for the " - "memory intrinsic calls optimization")); - -// Maximum number of versions for optimizing memory intrinsic call. -static cl::opt<unsigned> - MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, - cl::ZeroOrMore, - cl::desc("The max version for the optimized memory " - " intrinsic calls")); - -// Scale the counts from the annotation using the BB count value. 
-static cl::opt<bool> - MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, - cl::desc("Scale the memop size counts using the basic " - " block count value")); - -// This option sets the rangge of precise profile memop sizes. -extern cl::opt<std::string> MemOPSizeRange; - -// This option sets the value that groups large memop sizes -extern cl::opt<unsigned> MemOPSizeLarge; - namespace { class PGOIndirectCallPromotionLegacyPass : public ModulePass { public: @@ -173,24 +133,6 @@ private: // the promoted direct call. bool SamplePGO; }; - -class PGOMemOPSizeOptLegacyPass : public FunctionPass { -public: - static char ID; - - PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { - initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "PGOMemOPSize"; } - -private: - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<BlockFrequencyInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } -}; } // end anonymous namespace char PGOIndirectCallPromotionLegacyPass::ID = 0; @@ -204,19 +146,6 @@ ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO, return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO); } -char PGOMemOPSizeOptLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", - "Optimize memory intrinsic using its size value profile", - false, false) - -FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { - return new PGOMemOPSizeOptLegacyPass(); -} - namespace { // The class for main data structure to promote indirect calls to conditional // direct calls. @@ -749,285 +678,3 @@ PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, return PreservedAnalyses::none(); } - -namespace { -class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { -public: - MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI) - : Func(Func), BFI(BFI), Changed(false) { - ValueDataArray = - llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); - // Get the MemOPSize range information from option MemOPSizeRange, - getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, - PreciseRangeLast); - } - bool isChanged() const { return Changed; } - void perform() { - WorkList.clear(); - visit(Func); - - for (auto &MI : WorkList) { - ++NumOfPGOMemOPAnnotate; - if (perform(MI)) { - Changed = true; - ++NumOfPGOMemOPOpt; - DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName() - << "is Transformed.\n"); - } - } - } - - void visitMemIntrinsic(MemIntrinsic &MI) { - Value *Length = MI.getLength(); - // Not perform on constant length calls. - if (dyn_cast<ConstantInt>(Length)) - return; - WorkList.push_back(&MI); - } - -private: - Function &Func; - BlockFrequencyInfo &BFI; - bool Changed; - std::vector<MemIntrinsic *> WorkList; - // Start of the previse range. - int64_t PreciseRangeStart; - // Last value of the previse range. - int64_t PreciseRangeLast; - // The space to read the profile annotation. - std::unique_ptr<InstrProfValueData[]> ValueDataArray; - bool perform(MemIntrinsic *MI); - - // This kind shows which group the value falls in. For PreciseValue, we have - // the profile count for that value. LargeGroup groups the values that are in - // range [LargeValue, +inf). 
NonLargeGroup groups the rest of values. - enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; - - MemOPSizeKind getMemOPSizeKind(int64_t Value) const { - if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) - return LargeGroup; - if (Value == PreciseRangeLast + 1) - return NonLargeGroup; - return PreciseValue; - } -}; - -static const char *getMIName(const MemIntrinsic *MI) { - switch (MI->getIntrinsicID()) { - case Intrinsic::memcpy: - return "memcpy"; - case Intrinsic::memmove: - return "memmove"; - case Intrinsic::memset: - return "memset"; - default: - return "unknown"; - } -} - -static bool isProfitable(uint64_t Count, uint64_t TotalCount) { - assert(Count <= TotalCount); - if (Count < MemOPCountThreshold) - return false; - if (Count < TotalCount * MemOPPercentThreshold / 100) - return false; - return true; -} - -static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, - uint64_t Denom) { - if (!MemOPScaleCount) - return Count; - bool Overflowed; - uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); - return ScaleCount / Denom; -} - -bool MemOPSizeOpt::perform(MemIntrinsic *MI) { - assert(MI); - if (MI->getIntrinsicID() == Intrinsic::memmove) - return false; - - uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; - uint64_t TotalCount; - if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, - ValueDataArray.get(), NumVals, TotalCount)) - return false; - - uint64_t ActualCount = TotalCount; - uint64_t SavedTotalCount = TotalCount; - if (MemOPScaleCount) { - auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); - if (!BBEdgeCount) - return false; - ActualCount = *BBEdgeCount; - } - - ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); - DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount - << "\n"); - DEBUG( - for (auto &VD - : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); - - if (ActualCount < MemOPCountThreshold) - return false; - // Skip if the total value profiled count is 0, in which case we can't - // scale up the counts properly (and there is no profitable transformation). - if (TotalCount == 0) - return false; - - TotalCount = ActualCount; - if (MemOPScaleCount) - DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount - << " denominator = " << SavedTotalCount << "\n"); - - // Keeping track of the count of the default case: - uint64_t RemainCount = TotalCount; - SmallVector<uint64_t, 16> SizeIds; - SmallVector<uint64_t, 16> CaseCounts; - uint64_t MaxCount = 0; - unsigned Version = 0; - // Default case is in the front -- save the slot here. - CaseCounts.push_back(0); - for (auto &VD : VDs) { - int64_t V = VD.Value; - uint64_t C = VD.Count; - if (MemOPScaleCount) - C = getScaledCount(C, ActualCount, SavedTotalCount); - - // Only care precise value here. - if (getMemOPSizeKind(V) != PreciseValue) - continue; - - // ValueCounts are sorted on the count. Break at the first un-profitable - // value. 
- if (!isProfitable(C, RemainCount)) - break; - - SizeIds.push_back(V); - CaseCounts.push_back(C); - if (C > MaxCount) - MaxCount = C; - - assert(RemainCount >= C); - RemainCount -= C; - - if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) - break; - } - - if (Version == 0) - return false; - - CaseCounts[0] = RemainCount; - if (RemainCount > MaxCount) - MaxCount = RemainCount; - - uint64_t SumForOpt = TotalCount - RemainCount; - - DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version - << " Versions (covering " << SumForOpt << " out of " - << TotalCount << ")\n"); - - // mem_op(..., size) - // ==> - // switch (size) { - // case s1: - // mem_op(..., s1); - // goto merge_bb; - // case s2: - // mem_op(..., s2); - // goto merge_bb; - // ... - // default: - // mem_op(..., size); - // goto merge_bb; - // } - // merge_bb: - - BasicBlock *BB = MI->getParent(); - DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); - DEBUG(dbgs() << *BB << "\n"); - auto OrigBBFreq = BFI.getBlockFreq(BB); - - BasicBlock *DefaultBB = SplitBlock(BB, MI); - BasicBlock::iterator It(*MI); - ++It; - assert(It != DefaultBB->end()); - BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); - MergeBB->setName("MemOP.Merge"); - BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); - DefaultBB->setName("MemOP.Default"); - - auto &Ctx = Func.getContext(); - IRBuilder<> IRB(BB); - BB->getTerminator()->eraseFromParent(); - Value *SizeVar = MI->getLength(); - SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); - - // Clear the value profile data. - MI->setMetadata(LLVMContext::MD_prof, nullptr); - - DEBUG(dbgs() << "\n\n== Basic Block After==\n"); - - for (uint64_t SizeId : SizeIds) { - ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); - BasicBlock *CaseBB = BasicBlock::Create( - Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); - Instruction *NewInst = MI->clone(); - // Fix the argument. 
- dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId); - CaseBB->getInstList().push_back(NewInst); - IRBuilder<> IRBCase(CaseBB); - IRBCase.CreateBr(MergeBB); - SI->addCase(CaseSizeId, CaseBB); - DEBUG(dbgs() << *CaseBB << "\n"); - } - setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); - - DEBUG(dbgs() << *BB << "\n"); - DEBUG(dbgs() << *DefaultBB << "\n"); - DEBUG(dbgs() << *MergeBB << "\n"); - - emitOptimizationRemark(Func.getContext(), "memop-opt", Func, - MI->getDebugLoc(), - Twine("optimize ") + getMIName(MI) + " with count " + - Twine(SumForOpt) + " out of " + Twine(TotalCount) + - " for " + Twine(Version) + " versions"); - - return true; -} -} // namespace - -static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) { - if (DisableMemOPOPT) - return false; - - if (F.hasFnAttribute(Attribute::OptimizeForSize)) - return false; - MemOPSizeOpt MemOPSizeOpt(F, BFI); - MemOPSizeOpt.perform(); - return MemOPSizeOpt.isChanged(); -} - -bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { - BlockFrequencyInfo &BFI = - getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); - return PGOMemOPSizeOptImpl(F, BFI); -} - -namespace llvm { -char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; - -PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, - FunctionAnalysisManager &FAM) { - auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); - bool Changed = PGOMemOPSizeOptImpl(F, BFI); - if (!Changed) - return PreservedAnalyses::all(); - auto PA = PreservedAnalyses(); - PA.preserve<GlobalsAA>(); - return PA; -} -} // namespace llvm diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index f83c930ca61be..37f88d5f95f18 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -343,14 +343,24 @@ static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { static inline bool shouldRecordFunctionAddr(Function *F) { // Check the linkage + bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage(); if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && - !F->hasAvailableExternallyLinkage()) + !HasAvailableExternallyLinkage) return true; + + // A function marked 'alwaysinline' with available_externally linkage can't + // have its address taken. Doing so would create an undefined external ref to + // the function, which would fail to link. + if (HasAvailableExternallyLinkage && + F->hasFnAttribute(Attribute::AlwaysInline)) + return false; + // Prohibit function address recording if the function is both internal and // COMDAT. This avoids the profile data variable referencing internal symbols // in COMDAT. if (F->hasLocalLinkage() && F->hasComdat()) return false; + // Check uses of this function for other than direct calls or invokes to it. // Inline virtual functions have linkeOnceODR linkage. When a key method // exists, the vtable will only be emitted in the TU where the key method diff --git a/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp new file mode 100644 index 0000000000000..0bc9ddfbe4d33 --- /dev/null +++ b/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -0,0 +1,419 @@ +//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+                        cl::init(1000),
+                        cl::desc("The minimum count to optimize memory "
+                                 "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+                                     cl::Hidden, cl::desc("Disable optimize"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+                          cl::Hidden, cl::ZeroOrMore,
+                          cl::desc("The percentage threshold for the "
+                                   "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The max version for the optimized memory "
+                             "intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             "block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                      "Optimize memory intrinsic using its size value profile",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                    "Optimize memory intrinsic using its size value profile",
+                    false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+  return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+      : Func(Func), BFI(BFI), Changed(false) {
+    ValueDataArray =
+        llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from option MemOPSizeRange.
+    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+                                PreciseRangeLast);
+  }
+  bool isChanged() const { return Changed; }
+  void perform() {
+    WorkList.clear();
+    visit(Func);
+
+    for (auto &MI : WorkList) {
+      ++NumOfPGOMemOPAnnotate;
+      if (perform(MI)) {
+        Changed = true;
+        ++NumOfPGOMemOPOpt;
+        DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
+                     << " is transformed.\n");
+      }
+    }
+  }
+
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Don't perform this on constant-length calls.
+    if (dyn_cast<ConstantInt>(Length))
+      return;
+    WorkList.push_back(&MI);
+  }
+
+private:
+  Function &Func;
+  BlockFrequencyInfo &BFI;
+  bool Changed;
+  std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+  int64_t PreciseRangeLast;
+  // The space to read the profile annotation.
+  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+  bool perform(MemIntrinsic *MI);
+
+  // This kind shows which group the value falls in. For PreciseValue, we have
+  // the profile count for that value. LargeGroup groups the values that are in
+  // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
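
Before the enum declaration that follows, a toy rendering of this three-way classification may help (plain C++, not LLVM code). The numbers in main() are assumptions for illustration, a precise range ending at 8 and a large-size sentinel of 4096, not values taken from the pass:

#include <cassert>
#include <cstdint>

enum Kind { Precise, NonLarge, Large };

Kind classify(int64_t V, int64_t PreciseRangeLast, int64_t SizeLarge) {
  if (SizeLarge != 0 && V == SizeLarge)
    return Large;              // stands for all sizes >= SizeLarge
  if (V == PreciseRangeLast + 1)
    return NonLarge;           // everything between the two groups
  return Precise;              // an exact, individually profiled size
}

int main() {
  assert(classify(5, 8, 4096) == Precise);
  assert(classify(9, 8, 4096) == NonLarge);
  assert(classify(4096, 8, 4096) == Large);
  return 0;
}
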
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; + + MemOPSizeKind getMemOPSizeKind(int64_t Value) const { + if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) + return LargeGroup; + if (Value == PreciseRangeLast + 1) + return NonLargeGroup; + return PreciseValue; + } +}; + +static const char *getMIName(const MemIntrinsic *MI) { + switch (MI->getIntrinsicID()) { + case Intrinsic::memcpy: + return "memcpy"; + case Intrinsic::memmove: + return "memmove"; + case Intrinsic::memset: + return "memset"; + default: + return "unknown"; + } +} + +static bool isProfitable(uint64_t Count, uint64_t TotalCount) { + assert(Count <= TotalCount); + if (Count < MemOPCountThreshold) + return false; + if (Count < TotalCount * MemOPPercentThreshold / 100) + return false; + return true; +} + +static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, + uint64_t Denom) { + if (!MemOPScaleCount) + return Count; + bool Overflowed; + uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); + return ScaleCount / Denom; +} + +bool MemOPSizeOpt::perform(MemIntrinsic *MI) { + assert(MI); + if (MI->getIntrinsicID() == Intrinsic::memmove) + return false; + + uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; + uint64_t TotalCount; + if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, + ValueDataArray.get(), NumVals, TotalCount)) + return false; + + uint64_t ActualCount = TotalCount; + uint64_t SavedTotalCount = TotalCount; + if (MemOPScaleCount) { + auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); + if (!BBEdgeCount) + return false; + ActualCount = *BBEdgeCount; + } + + ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); + DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount + << "\n"); + DEBUG( + for (auto &VD + : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); + + if (ActualCount < MemOPCountThreshold) + return false; + // Skip if the total value profiled count is 0, in which case we can't + // scale up the counts properly (and there is no profitable transformation). + if (TotalCount == 0) + return false; + + TotalCount = ActualCount; + if (MemOPScaleCount) + DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount + << " denominator = " << SavedTotalCount << "\n"); + + // Keeping track of the count of the default case: + uint64_t RemainCount = TotalCount; + uint64_t SavedRemainCount = SavedTotalCount; + SmallVector<uint64_t, 16> SizeIds; + SmallVector<uint64_t, 16> CaseCounts; + uint64_t MaxCount = 0; + unsigned Version = 0; + // Default case is in the front -- save the slot here. + CaseCounts.push_back(0); + for (auto &VD : VDs) { + int64_t V = VD.Value; + uint64_t C = VD.Count; + if (MemOPScaleCount) + C = getScaledCount(C, ActualCount, SavedTotalCount); + + // Only care precise value here. + if (getMemOPSizeKind(V) != PreciseValue) + continue; + + // ValueCounts are sorted on the count. Break at the first un-profitable + // value. 
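
To make the cutoff concrete before the loop continues below: with the default thresholds (pgo-memop-count-threshold = 1000 and pgo-memop-percent-threshold = 40), isProfitable() behaves as in this small worked example. Plain C++ and illustrative only; note the real check compares each candidate against the remaining total, not the original one:

#include <cassert>
#include <cstdint>

bool profitable(uint64_t Count, uint64_t Total) {
  return Count >= 1000 && Count >= Total * 40 / 100;
}

int main() {
  assert(profitable(5000, 10000));  // 50% of a hot site: worth a version
  assert(!profitable(900, 1000));   // high fraction, but under the count floor
  assert(!profitable(3000, 10000)); // 30% < 40%: left to the default case
  return 0;
}
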
+ if (!isProfitable(C, RemainCount)) + break; + + SizeIds.push_back(V); + CaseCounts.push_back(C); + if (C > MaxCount) + MaxCount = C; + + assert(RemainCount >= C); + RemainCount -= C; + assert(SavedRemainCount >= VD.Count); + SavedRemainCount -= VD.Count; + + if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) + break; + } + + if (Version == 0) + return false; + + CaseCounts[0] = RemainCount; + if (RemainCount > MaxCount) + MaxCount = RemainCount; + + uint64_t SumForOpt = TotalCount - RemainCount; + + DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version + << " Versions (covering " << SumForOpt << " out of " + << TotalCount << ")\n"); + + // mem_op(..., size) + // ==> + // switch (size) { + // case s1: + // mem_op(..., s1); + // goto merge_bb; + // case s2: + // mem_op(..., s2); + // goto merge_bb; + // ... + // default: + // mem_op(..., size); + // goto merge_bb; + // } + // merge_bb: + + BasicBlock *BB = MI->getParent(); + DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); + DEBUG(dbgs() << *BB << "\n"); + auto OrigBBFreq = BFI.getBlockFreq(BB); + + BasicBlock *DefaultBB = SplitBlock(BB, MI); + BasicBlock::iterator It(*MI); + ++It; + assert(It != DefaultBB->end()); + BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); + MergeBB->setName("MemOP.Merge"); + BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); + DefaultBB->setName("MemOP.Default"); + + auto &Ctx = Func.getContext(); + IRBuilder<> IRB(BB); + BB->getTerminator()->eraseFromParent(); + Value *SizeVar = MI->getLength(); + SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); + + // Clear the value profile data. + MI->setMetadata(LLVMContext::MD_prof, nullptr); + // If all promoted, we don't need the MD.prof metadata. + if (SavedRemainCount > 0 || Version != NumVals) + // Otherwise we need update with the un-promoted records back. + annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), + SavedRemainCount, IPVK_MemOPSize, NumVals); + + DEBUG(dbgs() << "\n\n== Basic Block After==\n"); + + for (uint64_t SizeId : SizeIds) { + ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId); + BasicBlock *CaseBB = BasicBlock::Create( + Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); + Instruction *NewInst = MI->clone(); + // Fix the argument. 
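
Stepping out of the loop for a moment: at the C level, the rewrite diagrammed above amounts to the sketch below. Sizes 8 and 32 are made-up stand-ins for whatever the profile says are the hottest lengths:

#include <cstring>

void memcpy_versioned(void *Dst, const void *Src, unsigned long Size) {
  switch (Size) {
  case 8:
    std::memcpy(Dst, Src, 8);     // constant size: expandable inline
    break;
  case 32:
    std::memcpy(Dst, Src, 32);    // another profiled-hot size
    break;
  default:
    std::memcpy(Dst, Src, Size);  // the original, variable-size operation
    break;
  }
}
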
+ dyn_cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId); + CaseBB->getInstList().push_back(NewInst); + IRBuilder<> IRBCase(CaseBB); + IRBCase.CreateBr(MergeBB); + SI->addCase(CaseSizeId, CaseBB); + DEBUG(dbgs() << *CaseBB << "\n"); + } + setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); + + DEBUG(dbgs() << *BB << "\n"); + DEBUG(dbgs() << *DefaultBB << "\n"); + DEBUG(dbgs() << *MergeBB << "\n"); + + emitOptimizationRemark(Func.getContext(), "memop-opt", Func, + MI->getDebugLoc(), + Twine("optimize ") + getMIName(MI) + " with count " + + Twine(SumForOpt) + " out of " + Twine(TotalCount) + + " for " + Twine(Version) + " versions"); + + return true; +} +} // namespace + +static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) { + if (DisableMemOPOPT) + return false; + + if (F.hasFnAttribute(Attribute::OptimizeForSize)) + return false; + MemOPSizeOpt MemOPSizeOpt(F, BFI); + MemOPSizeOpt.perform(); + return MemOPSizeOpt.isChanged(); +} + +bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { + BlockFrequencyInfo &BFI = + getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); + return PGOMemOPSizeOptImpl(F, BFI); +} + +namespace llvm { +char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; + +PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, + FunctionAnalysisManager &FAM) { + auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + bool Changed = PGOMemOPSizeOptImpl(F, BFI); + if (!Changed) + return PreservedAnalyses::all(); + auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; +} +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 8aa40d1759de6..e3c36c98ab0db 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -61,7 +61,7 @@ static const char *const SanCov8bitCountersInitName = "__sanitizer_cov_8bit_counters_init"; static const char *const SanCovGuardsSectionName = "sancov_guards"; -static const char *const SanCovCountersSectionName = "sancov_counters"; +static const char *const SanCovCountersSectionName = "sancov_cntrs"; static cl::opt<int> ClCoverageLevel( "sanitizer-coverage-level", diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 7b625b9b136ec..2a4c9526dfcd9 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -551,7 +551,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { BBChanged = true; } } - }; + } FnChanged |= BBChanged; } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index c4f450949e6de..0f92760a874b5 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -506,7 +507,7 @@ private: if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) { // Optimize MemoryPhi nodes that may become redundant by having all the // same input values once MA is removed. 
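
The EarlyCSE hunk below swaps SmallVector for SmallSetVector so that each MemoryPhi is queued for the redundancy check at most once. A minimal standalone sketch of the container's semantics (assumes LLVM's ADT headers are available):

#include "llvm/ADT/SetVector.h"
#include <cassert>

int main() {
  llvm::SmallSetVector<int, 4> Phis;
  Phis.insert(1);
  Phis.insert(2);
  Phis.insert(1);                        // duplicate: ignored, unlike push_back
  assert(Phis.size() == 2);
  assert(Phis[0] == 1 && Phis[1] == 2);  // insertion order is preserved
  return 0;
}
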
- SmallVector<MemoryPhi *, 4> PhisToCheck; + SmallSetVector<MemoryPhi *, 4> PhisToCheck; SmallVector<MemoryAccess *, 8> WorkQueue; WorkQueue.push_back(MA); // Process MemoryPhi nodes in FIFO order using a ever-growing vector since @@ -517,7 +518,7 @@ private: for (auto *U : WI->users()) if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U)) - PhisToCheck.push_back(MP); + PhisToCheck.insert(MP); MSSAUpdater->removeMemoryAccess(WI); diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index 8634816e702fb..5fd2dfc118b4b 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -64,6 +64,17 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); +namespace llvm { +namespace GVNExpression { + +LLVM_DUMP_METHOD void Expression::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + +} +} + namespace { static bool isMemoryInst(const Instruction *I) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index b706152f30c81..8b435050ac769 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -983,21 +983,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); + if (StoreSize != 1) + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), + SCEV::FlagNUW); + + Value *NumBytes = + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must neccessarily be unordered // by previous checks. - if (!SI->isAtomic() && !LI->isAtomic()) { - if (StoreSize != 1) - NumBytesS = SE->getMulExpr( - NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); - - Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); - + if (!SI->isAtomic() && !LI->isAtomic()) NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); - } else { + else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. if (Align < StoreSize) @@ -1010,11 +1010,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) return false; - Value *NumElements = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + NewCall = Builder.CreateElementUnorderedAtomicMemCpy( + StoreBasePtr, LoadBasePtr, NumBytes, StoreSize); - NewCall = Builder.CreateElementAtomicMemCpy(StoreBasePtr, LoadBasePtr, - NumElements, StoreSize); // Propagate alignment info onto the pointer args. Note that unordered // atomic loads/stores are *required* by the spec to have an alignment // but non-atomic loads/stores may not. diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 6926aae37963f..cbbd55512c9f5 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -2195,7 +2195,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // For a given expression, mark the phi of ops instructions that could have // changed as a result. 
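
Stepping back to the LoopIdiomRecognize hunk above: the byte count fed to both memcpy forms is (backedge-taken count + 1) * store size, and the change hoists that computation so the plain and element-atomic paths share one NumBytes value. A small worked example in plain C++, ignoring the SCEV machinery and the no-wrap flags:

#include <cassert>
#include <cstdint>

uint64_t numBytes(uint64_t BECount, uint64_t StoreSize) {
  uint64_t TripCount = BECount + 1;   // trip count = backedge count + 1
  return StoreSize != 1 ? TripCount * StoreSize : TripCount;
}

int main() {
  // A loop that copies 100 i32 elements has a backedge-taken count of 99.
  assert(numBytes(99, 4) == 400);
  return 0;
}
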
void NewGVN::markPhiOfOpsChanged(const Expression *E) { - touchAndErase(ExpressionToPhiOfOps, E); + touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E)); } // Perform congruence finding on a given value numbering expression. @@ -3561,7 +3561,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in // the block and subtract the pred count, but it's more complicated. if (ReachablePredCount.lookup(BB) != - std::distance(pred_begin(BB), pred_end(BB))) { + unsigned(std::distance(pred_begin(BB), pred_end(BB)))) { for (auto II = BB->begin(); isa<PHINode>(II); ++II) { auto &PHI = cast<PHINode>(*II); ReplaceUnreachablePHIArgs(PHI, BB); diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index bae7911d222c9..a52739bb76f71 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidAttributes asserts that shouldRewriteStatepointsIn + // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. - stripNonValidAttributes(M); + stripNonValidAttributesAndMetadata(M); } return Changed; @@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass { AU.addRequired<TargetTransformInfoWrapperPass>(); } - /// The IR fed into RewriteStatepointsForGC may have had attributes implying - /// dereferenceability that are no longer valid/correct after - /// RewriteStatepointsForGC has run. This is because semantically, after + /// The IR fed into RewriteStatepointsForGC may have had attributes and + /// metadata implying dereferenceability that are no longer valid/correct after + /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributes (conservatively) restores correctness - /// by erasing all attributes in the module that externally imply - /// dereferenceability. - /// Similar reasoning also applies to the noalias attributes. gc.statepoint - /// can touch the entire heap including noalias objects. - void stripNonValidAttributes(Module &M); - - // Helpers for stripNonValidAttributes - void stripNonValidAttributesFromBody(Function &F); + /// heap. stripNonValidAttributesAndMetadata (conservatively) restores + /// correctness by erasing all attributes in the module that externally imply + /// dereferenceability. Similar reasoning also applies to the noalias + /// attributes and metadata. gc.statepoint can touch the entire heap including + /// noalias objects. + void stripNonValidAttributesAndMetadata(Module &M); + + // Helpers for stripNonValidAttributesAndMetadata + void stripNonValidAttributesAndMetadataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); + // Certain metadata on instructions are invalid after running RS4GC. + // Optimizations that run after RS4GC can incorrectly use this metadata to + // optimize functions. We drop such metadata on the instruction. 
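
The RewriteStatepointsForGC change above is built on a whitelist pattern: drop every metadata kind that is not explicitly known to stay valid. A standalone usage sketch of that pattern (assumes LLVM headers; the two retained kinds here are arbitrary examples, not the pass's actual list):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

// Keep only a fixed set of metadata kinds on an instruction; everything else
// except debug info is removed.
void keepOnlySafeMetadata(llvm::Instruction &I) {
  unsigned KnownIDs[] = {llvm::LLVMContext::MD_tbaa,
                         llvm::LLVMContext::MD_range};
  I.dropUnknownNonDebugMetadata(KnownIDs);
}
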
+ void stripInvalidMetadataFromInstruction(Instruction &I); }; } // namespace @@ -2306,13 +2310,44 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); } -void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { +void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) { + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return; + // These are the attributes that are still valid on loads and stores after + // RS4GC. + // The metadata implying dereferenceability and noalias are (conservatively) + // dropped. This is because semantically, after RewriteStatepointsForGC runs, + // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can + // touch the entire heap including noalias objects. Note: The reasoning is + // same as stripping the dereferenceability and noalias attributes that are + // analogous to the metadata counterparts. + // We also drop the invariant.load metadata on the load because that metadata + // implies the address operand to the load points to memory that is never + // changed once it became dereferenceable. This is no longer true after RS4GC. + // Similar reasoning applies to invariant.group metadata, which applies to + // loads within a group. + unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_alias_scope, + LLVMContext::MD_nontemporal, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_type}; + + // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC. + I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); + +} + +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); @@ -2333,6 +2368,8 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); } + stripInvalidMetadataFromInstruction(I); + if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) @@ -2357,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { #ifndef NDEBUG assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2366,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesFromBody(F); + stripNonValidAttributesAndMetadataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 815492ac354c0..c6929c33b3e9e 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -515,10 +515,6 @@ private: void visitCmpInst(CmpInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); - void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); } - void visitFuncletPadInst(FuncletPadInst &FPI) { - markOverdefined(&FPI); - } void visitCatchSwitchInst(CatchSwitchInst 
&CPI) { markOverdefined(&CPI); visitTerminatorInst(CPI); @@ -539,13 +535,6 @@ private: void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { - markOverdefined(&I); - } - void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } - void visitAllocaInst (Instruction &I) { markOverdefined(&I); } - void visitVAArgInst (Instruction &I) { markOverdefined(&I); } - void visitInstruction(Instruction &I) { // If a new instruction is added to LLVM that we don't handle. DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 24d28a6c28311..5d57ed9718fb6 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -142,8 +142,139 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { return false; } -void CodeExtractor::findAllocas(ValueSet &SinkCands) const { +static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { + BasicBlock *CommonExitBlock = nullptr; + auto hasNonCommonExitSucc = [&](BasicBlock *Block) { + for (auto *Succ : successors(Block)) { + // Internal edges, ok. + if (Blocks.count(Succ)) + continue; + if (!CommonExitBlock) { + CommonExitBlock = Succ; + continue; + } + if (CommonExitBlock == Succ) + continue; + + return true; + } + return false; + }; + + if (any_of(Blocks, hasNonCommonExitSucc)) + return nullptr; + + return CommonExitBlock; +} + +bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( + Instruction *Addr) const { + AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + for (Instruction &II : BB) { + + if (isa<DbgInfoIntrinsic>(II)) + continue; + + unsigned Opcode = II.getOpcode(); + Value *MemAddr = nullptr; + switch (Opcode) { + case Instruction::Store: + case Instruction::Load: { + if (Opcode == Instruction::Store) { + StoreInst *SI = cast<StoreInst>(&II); + MemAddr = SI->getPointerOperand(); + } else { + LoadInst *LI = cast<LoadInst>(&II); + MemAddr = LI->getPointerOperand(); + } + // Global variable can not be aliased with locals. + if (dyn_cast<Constant>(MemAddr)) + break; + Value *Base = MemAddr->stripInBoundsConstantOffsets(); + if (!dyn_cast<AllocaInst>(Base) || Base == AI) + return false; + break; + } + default: { + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); + if (IntrInst) { + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || + IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + break; + return false; + } + // Treat all the other cases conservatively if it has side effects. 
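
The scan in isLegalToShrinkwrapLifetimeMarkers visits every instruction outside the extraction region and bails on anything that might observe the alloca. A toy model of that conservative walk, with the instruction categories reduced to the three cases this function distinguishes (all types and fields here are invented for illustration):

#include <vector>

enum class Kind { LifetimeMarker, MemOp, Other };
struct Inst {
  Kind K;
  bool TouchesAlloca;    // for MemOp: may this access alias the alloca?
  bool HasSideEffects;   // for Other: conservative catch-all
};

bool legalToShrinkwrap(const std::vector<Inst> &OutsideRegion) {
  for (const Inst &I : OutsideRegion) {
    switch (I.K) {
    case Kind::LifetimeMarker:
      break;                        // markers themselves are fine
    case Kind::MemOp:
      if (I.TouchesAlloca)
        return false;               // the alloca escapes the region
      break;
    case Kind::Other:
      if (I.HasSideEffects)
        return false;               // anything else: treat conservatively
      break;
    }
  }
  return true;
}
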
+ if (II.mayHaveSideEffects()) + return false; + } + } + } + } + + return true; +} + +BasicBlock * +CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { + BasicBlock *SinglePredFromOutlineRegion = nullptr; + assert(!Blocks.count(CommonExitBlock) && + "Expect a block outside the region!"); + for (auto *Pred : predecessors(CommonExitBlock)) { + if (!Blocks.count(Pred)) + continue; + if (!SinglePredFromOutlineRegion) { + SinglePredFromOutlineRegion = Pred; + } else if (SinglePredFromOutlineRegion != Pred) { + SinglePredFromOutlineRegion = nullptr; + break; + } + } + + if (SinglePredFromOutlineRegion) + return SinglePredFromOutlineRegion; + +#ifndef NDEBUG + auto getFirstPHI = [](BasicBlock *BB) { + BasicBlock::iterator I = BB->begin(); + PHINode *FirstPhi = nullptr; + while (I != BB->end()) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + break; + if (!FirstPhi) { + FirstPhi = Phi; + break; + } + } + return FirstPhi; + }; + // If there are any phi nodes, the single pred either exists or has already + // be created before code extraction. + assert(!getFirstPHI(CommonExitBlock) && "Phi not expected"); +#endif + + BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock( + CommonExitBlock->getFirstNonPHI()->getIterator()); + + for (auto *Pred : predecessors(CommonExitBlock)) { + if (Blocks.count(Pred)) + continue; + Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock); + } + // Now add the old exit block to the outline region. + Blocks.insert(CommonExitBlock); + return CommonExitBlock; +} + +void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, + BasicBlock *&ExitBlock) const { Function *Func = (*Blocks.begin())->getParent(); + ExitBlock = getCommonExitBlock(Blocks); + for (BasicBlock &BB : *Func) { if (Blocks.count(&BB)) continue; @@ -152,49 +283,96 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands) const { if (!AI) continue; - // Returns true if matching life time markers are found within - // the outlined region. - auto GetLifeTimeMarkers = [&](Instruction *Addr) { + // Find the pair of life time markers for address 'Addr' that are either + // defined inside the outline region or can legally be shrinkwrapped into + // the outline region. If there are not other untracked uses of the + // address, return the pair of markers if found; otherwise return a pair + // of nullptr. + auto GetLifeTimeMarkers = + [&](Instruction *Addr, bool &SinkLifeStart, + bool &HoistLifeEnd) -> std::pair<Instruction *, Instruction *> { Instruction *LifeStart = nullptr, *LifeEnd = nullptr; - for (User *U : Addr->users()) { - if (!definedInRegion(Blocks, U)) - return false; + for (User *U : Addr->users()) { IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U); if (IntrInst) { - if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) { + // Do not handle the case where AI has multiple start markers. + if (LifeStart) + return std::make_pair<Instruction *>(nullptr, nullptr); LifeStart = IntrInst; - if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + } + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) { + if (LifeEnd) + return std::make_pair<Instruction *>(nullptr, nullptr); LifeEnd = IntrInst; + } + continue; } + // Find untracked uses of the address, bail. 
+ if (!definedInRegion(Blocks, U)) + return std::make_pair<Instruction *>(nullptr, nullptr); } - return LifeStart && LifeEnd; + + if (!LifeStart || !LifeEnd) + return std::make_pair<Instruction *>(nullptr, nullptr); + + SinkLifeStart = !definedInRegion(Blocks, LifeStart); + HoistLifeEnd = !definedInRegion(Blocks, LifeEnd); + // Do legality Check. + if ((SinkLifeStart || HoistLifeEnd) && + !isLegalToShrinkwrapLifetimeMarkers(Addr)) + return std::make_pair<Instruction *>(nullptr, nullptr); + + // Check to see if we have a place to do hoisting, if not, bail. + if (HoistLifeEnd && !ExitBlock) + return std::make_pair<Instruction *>(nullptr, nullptr); + + return std::make_pair(LifeStart, LifeEnd); }; - if (GetLifeTimeMarkers(AI)) { + bool SinkLifeStart = false, HoistLifeEnd = false; + auto Markers = GetLifeTimeMarkers(AI, SinkLifeStart, HoistLifeEnd); + + if (Markers.first) { + if (SinkLifeStart) + SinkCands.insert(Markers.first); SinkCands.insert(AI); + if (HoistLifeEnd) + HoistCands.insert(Markers.second); continue; } - // Follow the bitcast: + // Follow the bitcast. Instruction *MarkerAddr = nullptr; for (User *U : AI->users()) { - if (U->stripPointerCasts() == AI) { + + if (U->stripInBoundsConstantOffsets() == AI) { + SinkLifeStart = false; + HoistLifeEnd = false; Instruction *Bitcast = cast<Instruction>(U); - if (GetLifeTimeMarkers(Bitcast)) { + Markers = GetLifeTimeMarkers(Bitcast, SinkLifeStart, HoistLifeEnd); + if (Markers.first) { MarkerAddr = Bitcast; continue; } } + + // Found unknown use of AI. if (!definedInRegion(Blocks, U)) { MarkerAddr = nullptr; break; } } + if (MarkerAddr) { + if (SinkLifeStart) + SinkCands.insert(Markers.first); if (!definedInRegion(Blocks, MarkerAddr)) SinkCands.insert(MarkerAddr); SinkCands.insert(AI); + if (HoistLifeEnd) + HoistCands.insert(Markers.second); } } } @@ -780,7 +958,8 @@ Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) return nullptr; - ValueSet inputs, outputs, SinkingCands; + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; // Assumption: this is a single-entry code region, and the header is the first // block in the region. @@ -819,7 +998,8 @@ Function *CodeExtractor::extractCodeRegion() { "newFuncRoot"); newFuncRoot->getInstList().push_back(BranchInst::Create(header)); - findAllocas(SinkingCands); + findAllocas(SinkingCands, HoistingCands, CommonExit); + assert(HoistingCands.empty() || CommonExit); // Find inputs to, outputs from the code region. findInputsOutputs(inputs, outputs, SinkingCands); @@ -829,6 +1009,13 @@ Function *CodeExtractor::extractCodeRegion() { cast<Instruction>(II)->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt()); + if (!HoistingCands.empty()) { + auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit); + Instruction *TI = HoistToBlock->getTerminator(); + for (auto *II : HoistingCands) + cast<Instruction>(II)->moveBefore(TI); + } + // Calculate the exit blocks for the extracted region and the total exit // weights for each of those blocks. 
DenseMap<BasicBlock *, BlockFrequency> ExitWeights; diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 9e71cba4f1b7a..1260e35e934de 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -460,6 +460,9 @@ void PredicateInfo::buildPredicateInfo() { if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) { if (!BI->isConditional()) continue; + // Can't insert conditional information if they all go to the same place. + if (BI->getSuccessor(0) == BI->getSuccessor(1)) + continue; processBranch(BI, BranchBB, OpsToRename); } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) { processSwitch(SI, BranchBB, OpsToRename); diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 02a5d3dbeadfb..faa14046b1e3c 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -352,7 +352,7 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { return false; typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)( - const SCEV *, const SCEV *, SCEV::NoWrapFlags); + const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)( const SCEV *, Type *); @@ -406,10 +406,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); const SCEV *A = - (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy); + (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0u), + WideTy); const SCEV *B = (SE->*Operation)((SE->*Extension)(LHS, WideTy), - (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap); + (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap, 0u); if (A != B) return false; @@ -530,8 +531,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, return false; const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *, - SCEV::NoWrapFlags); - + SCEV::NoWrapFlags, unsigned); switch (BO->getOpcode()) { default: return false; @@ -560,7 +560,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy); const SCEV *OpAfterExtend = (SE->*GetExprForBO)( SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy), - SCEV::FlagAnyWrap); + SCEV::FlagAnyWrap, 0u); if (ExtendAfterOp == OpAfterExtend) { BO->setHasNoUnsignedWrap(); SE->forgetValue(BO); @@ -572,7 +572,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy); const SCEV *OpAfterExtend = (SE->*GetExprForBO)( SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy), - SCEV::FlagAnyWrap); + SCEV::FlagAnyWrap, 0u); if (ExtendAfterOp == OpAfterExtend) { BO->setHasNoSignedWrap(); SE->forgetValue(BO); |
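
One change in the SimplifyIndVar hunks above is easy to miss: the pointer-to-member typedefs for ScalarEvolution's expression builders gained a trailing unsigned parameter (a recursion depth), which is why every indirect call now passes an extra 0u. A minimal standalone analogue of that signature change (plain C++; the struct and names are invented):

#include <cassert>

struct SE {
  int add(int A, int B, int Flags, unsigned Depth) {
    (void)Flags; (void)Depth;   // unused in this toy version
    return A + B;
  }
};

// The member-pointer type must spell out the new parameter...
typedef int (SE::*OperationFn)(int, int, int, unsigned);

int main() {
  SE S;
  OperationFn Op = &SE::add;
  // ...and every indirect call site must supply it, e.g. as 0u.
  assert((S.*Op)(2, 3, 0, 0u) == 5);
  return 0;
}
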