vendor/llvm/llvm-trunk-r321017

author: Dimitry Andric <dim@FreeBSD.org> 2017-12-18 20:10:56 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2017-12-18 20:10:56 +0000
commit: 044eb2f6afba375a914ac9d8024f8f5142bb912e (patch)
tree: 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/IPO
parent: eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff)
28 files changed, 2669 insertions, 882 deletions
diff --git a/lib/Transforms/IPO/AlwaysInliner.cpp b/lib/Transforms/IPO/AlwaysInliner.cpp
index b7d96007c24a..a4bbc99b1f90 100644
--- a/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -15,15 +15,12 @@
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Transforms/IPO.h"
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 72bae203ee94..b25cbcad3b9d 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -1,4 +1,4 @@
-//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
+//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -31,30 +31,59 @@
 
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <map>
 #include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "argpromotion"
@@ -65,7 +94,7 @@ STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
 STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
 
 /// A vector used to hold the indices of a single GEP instruction
-typedef std::vector<uint64_t> IndicesVector;
+using IndicesVector = std::vector<uint64_t>;
 
 /// DoPromotion - This method actually performs the promotion of the specified
 /// arguments, and returns the new function.  At this point, we know that it's
@@ -75,13 +104,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
             SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
             Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
                 ReplaceCallSite) {
-
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has modified arguments.
   FunctionType *FTy = F->getFunctionType();
   std::vector<Type *> Params;
 
-  typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
+  using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
 
   // ScalarizedElements - If we are promoting a pointer that has elements
   // accessed out of it, keep track of which elements are accessed so that we
@@ -89,7 +117,6 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
   //
   // Arguments that are directly loaded will have a zero element value here, to
   // handle cases where there are both a direct load and GEP accesses.
-  //
   std::map<Argument *, ScalarizeTable> ScalarizedElements;
 
   // OriginalLoads - Keep track of a representative load instruction from the
@@ -335,7 +362,6 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
 
   // Loop over the argument list, transferring uses of the old arguments over to
   // the new arguments, also transferring over the names as well.
-  //
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
                               I2 = NF->arg_begin();
        I != E; ++I) {
@@ -537,7 +563,7 @@ static void markIndicesSafe(const IndicesVector &ToMark,
 /// arguments passed in.
 static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
                                     AAResults &AAR, unsigned MaxElements) {
-  typedef std::set<IndicesVector> GEPIndicesSet;
+  using GEPIndicesSet = std::set<IndicesVector>;
 
   // Quick exit for unused arguments
   if (Arg->use_empty())
@@ -693,7 +719,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
     BasicBlock *BB = Load->getParent();
 
     MemoryLocation Loc = MemoryLocation::get(Load);
-    if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
+    if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod))
       return false; // Pointer is invalidated!
 
     // Now check every path from the entry block to the load for transparency.
@@ -714,7 +740,6 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
 
 /// \brief Checks if a type could have padding bytes.
 static bool isDenselyPacked(Type *type, const DataLayout &DL) {
-
   // There is no size information, so be conservative.
   if (!type->isSized())
     return false;
@@ -749,7 +774,6 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) {
 
 /// \brief Checks if the padding bytes of an argument could be accessed.
 static bool canPaddingBeAccessed(Argument *arg) {
-
   assert(arg->hasByValAttr());
 
   // Track all the pointers to the argument to make sure they are not captured.
@@ -788,7 +812,6 @@ static bool canPaddingBeAccessed(Argument *arg) {
 /// are any promotable arguments and if it is safe to promote the function (for
 /// example, all callers are direct).  If safe to promote some arguments, it
 /// calls the DoPromotion method.
-///
 static Function *
 promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
                  unsigned MaxElements,
@@ -964,9 +987,17 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
 }
 
 namespace {
+
 /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
-///
 struct ArgPromotion : public CallGraphSCCPass {
+  // Pass identification, replacement for typeid
+  static char ID;
+
+  explicit ArgPromotion(unsigned MaxElements = 3)
+      : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+    initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+  }
+
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
@@ -975,21 +1006,20 @@ struct ArgPromotion : public CallGraphSCCPass {
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override;
-  static char ID; // Pass identification, replacement for typeid
-  explicit ArgPromotion(unsigned MaxElements = 3)
-      : CallGraphSCCPass(ID), MaxElements(MaxElements) {
-    initializeArgPromotionPass(*PassRegistry::getPassRegistry());
-  }
 
 private:
   using llvm::Pass::doInitialization;
+
   bool doInitialization(CallGraph &CG) override;
+
   /// The maximum number of elements to expand, or 0 for unlimited.
   unsigned MaxElements;
 };
-}
+
+} // end anonymous namespace
 
 char ArgPromotion::ID = 0;
+
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                       "Promote 'by reference' arguments to scalars", false,
                       false)
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 67f18a307b9b..397561746f86 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMipo
   AlwaysInliner.cpp
   ArgumentPromotion.cpp
   BarrierNoopPass.cpp
+  CalledValuePropagation.cpp
   ConstantMerge.cpp
   CrossDSOCFI.cpp
   DeadArgumentElimination.cpp
diff --git a/lib/Transforms/IPO/CalledValuePropagation.cpp b/lib/Transforms/IPO/CalledValuePropagation.cpp
new file mode 100644
index 000000000000..c5f6336aa2be
--- /dev/null
+++ b/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -0,0 +1,423 @@
+//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that attaches !callees metadata to
+// indirect call sites. For a given call site, the metadata, if present,
+// indicates the set of functions the call site could possibly target at
+// run-time. This metadata is added to indirect call sites when the set of
+// possible targets can be determined by analysis and is known to be small. The
+// analysis driving the transformation is similar to constant propagation and
+// makes uses of the generic sparse propagation solver.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CalledValuePropagation.h"
+#include "llvm/Analysis/SparsePropagation.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "called-value-propagation"
+
+/// The maximum number of functions to track per lattice value. Once the number
+/// of functions a call site can possibly target exceeds this threshold, it's
+/// lattice value becomes overdefined. The number of possible lattice values is
+/// bounded by Ch(F, M), where F is the number of functions in the module and M
+/// is MaxFunctionsPerValue. As such, this value should be kept very small. We
+/// likely can't do anything useful for call sites with a large number of
+/// possible targets, anyway.
+static cl::opt<unsigned> MaxFunctionsPerValue(
+    "cvp-max-functions-per-value", cl::Hidden, cl::init(4),
+    cl::desc("The maximum number of functions to track per lattice value"));
+
+namespace {
+/// To enable interprocedural analysis, we assign LLVM values to the following
+/// groups. The register group represents SSA registers, the return group
+/// represents the return values of functions, and the memory group represents
+/// in-memory values. An LLVM Value can technically be in more than one group.
+/// It's necessary to distinguish these groups so we can, for example, track a
+/// global variable separately from the value stored at its location.
+enum class IPOGrouping { Register, Return, Memory };
+
+/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings.
+using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>;
+
+/// The lattice value type used by our custom lattice function. It holds the
+/// lattice state, and a set of functions.
+class CVPLatticeVal {
+public:
+  /// The states of the lattice values. Only the FunctionSet state is
+  /// interesting. It indicates the set of functions to which an LLVM value may
+  /// refer.
+  enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked };
+
+  /// Comparator for sorting the functions set. We want to keep the order
+  /// deterministic for testing, etc.
+  struct Compare {
+    bool operator()(const Function *LHS, const Function *RHS) const {
+      return LHS->getName() < RHS->getName();
+    }
+  };
+
+  CVPLatticeVal() : LatticeState(Undefined) {}
+  CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
+  CVPLatticeVal(std::set<Function *, Compare> &&Functions)
+      : LatticeState(FunctionSet), Functions(Functions) {}
+
+  /// Get a reference to the functions held by this lattice value. The number
+  /// of functions will be zero for states other than FunctionSet.
+  const std::set<Function *, Compare> &getFunctions() const {
+    return Functions;
+  }
+
+  /// Returns true if the lattice value is in the FunctionSet state.
+  bool isFunctionSet() const { return LatticeState == FunctionSet; }
+
+  bool operator==(const CVPLatticeVal &RHS) const {
+    return LatticeState == RHS.LatticeState && Functions == RHS.Functions;
+  }
+
+  bool operator!=(const CVPLatticeVal &RHS) const {
+    return LatticeState != RHS.LatticeState || Functions != RHS.Functions;
+  }
+
+private:
+  /// Holds the state this lattice value is in.
+  CVPLatticeStateTy LatticeState;
+
+  /// Holds functions indicating the possible targets of call sites. This set
+  /// is empty for lattice values in the undefined, overdefined, and untracked
+  /// states. The maximum size of the set is controlled by
+  /// MaxFunctionsPerValue. Since most LLVM values are expected to be in
+  /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
+  /// small and efficiently copyable.
+  std::set<Function *, Compare> Functions;
+};
+
+/// The custom lattice function used by the generic sparse propagation solver.
+/// It handles merging lattice values and computing new lattice values for
+/// constants, arguments, values returned from trackable functions, and values
+/// located in trackable global variables. It also computes the lattice values
+/// that change as a result of executing instructions.
+class CVPLatticeFunc
+    : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> {
+public:
+  CVPLatticeFunc()
+      : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined),
+                                CVPLatticeVal(CVPLatticeVal::Overdefined),
+                                CVPLatticeVal(CVPLatticeVal::Untracked)) {}
+
+  /// Compute and return a CVPLatticeVal for the given CVPLatticeKey.
+  CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override {
+    switch (Key.getInt()) {
+    case IPOGrouping::Register:
+      if (isa<Instruction>(Key.getPointer())) {
+        return getUndefVal();
+      } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) {
+        if (canTrackArgumentsInterprocedurally(A->getParent()))
+          return getUndefVal();
+      } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) {
+        return computeConstant(C);
+      }
+      return getOverdefinedVal();
+    case IPOGrouping::Memory:
+    case IPOGrouping::Return:
+      if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) {
+        if (canTrackGlobalVariableInterprocedurally(GV))
+          return computeConstant(GV->getInitializer());
+      } else if (auto *F = cast<Function>(Key.getPointer()))
+        if (canTrackReturnsInterprocedurally(F))
+          return getUndefVal();
+    }
+    return getOverdefinedVal();
+  }
+
+  /// Merge the two given lattice values. The interesting cases are merging two
+  /// FunctionSet values and a FunctionSet value with an Undefined value. For
+  /// these cases, we simply union the function sets. If the size of the union
+  /// is greater than the maximum functions we track, the merged value is
+  /// overdefined.
+  CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override {
+    if (X == getOverdefinedVal() || Y == getOverdefinedVal())
+      return getOverdefinedVal();
+    if (X == getUndefVal() && Y == getUndefVal())
+      return getUndefVal();
+    std::set<Function *, CVPLatticeVal::Compare> Union;
+    std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
+                   Y.getFunctions().begin(), Y.getFunctions().end(),
+                   std::inserter(Union, Union.begin()),
+                   CVPLatticeVal::Compare{});
+    if (Union.size() > MaxFunctionsPerValue)
+      return getOverdefinedVal();
+    return CVPLatticeVal(std::move(Union));
+  }
+
+  /// Compute the lattice values that change as a result of executing the given
+  /// instruction. The changed values are stored in \p ChangedValues. We handle
+  /// just a few kinds of instructions since we're only propagating values that
+  /// can be called.
+  void ComputeInstructionState(
+      Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+      SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
+    switch (I.getOpcode()) {
+    case Instruction::Call:
+      return visitCallSite(cast<CallInst>(&I), ChangedValues, SS);
+    case Instruction::Invoke:
+      return visitCallSite(cast<InvokeInst>(&I), ChangedValues, SS);
+    case Instruction::Load:
+      return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
+    case Instruction::Ret:
+      return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS);
+    case Instruction::Select:
+      return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS);
+    case Instruction::Store:
+      return visitStore(*cast<StoreInst>(&I), ChangedValues, SS);
+    default:
+      return visitInst(I, ChangedValues, SS);
+    }
+  }
+
+  /// Print the given CVPLatticeVal to the specified stream.
+  void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override {
+    if (LV == getUndefVal())
+      OS << "Undefined  ";
+    else if (LV == getOverdefinedVal())
+      OS << "Overdefined";
+    else if (LV == getUntrackedVal())
+      OS << "Untracked  ";
+    else
+      OS << "FunctionSet";
+  }
+
+  /// Print the given CVPLatticeKey to the specified stream.
+  void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override {
+    if (Key.getInt() == IPOGrouping::Register)
+      OS << "<reg> ";
+    else if (Key.getInt() == IPOGrouping::Memory)
+      OS << "<mem> ";
+    else if (Key.getInt() == IPOGrouping::Return)
+      OS << "<ret> ";
+    if (isa<Function>(Key.getPointer()))
+      OS << Key.getPointer()->getName();
+    else
+      OS << *Key.getPointer();
+  }
+
+  /// We collect a set of indirect calls when visiting call sites. This method
+  /// returns a reference to that set.
+  SmallPtrSetImpl<Instruction *> &getIndirectCalls() { return IndirectCalls; }
+
+private:
+  /// Holds the indirect calls we encounter during the analysis. We will attach
+  /// metadata to these calls after the analysis indicating the functions the
+  /// calls can possibly target.
+  SmallPtrSet<Instruction *, 32> IndirectCalls;
+
+  /// Compute a new lattice value for the given constant. The constant, after
+  /// stripping any pointer casts, should be a Function. We ignore null
+  /// pointers as an optimization, since calling these values is undefined
+  /// behavior.
+  CVPLatticeVal computeConstant(Constant *C) {
+    if (isa<ConstantPointerNull>(C))
+      return CVPLatticeVal(CVPLatticeVal::FunctionSet);
+    if (auto *F = dyn_cast<Function>(C->stripPointerCasts()))
+      return CVPLatticeVal({F});
+    return getOverdefinedVal();
+  }
+
+  /// Handle return instructions. The function's return state is the merge of
+  /// the returned value state and the function's return state.
+  void visitReturn(ReturnInst &I,
+                   DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                   SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    Function *F = I.getParent()->getParent();
+    if (F->getReturnType()->isVoidTy())
+      return;
+    auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register);
+    auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+    ChangedValues[RetF] =
+        MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+  }
+
+  /// Handle call sites. The state of a called function's formal arguments is
+  /// the merge of the argument state with the call sites corresponding actual
+  /// argument state. The call site state is the merge of the call site state
+  /// with the returned value state of the called function.
+  void visitCallSite(CallSite CS,
+                     DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                     SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    Function *F = CS.getCalledFunction();
+    Instruction *I = CS.getInstruction();
+    auto RegI = CVPLatticeKey(I, IPOGrouping::Register);
+
+    // If this is an indirect call, save it so we can quickly revisit it when
+    // attaching metadata.
+    if (!F)
+      IndirectCalls.insert(I);
+
+    // If we can't track the function's return values, there's nothing to do.
+    if (!F || !canTrackReturnsInterprocedurally(F)) {
+      ChangedValues[RegI] = getOverdefinedVal();
+      return;
+    }
+
+    // Inform the solver that the called function is executable, and perform
+    // the merges for the arguments and return value.
+    SS.MarkBlockExecutable(&F->front());
+    auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+    for (Argument &A : F->args()) {
+      auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
+      auto RegActual =
+          CVPLatticeKey(CS.getArgument(A.getArgNo()), IPOGrouping::Register);
+      ChangedValues[RegFormal] =
+          MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
+    }
+    ChangedValues[RegI] =
+        MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+  }
+
+  /// Handle select instructions. The select instruction state is the merge the
+  /// true and false value states.
+  void visitSelect(SelectInst &I,
+                   DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                   SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+    auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register);
+    auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register);
+    ChangedValues[RegI] =
+        MergeValues(SS.getValueState(RegT), SS.getValueState(RegF));
+  }
+
+  /// Handle load instructions. If the pointer operand of the load is a global
+  /// variable, we attempt to track the value. The loaded value state is the
+  /// merge of the loaded value state with the global variable state.
+  void visitLoad(LoadInst &I,
+                 DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                 SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+    if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) {
+      auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+      ChangedValues[RegI] =
+          MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+    } else {
+      ChangedValues[RegI] = getOverdefinedVal();
+    }
+  }
+
+  /// Handle store instructions. If the pointer operand of the store is a
+  /// global variable, we attempt to track the value. The global variable state
+  /// is the merge of the stored value state with the global variable state.
+  void visitStore(StoreInst &I,
+                  DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                  SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand());
+    if (!GV)
+      return;
+    auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register);
+    auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+    ChangedValues[MemGV] =
+        MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+  }
+
+  /// Handle all other instructions. All other instructions are marked
+  /// overdefined.
+  void visitInst(Instruction &I,
+                 DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+                 SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+    auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+    ChangedValues[RegI] = getOverdefinedVal();
+  }
+};
+} // namespace
+
+namespace llvm {
+/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver
+/// must translate between LatticeKeys and LLVM Values when adding Values to
+/// its work list and inspecting the state of control-flow related values.
+template <> struct LatticeKeyInfo<CVPLatticeKey> {
+  static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) {
+    return Key.getPointer();
+  }
+  static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) {
+    return CVPLatticeKey(V, IPOGrouping::Register);
+  }
+};
+} // namespace llvm
+
+static bool runCVP(Module &M) {
+  // Our custom lattice function and generic sparse propagation solver.
+  CVPLatticeFunc Lattice;
+  SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice);
+
+  // For each function in the module, if we can't track its arguments, let the
+  // generic solver assume it is executable.
+  for (Function &F : M)
+    if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F))
+      Solver.MarkBlockExecutable(&F.front());
+
+  // Solver our custom lattice. In doing so, we will also build a set of
+  // indirect call sites.
+  Solver.Solve();
+
+  // Attach metadata to the indirect call sites that were collected indicating
+  // the set of functions they can possibly target.
+  bool Changed = false;
+  MDBuilder MDB(M.getContext());
+  for (Instruction *C : Lattice.getIndirectCalls()) {
+    CallSite CS(C);
+    auto RegI = CVPLatticeKey(CS.getCalledValue(), IPOGrouping::Register);
+    CVPLatticeVal LV = Solver.getExistingValueState(RegI);
+    if (!LV.isFunctionSet() || LV.getFunctions().empty())
+      continue;
+    MDNode *Callees = MDB.createCallees(SmallVector<Function *, 4>(
+        LV.getFunctions().begin(), LV.getFunctions().end()));
+    C->setMetadata(LLVMContext::MD_callees, Callees);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses CalledValuePropagationPass::run(Module &M,
+                                                  ModuleAnalysisManager &) {
+  runCVP(M);
+  return PreservedAnalyses::all();
+}
+
+namespace {
+class CalledValuePropagationLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+  CalledValuePropagationLegacyPass() : ModulePass(ID) {
+    initializeCalledValuePropagationLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    if (skipModule(M))
+      return false;
+    return runCVP(M);
+  }
+};
+} // namespace
+
+char CalledValuePropagationLegacyPass::ID = 0;
+INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation",
+                "Called Value Propagation", false, false)
+
+ModulePass *llvm::createCalledValuePropagationPass() {
+  return new CalledValuePropagationLegacyPass();
+}
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index 62b5a9c9ba26..e0b1037053f0 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -19,16 +19,23 @@
 
 #include "llvm/Transforms/IPO/ConstantMerge.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "constmerge"
@@ -102,8 +109,7 @@ static bool mergeConstants(Module &M) {
   // constants together may allow us to merge other constants together if the
   // second level constants have initializers which point to the globals that
   // were just merged.
-  while (1) {
-
+  while (true) {
     // First: Find the canonical constants others will be merged with.
     for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
          GVI != E; ) {
@@ -225,23 +231,27 @@ PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
 }
 
 namespace {
+
 struct ConstantMergeLegacyPass : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
+
   ConstantMergeLegacyPass() : ModulePass(ID) {
     initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
   // For this pass, process all of the globals in the module, eliminating
   // duplicate constants.
-  bool runOnModule(Module &M) {
+  bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
     return mergeConstants(M);
   }
 };
-}
+
+} // end anonymous namespace
 
 char ConstantMergeLegacyPass::ID = 0;
+
 INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
                 "Merge Duplicate Global Constants", false, false)
 
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index d94aa5da8560..886029ea58d5 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -13,9 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/CrossDSOCFI.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -31,7 +31,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
 
@@ -81,7 +80,7 @@ ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
 void CrossDSOCFI::buildCFICheck(Module &M) {
   // FIXME: verify that __cfi_check ends up near the end of the code section,
   // but before the jump slots created in LowerTypeTests.
-  llvm::DenseSet<uint64_t> TypeIds;
+  SetVector<uint64_t> TypeIds;
   SmallVector<MDNode *, 2> Types;
   for (GlobalObject &GO : M.global_objects()) {
     Types.clear();
@@ -115,6 +114,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
   // linker knows about the symbol; this pass replaces the function body.
   F->deleteBody();
   F->setAlignment(4096);
+
+  Triple T(M.getTargetTriple());
+  if (T.isARM() || T.isThumb())
+    F->addFnAttr("target-features", "+thumb-mode");
+
   auto args = F->arg_begin();
   Value &CallSiteTypeId = *(args++);
   CallSiteTypeId.setName("CallSiteTypeId");
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 8e26849ea9e3..5446541550e5 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1,4 +1,4 @@
-//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===//
+//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -20,24 +20,36 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
-#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constant.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <set>
-#include <tuple>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "deadargelim"
@@ -46,9 +58,10 @@ STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
 STATISTIC(NumRetValsEliminated  , "Number of unused return values removed");
 STATISTIC(NumArgumentsReplacedWithUndef, 
           "Number of unread args replaced with undef");
+
 namespace {
+
   /// DAE - The dead argument elimination pass.
-  ///
   class DAE : public ModulePass {
   protected:
     // DAH uses this to specify a different ID.
@@ -56,6 +69,7 @@ namespace {
 
   public:
     static char ID; // Pass identification, replacement for typeid
+
     DAE() : ModulePass(ID) {
       initializeDAEPass(*PassRegistry::getPassRegistry());
     }
@@ -71,33 +85,38 @@ namespace {
 
     virtual bool ShouldHackArguments() const { return false; }
   };
-}
 
+} // end anonymous namespace
 
 char DAE::ID = 0;
+
 INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
 
 namespace {
+
   /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
   /// deletes arguments to functions which are external.  This is only for use
   /// by bugpoint.
   struct DAH : public DAE {
     static char ID;
+
     DAH() : DAE(ID) {}
 
     bool ShouldHackArguments() const override { return true; }
   };
-}
+
+} // end anonymous namespace
 
 char DAH::ID = 0;
+
 INITIALIZE_PASS(DAH, "deadarghaX0r", 
                 "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
                 false, false)
 
 /// createDeadArgEliminationPass - This pass removes arguments from functions
 /// which are not used by the body of the function.
-///
 ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+
 ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
 
 /// DeleteDeadVarargs - If this is an function that takes a ... list, and if
@@ -140,7 +159,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
   // the old function, but doesn't have isVarArg set.
   FunctionType *FTy = Fn.getFunctionType();
 
-  std::vector<Type*> Params(FTy->param_begin(), FTy->param_end());
+  std::vector<Type *> Params(FTy->param_begin(), FTy->param_end());
   FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
                                                 Params, false);
   unsigned NumArgs = Params.size();
@@ -155,7 +174,7 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
   // Loop over all of the callers of the function, transforming the call sites
   // to pass in a smaller number of arguments into the new function.
   //
-  std::vector<Value*> Args;
+  std::vector<Value *> Args;
   for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
     CallSite CS(*I++);
     if (!CS)
@@ -214,7 +233,6 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
   // Loop over the argument list, transferring uses of the old arguments over to
   // the new arguments, also transferring over the names as well.  While we're at
   // it, remove the dead arguments from the DeadArguments list.
-  //
   for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
        I2 = NF->arg_begin(); I != E; ++I, ++I2) {
     // Move the name and users over to the new version.
@@ -343,7 +361,6 @@ DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
   return MaybeLive;
 }
 
-
 /// SurveyUse - This looks at a single use of an argument or return value
 /// and determines if it should be alive or not. Adds this use to MaybeLiveUses
 /// if it causes the used value to become MaybeLive.
@@ -460,7 +477,6 @@ DeadArgumentEliminationPass::SurveyUses(const Value *V,
 //
 // We consider arguments of non-internal functions to be intrinsically alive as
 // well as arguments to functions which have their "address taken".
-//
 void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
   // Functions with inalloca parameters are expecting args in a particular
   // register and memory layout.
@@ -478,11 +494,14 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
   }
 
   unsigned RetCount = NumRetVals(&F);
+
   // Assume all return values are dead
-  typedef SmallVector<Liveness, 5> RetVals;
+  using RetVals = SmallVector<Liveness, 5>;
+
   RetVals RetValLiveness(RetCount, MaybeLive);
 
-  typedef SmallVector<UseVector, 5> RetUses;
+  using RetUses = SmallVector<UseVector, 5>;
+
   // These vectors map each return value to the uses that make it MaybeLive, so
   // we can add those to the Uses map if the return value really turns out to be
   // MaybeLive. Initialized to a list of RetCount empty lists.
@@ -601,15 +620,15 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
 void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
                                             const UseVector &MaybeLiveUses) {
   switch (L) {
-    case Live: MarkLive(RA); break;
+    case Live:
+      MarkLive(RA);
+      break;
     case MaybeLive:
-    {
       // Note any uses of this value, so this return value can be
       // marked live whenever one of the uses becomes live.
       for (const auto &MaybeLiveUse : MaybeLiveUses)
         Uses.insert(std::make_pair(MaybeLiveUse, RA));
       break;
-    }
   }
 }
 
@@ -762,7 +781,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
       // One return type? Just a simple value then, but only if we didn't use to
       // return a struct with that simple value before.
       NRetTy = RetTypes.front();
-    else if (RetTypes.size() == 0)
+    else if (RetTypes.empty())
       // No return types? Make it void, but only if we didn't use to return {}.
       NRetTy = Type::getVoidTy(F->getContext());
   }
@@ -808,7 +827,6 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
 
   // Loop over all of the callers of the function, transforming the call sites
   // to pass in a smaller number of arguments into the new function.
-  //
   std::vector<Value*> Args;
   while (!F->use_empty()) {
     CallSite CS(F->user_back());
diff --git a/lib/Transforms/IPO/ElimAvailExtern.cpp b/lib/Transforms/IPO/ElimAvailExtern.cpp
index ecff88c88dcb..d5fef59286dd 100644
--- a/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -1,5 +1,4 @@
-//===-- ElimAvailExtern.cpp - DCE unreachable internal functions
-//----------------===//
+//===- ElimAvailExtern.cpp - DCE unreachable internal functions -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,11 +14,15 @@
 
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "elim-avail-extern"
@@ -69,8 +72,10 @@ EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
 }
 
 namespace {
+
 struct EliminateAvailableExternallyLegacyPass : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
+
   EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
     initializeEliminateAvailableExternallyLegacyPassPass(
         *PassRegistry::getPassRegistry());
@@ -78,16 +83,17 @@ struct EliminateAvailableExternallyLegacyPass : public ModulePass {
 
   // run - Do the EliminateAvailableExternally pass on the specified module,
   // optionally updating the specified callgraph to reflect the changes.
-  //
-  bool runOnModule(Module &M) {
+  bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
     return eliminateAvailableExternally(M);
   }
 };
-}
+
+} // end anonymous namespace
 
 char EliminateAvailableExternallyLegacyPass::ID = 0;
+
 INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
                 "Eliminate Available Externally Globals", false, false)
 
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index d1147f7d844b..042cacb70ad0 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -12,8 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/SetVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
diff --git a/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 968712138208..325a5d77aadb 100644
--- a/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -52,11 +52,13 @@ static Attribute::AttrKind parseAttrKind(StringRef Kind) {
       .Case("returns_twice", Attribute::ReturnsTwice)
       .Case("safestack", Attribute::SafeStack)
       .Case("sanitize_address", Attribute::SanitizeAddress)
+      .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
       .Case("sanitize_memory", Attribute::SanitizeMemory)
       .Case("sanitize_thread", Attribute::SanitizeThread)
       .Case("ssp", Attribute::StackProtect)
       .Case("sspreq", Attribute::StackProtectReq)
       .Case("sspstrong", Attribute::StackProtectStrong)
+      .Case("strictfp", Attribute::StrictFP)
       .Case("uwtable", Attribute::UWTable)
       .Default(Attribute::None);
 }
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 813a4b6e2831..5352e32479bb 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -6,34 +6,61 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-///
+//
 /// \file
 /// This file implements interprocedural passes which walk the
 /// call-graph deducing and/or propagating function attributes.
-///
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include <cassert>
+#include <iterator>
+#include <map>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "functionattrs"
@@ -57,8 +84,10 @@ static cl::opt<bool> EnableNonnullArgPropagation(
              "caller functions."));
 
 namespace {
-typedef SmallSetVector<Function *, 8> SCCNodeSet;
-}
+
+using SCCNodeSet = SmallSetVector<Function *, 8>;
+
+} // end anonymous namespace
 
 /// Returns the memory access attribute for function F using AAR for AA results,
 /// where SCCNodes is the current SCC.
@@ -101,17 +130,18 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
           SCCNodes.count(CS.getCalledFunction()))
         continue;
       FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS);
+      ModRefInfo MRI = createModRefInfo(MRB);
 
       // If the call doesn't access memory, we're done.
-      if (!(MRB & MRI_ModRef))
+      if (isNoModRef(MRI))
         continue;
 
       if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
         // The call could access any memory. If that includes writes, give up.
-        if (MRB & MRI_Mod)
+        if (isModSet(MRI))
           return MAK_MayWrite;
         // If it reads, note it.
-        if (MRB & MRI_Ref)
+        if (isRefSet(MRI))
           ReadsMemory = true;
         continue;
       }
@@ -133,10 +163,10 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
         if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
           continue;
 
-        if (MRB & MRI_Mod)
+        if (isModSet(MRI))
           // Writes non-local memory.  Give up.
           return MAK_MayWrite;
-        if (MRB & MRI_Ref)
+        if (isRefSet(MRI))
           // Ok, it reads non-local memory.
           ReadsMemory = true;
       }
@@ -237,6 +267,7 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
 }
 
 namespace {
+
 /// For a given pointer Argument, this retains a list of Arguments of functions
 /// in the same SCC that the pointer data flows into. We use this to build an
 /// SCC of the arguments.
@@ -248,7 +279,7 @@ struct ArgumentGraphNode {
 class ArgumentGraph {
   // We store pointers to ArgumentGraphNode objects, so it's important that
   // that they not move around upon insert.
-  typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy;
+  using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>;
 
   ArgumentMapTy ArgumentMap;
 
@@ -263,7 +294,7 @@ class ArgumentGraph {
 public:
   ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
 
-  typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator;
+  using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator;
 
   iterator begin() { return SyntheticRoot.Uses.begin(); }
   iterator end() { return SyntheticRoot.Uses.end(); }
@@ -281,8 +312,7 @@ public:
 /// consider that a capture, instead adding it to the "Uses" list and
 /// continuing with the analysis.
 struct ArgumentUsesTracker : public CaptureTracker {
-  ArgumentUsesTracker(const SCCNodeSet &SCCNodes)
-      : Captured(false), SCCNodes(SCCNodes) {}
+  ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {}
 
   void tooManyUses() override { Captured = true; }
 
@@ -331,37 +361,45 @@ struct ArgumentUsesTracker : public CaptureTracker {
     return false;
   }
 
-  bool Captured; // True only if certainly captured (used outside our SCC).
-  SmallVector<Argument *, 4> Uses; // Uses within our SCC.
+  // True only if certainly captured (used outside our SCC).
+  bool Captured = false;
+
+  // Uses within our SCC.
+  SmallVector<Argument *, 4> Uses;
 
   const SCCNodeSet &SCCNodes;
 };
-}
+
+} // end anonymous namespace
 
 namespace llvm {
+
 template <> struct GraphTraits<ArgumentGraphNode *> {
-  typedef ArgumentGraphNode *NodeRef;
-  typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType;
+  using NodeRef = ArgumentGraphNode *;
+  using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator;
 
   static NodeRef getEntryNode(NodeRef A) { return A; }
   static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
   static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
 };
+
 template <>
 struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
   static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
+
   static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
     return AG->begin();
   }
+
   static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
 };
-}
+
+} // end namespace llvm
 
 /// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
 static Attribute::AttrKind
 determinePointerReadAttrs(Argument *A,
                           const SmallPtrSet<Argument *, 8> &SCCNodes) {
-
   SmallVector<Use *, 32> Worklist;
   SmallSet<Use *, 32> Visited;
 
@@ -502,8 +540,8 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
       continue;
 
     // There is nothing to do if an argument is already marked as 'returned'.
-    if (any_of(F->args(),
-               [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
+    if (llvm::any_of(F->args(),
+                     [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
       continue;
 
     auto FindRetArg = [&]() -> Value * {
@@ -884,11 +922,13 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
     if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
       FlowsToReturn.insert(Ret->getReturnValue());
 
+  auto &DL = F->getParent()->getDataLayout();
+
   for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
     Value *RetVal = FlowsToReturn[i];
 
     // If this value is locally known to be non-null, we're good
-    if (isKnownNonNull(RetVal))
+    if (isKnownNonZero(RetVal, DL))
       continue;
 
     // Otherwise, we need to look upwards since we can't make any local
@@ -1135,8 +1175,11 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
 }
 
 namespace {
+
 struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
-  static char ID; // Pass identification, replacement for typeid
+  // Pass identification, replacement for typeid
+  static char ID;
+
   PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
     initializePostOrderFunctionAttrsLegacyPassPass(
         *PassRegistry::getPassRegistry());
@@ -1151,7 +1194,8 @@ struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
     CallGraphSCCPass::getAnalysisUsage(AU);
   }
 };
-}
+
+} // end anonymous namespace
 
 char PostOrderFunctionAttrsLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs",
@@ -1214,8 +1258,11 @@ bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
 }
 
 namespace {
+
 struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
-  static char ID; // Pass identification, replacement for typeid
+  // Pass identification, replacement for typeid
+  static char ID;
+
   ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
     initializeReversePostOrderFunctionAttrsLegacyPassPass(
         *PassRegistry::getPassRegistry());
@@ -1229,9 +1276,11 @@ struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
     AU.addPreserved<CallGraphWrapperPass>();
   }
 };
-}
+
+} // end anonymous namespace
 
 char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
+
 INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs",
                       "Deduce function attributes in RPO", false, false)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
@@ -1291,7 +1340,7 @@ static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
   }
 
   bool Changed = false;
-  for (auto *F : reverse(Worklist))
+  for (auto *F : llvm::reverse(Worklist))
     Changed |= addNoRecurseAttrsTopDown(*F);
 
   return Changed;
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 233a36d2bc54..b6d6201cd23b 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -12,30 +12,54 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/FunctionImport.h"
-
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/AutoUpgrade.h"
-#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Linker/IRMover.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
-
-#define DEBUG_TYPE "function-import"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
+#define DEBUG_TYPE "function-import"
+
 STATISTIC(NumImportedFunctions, "Number of functions imported");
 STATISTIC(NumImportedModules, "Number of modules imported from");
 STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
@@ -61,7 +85,7 @@ static cl::opt<float> ImportHotInstrFactor(
              "before processing newly imported functions"));
 
 static cl::opt<float> ImportHotMultiplier(
-    "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"),
+    "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"),
     cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
 
 static cl::opt<float> ImportCriticalMultiplier(
@@ -91,6 +115,18 @@ static cl::opt<bool> EnableImportMetadata(
                                   ),
     cl::Hidden, cl::desc("Enable import metadata like 'thinlto_src_module'"));
 
+/// Summary file to use for function importing when using -function-import from
+/// the command line.
+static cl::opt<std::string>
+    SummaryFile("summary-file",
+                cl::desc("The summary file to use for function importing."));
+
+/// Used when testing importing from distributed indexes via opt
+// -function-import.
+static cl::opt<bool>
+    ImportAllIndex("import-all-index",
+                   cl::desc("Import all external functions in index."));
+
 // Load lazily a module from \p FileName in \p Context.
 static std::unique_ptr<Module> loadFile(const std::string &FileName,
                                         LLVMContext &Context) {
@@ -109,8 +145,6 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
   return Result;
 }
 
-namespace {
-
 /// Given a list of possible callee implementation for a call site, select one
 /// that fits the \p Threshold.
 ///
@@ -129,21 +163,26 @@ selectCallee(const ModuleSummaryIndex &Index,
       CalleeSummaryList,
       [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
         auto *GVSummary = SummaryPtr.get();
+        // For SamplePGO, in computeImportForFunction the OriginalId
+        // may have been used to locate the callee summary list (See
+        // comment there).
+        // The mapping from OriginalId to GUID may return a GUID
+        // that corresponds to a static variable. Filter it out here.
+        // This can happen when
+        // 1) There is a call to a library function which is not defined
+        // in the index.
+        // 2) There is a static variable with the  OriginalGUID identical
+        // to the GUID of the library function in 1);
+        // When this happens, the logic for SamplePGO kicks in and
+        // the static variable in 2) will be found, which needs to be
+        // filtered out.
+        if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind)
+          return false;
         if (GlobalValue::isInterposableLinkage(GVSummary->linkage()))
           // There is no point in importing these, we can't inline them
           return false;
-        if (auto *AS = dyn_cast<AliasSummary>(GVSummary)) {
-          GVSummary = &AS->getAliasee();
-          // Alias can't point to "available_externally". However when we import
-          // linkOnceODR the linkage does not change. So we import the alias
-          // and aliasee only in this case.
-          // FIXME: we should import alias as available_externally *function*,
-          // the destination module does need to know it is an alias.
-          if (!GlobalValue::isLinkOnceODRLinkage(GVSummary->linkage()))
-            return false;
-        }
 
-        auto *Summary = cast<FunctionSummary>(GVSummary);
+        auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject());
 
         // If this is a local function, make sure we import the copy
         // in the caller's module. The only time a local function can
@@ -174,9 +213,28 @@ selectCallee(const ModuleSummaryIndex &Index,
   return cast<GlobalValueSummary>(It->get());
 }
 
+namespace {
+
 using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */,
                             GlobalValue::GUID>;
 
+} // anonymous namespace
+
+static ValueInfo
+updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
+  if (!VI.getSummaryList().empty())
+    return VI;
+  // For SamplePGO, the indirect call targets for local functions will
+  // have its original name annotated in profile. We try to find the
+  // corresponding PGOFuncName as the GUID.
+  // FIXME: Consider updating the edges in the graph after building
+  // it, rather than needing to perform this mapping on each walk.
+  auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
+  if (GUID == 0)
+    return nullptr;
+  return Index.getValueInfo(GUID);
+}
+
 /// Compute the list of functions to import for a given caller. Mark these
 /// imported functions and the symbols they reference in their source module as
 /// exported from their source module.
@@ -191,17 +249,9 @@ static void computeImportForFunction(
     DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold
                  << "\n");
 
-    if (VI.getSummaryList().empty()) {
-      // For SamplePGO, the indirect call targets for local functions will
-      // have its original name annotated in profile. We try to find the
-      // corresponding PGOFuncName as the GUID.
-      auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
-      if (GUID == 0)
-        continue;
-      VI = Index.getValueInfo(GUID);
-      if (!VI)
-        continue;
-    }
+    VI = updateValueInfoForIndirectCalls(Index, VI);
+    if (!VI)
+      continue;
 
     if (DefinedGVSummaries.count(VI.getGUID())) {
       DEBUG(dbgs() << "ignored! Target already in destination module.\n");
@@ -227,16 +277,9 @@ static void computeImportForFunction(
       DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
       continue;
     }
-    // "Resolve" the summary, traversing alias,
-    const FunctionSummary *ResolvedCalleeSummary;
-    if (isa<AliasSummary>(CalleeSummary)) {
-      ResolvedCalleeSummary = cast<FunctionSummary>(
-          &cast<AliasSummary>(CalleeSummary)->getAliasee());
-      assert(
-          GlobalValue::isLinkOnceODRLinkage(ResolvedCalleeSummary->linkage()) &&
-          "Unexpected alias to a non-linkonceODR in import list");
-    } else
-      ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+    // "Resolve" the summary
+    const auto *ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary->getBaseObject());
 
     assert(ResolvedCalleeSummary->instCount() <= NewThreshold &&
            "selectCallee() didn't honor the threshold");
@@ -312,14 +355,12 @@ static void ComputeImportForModule(
       DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
       continue;
     }
-    auto *Summary = GVSummary.second;
-    if (auto *AS = dyn_cast<AliasSummary>(Summary))
-      Summary = &AS->getAliasee();
-    auto *FuncSummary = dyn_cast<FunctionSummary>(Summary);
+    auto *FuncSummary =
+        dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject());
     if (!FuncSummary)
       // Skip import for global variables
       continue;
-    DEBUG(dbgs() << "Initalize import for " << GVSummary.first << "\n");
+    DEBUG(dbgs() << "Initialize import for " << GVSummary.first << "\n");
     computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
                              DefinedGVSummaries, Worklist, ImportList,
                              ExportLists);
@@ -344,8 +385,6 @@ static void ComputeImportForModule(
   }
 }
 
-} // anonymous namespace
-
 /// Compute all the import and export for every module using the Index.
 void llvm::ComputeCrossModuleImport(
     const ModuleSummaryIndex &Index,
@@ -395,11 +434,23 @@ void llvm::ComputeCrossModuleImport(
 #endif
 }
 
+#ifndef NDEBUG
+static void dumpImportListForModule(StringRef ModulePath,
+                                    FunctionImporter::ImportMapTy &ImportList) {
+  DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+               << ImportList.size() << " modules.\n");
+  for (auto &Src : ImportList) {
+    auto SrcModName = Src.first();
+    DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
+                 << SrcModName << "\n");
+  }
+}
+#endif
+
 /// Compute all the imports for the given module in the Index.
 void llvm::ComputeCrossModuleImportForModule(
     StringRef ModulePath, const ModuleSummaryIndex &Index,
     FunctionImporter::ImportMapTy &ImportList) {
-
   // Collect the list of functions this module defines.
   // GUID -> Summary
   GVSummaryMapTy FunctionSummaryMap;
@@ -410,13 +461,34 @@ void llvm::ComputeCrossModuleImportForModule(
   ComputeImportForModule(FunctionSummaryMap, Index, ImportList);
 
 #ifndef NDEBUG
-  DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
-               << ImportList.size() << " modules.\n");
-  for (auto &Src : ImportList) {
-    auto SrcModName = Src.first();
-    DEBUG(dbgs() << " - " << Src.second.size() << " functions imported from "
-                 << SrcModName << "\n");
+  dumpImportListForModule(ModulePath, ImportList);
+#endif
+}
+
+// Mark all external summaries in Index for import into the given module.
+// Used for distributed builds using a distributed index.
+void llvm::ComputeCrossModuleImportForModuleFromIndex(
+    StringRef ModulePath, const ModuleSummaryIndex &Index,
+    FunctionImporter::ImportMapTy &ImportList) {
+  for (auto &GlobalList : Index) {
+    // Ignore entries for undefined references.
+    if (GlobalList.second.SummaryList.empty())
+      continue;
+
+    auto GUID = GlobalList.first;
+    assert(GlobalList.second.SummaryList.size() == 1 &&
+           "Expected individual combined index to have one summary per GUID");
+    auto &Summary = GlobalList.second.SummaryList[0];
+    // Skip the summaries for the importing module. These are included to
+    // e.g. record required linkage changes.
+    if (Summary->modulePath() == ModulePath)
+      continue;
+    // Doesn't matter what value we plug in to the map, just needs an entry
+    // to provoke importing by thinBackend.
+    ImportList[Summary->modulePath()][GUID] = 1;
   }
+#ifndef NDEBUG
+  dumpImportListForModule(ModulePath, ImportList);
 #endif
 }
 
@@ -453,6 +525,17 @@ void llvm::computeDeadSymbols(
   // Make value live and add it to the worklist if it was not live before.
   // FIXME: we should only make the prevailing copy live here
   auto visit = [&](ValueInfo VI) {
+    // FIXME: If we knew which edges were created for indirect call profiles,
+    // we could skip them here. Any that are live should be reached via
+    // other edges, e.g. reference edges. Otherwise, using a profile collected
+    // on a slightly different binary might provoke preserving, importing
+    // and ultimately promoting calls to functions not linked into this
+    // binary, which increases the binary size unnecessarily. Note that
+    // if this code changes, the importer needs to change so that edges
+    // to functions marked dead are skipped.
+    VI = updateValueInfoForIndirectCalls(Index, VI);
+    if (!VI)
+      return;
     for (auto &S : VI.getSummaryList())
       if (S->isLive())
         return;
@@ -465,17 +548,12 @@ void llvm::computeDeadSymbols(
   while (!Worklist.empty()) {
     auto VI = Worklist.pop_back_val();
     for (auto &Summary : VI.getSummaryList()) {
-      for (auto Ref : Summary->refs())
+      GlobalValueSummary *Base = Summary->getBaseObject();
+      for (auto Ref : Base->refs())
         visit(Ref);
-      if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+      if (auto *FS = dyn_cast<FunctionSummary>(Base))
         for (auto Call : FS->calls())
           visit(Call.first);
-      if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
-        auto AliaseeGUID = AS->getAliasee().getOriginalName();
-        ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID);
-        if (AliaseeVI)
-          visit(AliaseeVI);
-      }
     }
   }
   Index.setWithGlobalValueDeadStripping();
@@ -600,23 +678,9 @@ void llvm::thinLTOResolveWeakForLinkerModule(
 /// Run internalization on \p TheModule based on symmary analysis.
 void llvm::thinLTOInternalizeModule(Module &TheModule,
                                     const GVSummaryMapTy &DefinedGlobals) {
-  // Parse inline ASM and collect the list of symbols that are not defined in
-  // the current module.
-  StringSet<> AsmUndefinedRefs;
-  ModuleSymbolTable::CollectAsmSymbols(
-      TheModule,
-      [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
-        if (Flags & object::BasicSymbolRef::SF_Undefined)
-          AsmUndefinedRefs.insert(Name);
-      });
-
   // Declare a callback for the internalize pass that will ask for every
   // candidate GlobalValue if it can be internalized or not.
   auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
-    // Can't be internalized if referenced in inline asm.
-    if (AsmUndefinedRefs.count(GV.getName()))
-      return true;
-
     // Lookup the linkage recorded in the summaries during global analysis.
     auto GS = DefinedGlobals.find(GV.getGUID());
     if (GS == DefinedGlobals.end()) {
@@ -647,12 +711,25 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
 
   // FIXME: See if we can just internalize directly here via linkage changes
   // based on the index, rather than invoking internalizeModule.
-  llvm::internalizeModule(TheModule, MustPreserveGV);
+  internalizeModule(TheModule, MustPreserveGV);
+}
+
+/// Make alias a clone of its aliasee.
+static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
+  Function *Fn = cast<Function>(GA->getBaseObject());
+
+  ValueToValueMapTy VMap;
+  Function *NewFn = CloneFunction(Fn, VMap);
+  // Clone should use the original alias's linkage and name, and we ensure
+  // all uses of alias instead use the new clone (casted if necessary).
+  NewFn->setLinkage(GA->getLinkage());
+  GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType()));
+  NewFn->takeName(GA);
+  return NewFn;
 }
 
 // Automatically import functions in Module \p DestModule based on the summaries
 // index.
-//
 Expected<bool> FunctionImporter::importFunctions(
     Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
   DEBUG(dbgs() << "Starting import for Module "
@@ -699,10 +776,9 @@ Expected<bool> FunctionImporter::importFunctions(
           // Add 'thinlto_src_module' metadata for statistics and debugging.
           F.setMetadata(
               "thinlto_src_module",
-              llvm::MDNode::get(
-                  DestModule.getContext(),
-                  {llvm::MDString::get(DestModule.getContext(),
-                                       SrcModule->getSourceFileName())}));
+              MDNode::get(DestModule.getContext(),
+                          {MDString::get(DestModule.getContext(),
+                                         SrcModule->getSourceFileName())}));
         }
         GlobalsToImport.insert(&F);
       }
@@ -722,13 +798,6 @@ Expected<bool> FunctionImporter::importFunctions(
       }
     }
     for (GlobalAlias &GA : SrcModule->aliases()) {
-      // FIXME: This should eventually be controlled entirely by the summary.
-      if (FunctionImportGlobalProcessing::doImportAsDefinition(
-              &GA, &GlobalsToImport)) {
-        GlobalsToImport.insert(&GA);
-        continue;
-      }
-
       if (!GA.hasName())
         continue;
       auto GUID = GA.getGUID();
@@ -737,25 +806,25 @@ Expected<bool> FunctionImporter::importFunctions(
                    << " " << GA.getName() << " from "
                    << SrcModule->getSourceFileName() << "\n");
       if (Import) {
-        // Alias can't point to "available_externally". However when we import
-        // linkOnceODR the linkage does not change. So we import the alias
-        // and aliasee only in this case. This has been handled by
-        // computeImportForFunction()
-        GlobalObject *GO = GA.getBaseObject();
-        assert(GO->hasLinkOnceODRLinkage() &&
-               "Unexpected alias to a non-linkonceODR in import list");
-#ifndef NDEBUG
-        if (!GlobalsToImport.count(GO))
-          DEBUG(dbgs() << " alias triggers importing aliasee " << GO->getGUID()
-                       << " " << GO->getName() << " from "
-                       << SrcModule->getSourceFileName() << "\n");
-#endif
-        if (Error Err = GO->materialize())
-          return std::move(Err);
-        GlobalsToImport.insert(GO);
         if (Error Err = GA.materialize())
           return std::move(Err);
-        GlobalsToImport.insert(&GA);
+        // Import alias as a copy of its aliasee.
+        GlobalObject *Base = GA.getBaseObject();
+        if (Error Err = Base->materialize())
+          return std::move(Err);
+        auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
+        DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
+              << " " << Base->getName() << " from "
+              << SrcModule->getSourceFileName() << "\n");
+        if (EnableImportMetadata) {
+          // Add 'thinlto_src_module' metadata for statistics and debugging.
+          Fn->setMetadata(
+              "thinlto_src_module",
+              MDNode::get(DestModule.getContext(),
+                          {MDString::get(DestModule.getContext(),
+                                         SrcModule->getSourceFileName())}));
+        }
+        GlobalsToImport.insert(Fn);
       }
     }
 
@@ -789,12 +858,6 @@ Expected<bool> FunctionImporter::importFunctions(
   return ImportedCount;
 }
 
-/// Summary file to use for function importing when using -function-import from
-/// the command line.
-static cl::opt<std::string>
-    SummaryFile("summary-file",
-                cl::desc("The summary file to use for function importing."));
-
 static bool doImportingForModule(Module &M) {
   if (SummaryFile.empty())
     report_fatal_error("error: -function-import requires -summary-file\n");
@@ -809,8 +872,15 @@ static bool doImportingForModule(Module &M) {
 
   // First step is collecting the import list.
   FunctionImporter::ImportMapTy ImportList;
-  ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
-                                    ImportList);
+  // If requested, simply import all functions in the index. This is used
+  // when testing distributed backend handling via the opt tool, when
+  // we have distributed indexes containing exactly the summaries to import.
+  if (ImportAllIndex)
+    ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index,
+                                               ImportList);
+  else
+    ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+                                      ImportList);
 
   // Conservatively mark all internal values as promoted. This interface is
   // only used when doing importing via the function importing pass. The pass
@@ -848,17 +918,18 @@ static bool doImportingForModule(Module &M) {
 }
 
 namespace {
+
 /// Pass that performs cross-module function import provided a summary file.
 class FunctionImportLegacyPass : public ModulePass {
 public:
   /// Pass identification, replacement for typeid
   static char ID;
 
+  explicit FunctionImportLegacyPass() : ModulePass(ID) {}
+
   /// Specify pass name for debug output
   StringRef getPassName() const override { return "Function Importing"; }
 
-  explicit FunctionImportLegacyPass() : ModulePass(ID) {}
-
   bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
@@ -866,7 +937,8 @@ public:
     return doImportingForModule(M);
   }
 };
-} // anonymous namespace
+
+} // end anonymous namespace
 
 PreservedAnalyses FunctionImportPass::run(Module &M,
                                           ModuleAnalysisManager &AM) {
@@ -881,7 +953,9 @@ INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
                 "Summary Based Function Import", false, false)
 
 namespace llvm {
+
 Pass *createFunctionImportPass() {
   return new FunctionImportLegacyPass();
 }
-}
+
+} // end namespace llvm
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index c91e8b454927..ada9eb80e680 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
@@ -115,7 +114,7 @@ void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
     ComputeDependencies(User, Deps);
   Deps.erase(&GV); // Remove self-reference.
   for (GlobalValue *GVU : Deps) {
-    GVDependencies.insert(std::make_pair(GVU, &GV));
+    GVDependencies[GVU].insert(&GV);
   }
 }
 
@@ -199,8 +198,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
                                            AliveGlobals.end()};
   while (!NewLiveGVs.empty()) {
     GlobalValue *LGV = NewLiveGVs.pop_back_val();
-    for (auto &&GVD : make_range(GVDependencies.equal_range(LGV)))
-      MarkLive(*GVD.second, &NewLiveGVs);
+    for (auto *GVD : GVDependencies[LGV])
+      MarkLive(*GVD, &NewLiveGVs);
   }
 
   // Now that all globals which are needed are in the AliveGlobals set, we loop
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 93eab680ca6b..4bb2984e3b47 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -20,22 +20,41 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -45,7 +64,11 @@
 #include "llvm/Transforms/Utils/Evaluator.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "globalopt"
@@ -139,7 +162,7 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {
     }
 
     V = I->getOperand(0);
-  } while (1);
+  } while (true);
 }
 
 /// This GV is a pointer root.  Loop over all users of the global and clean up
@@ -220,7 +243,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
           break;
         I->eraseFromParent();
         I = J;
-      } while (1);
+      } while (true);
       I->eraseFromParent();
     }
   }
@@ -348,7 +371,6 @@ static bool isSafeSROAElementUse(Value *V) {
   return true;
 }
 
-
 /// U is a direct user of the specified global value.  Look at it and its uses
 /// and decide whether it is safe to SROA this global.
 static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
@@ -402,11 +424,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
     }
   }
 
-  for (User *UU : U->users())
-    if (!isSafeSROAElementUse(UU))
-      return false;
-
-  return true;
+  return llvm::all_of(U->users(),
+                      [](User *UU) { return isSafeSROAElementUse(UU); });
 }
 
 /// Look at all uses of the global and decide whether it is safe for us to
@@ -419,6 +438,27 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
   return true;
 }
 
+/// Copy over the debug info for a variable to its SRA replacements.
+static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
+                                 uint64_t FragmentOffsetInBits,
+                                 uint64_t FragmentSizeInBits,
+                                 unsigned NumElements) {
+  SmallVector<DIGlobalVariableExpression *, 1> GVs;
+  GV->getDebugInfo(GVs);
+  for (auto *GVE : GVs) {
+    DIVariable *Var = GVE->getVariable();
+    DIExpression *Expr = GVE->getExpression();
+    if (NumElements > 1) {
+      if (auto E = DIExpression::createFragmentExpression(
+              Expr, FragmentOffsetInBits, FragmentSizeInBits))
+        Expr = *E;
+      else
+        return;
+    }
+    auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
+    NGV->addDebugInfo(NGVE);
+  }
+}
 
 /// Perform scalar replacement of aggregates on the specified global variable.
 /// This opens the door for other optimizations by exposing the behavior of the
@@ -434,7 +474,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
   Constant *Init = GV->getInitializer();
   Type *Ty = Init->getType();
 
-  std::vector<GlobalVariable*> NewGlobals;
+  std::vector<GlobalVariable *> NewGlobals;
   Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
 
   // Get the alignment of the global, either explicit or target-specific.
@@ -443,9 +483,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
     StartAlignment = DL.getABITypeAlignment(GV->getType());
 
   if (StructType *STy = dyn_cast<StructType>(Ty)) {
-    NewGlobals.reserve(STy->getNumElements());
+    uint64_t FragmentOffset = 0;
+    unsigned NumElements = STy->getNumElements();
+    NewGlobals.reserve(NumElements);
     const StructLayout &Layout = *DL.getStructLayout(STy);
-    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    for (unsigned i = 0, e = NumElements; i != e; ++i) {
       Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
       GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
@@ -465,15 +507,22 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
       unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
       if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i)))
         NGV->setAlignment(NewAlign);
+
+      // Copy over the debug info for the variable.
+      FragmentOffset = alignTo(FragmentOffset, NewAlign);
+      uint64_t Size = DL.getTypeSizeInBits(NGV->getValueType());
+      transferSRADebugInfo(GV, NGV, FragmentOffset, Size, NumElements);
+      FragmentOffset += Size;
     }
   } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
     unsigned NumElements = STy->getNumElements();
     if (NumElements > 16 && GV->hasNUsesOrMore(16))
       return nullptr; // It's not worth it.
     NewGlobals.reserve(NumElements);
-
-    uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType());
-    unsigned EltAlign = DL.getABITypeAlignment(STy->getElementType());
+    auto ElTy = STy->getElementType();
+    uint64_t EltSize = DL.getTypeAllocSize(ElTy);
+    unsigned EltAlign = DL.getABITypeAlignment(ElTy);
+    uint64_t FragmentSizeInBits = DL.getTypeSizeInBits(ElTy);
     for (unsigned i = 0, e = NumElements; i != e; ++i) {
       Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
@@ -494,6 +543,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
       unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
       if (NewAlign > EltAlign)
         NGV->setAlignment(NewAlign);
+      transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits,
+                           NumElements);
     }
   }
 
@@ -689,7 +740,6 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
   return Changed;
 }
 
-
 /// The specified global has only one non-null value stored into it.  If there
 /// are uses of the loaded value that would trap if the loaded value is
 /// dynamically null, then we know that they cannot be reachable with a null
@@ -1045,7 +1095,6 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
   return true;
 }
 
-
 /// If all users of values loaded from GV are simple enough to perform HeapSRA,
 /// return true.
 static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
@@ -1095,9 +1144,9 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
 }
 
 static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
-               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
-                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
-  std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+              DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
+  std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];
 
   if (FieldNo >= FieldVals.size())
     FieldVals.resize(FieldNo+1);
@@ -1139,8 +1188,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
 /// Given a load instruction and a value derived from the load, rewrite the
 /// derived value to use the HeapSRoA'd load.
 static void RewriteHeapSROALoadUser(Instruction *LoadUser,
-             DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
-                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+              DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
   // If this is a comparison against null, handle it.
   if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
     assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
@@ -1187,7 +1236,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
   // processed.
   PHINode *PN = cast<PHINode>(LoadUser);
   if (!InsertedScalarizedValues.insert(std::make_pair(PN,
-                                              std::vector<Value*>())).second)
+                                              std::vector<Value *>())).second)
     return;
 
   // If this is the first time we've seen this PHI, recursively process all
@@ -1202,8 +1251,8 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
 /// global.  Eliminate all uses of Ptr, making them use FieldGlobals instead.
 /// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
 static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
-               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
-                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+              DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+                  std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
   for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
     Instruction *User = cast<Instruction>(*UI++);
     RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
@@ -1232,8 +1281,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 
   // Okay, at this point, there are no users of the malloc.  Insert N
   // new mallocs at the same place as CI, and N globals.
-  std::vector<Value*> FieldGlobals;
-  std::vector<Value*> FieldMallocs;
+  std::vector<Value *> FieldGlobals;
+  std::vector<Value *> FieldMallocs;
 
   SmallVector<OperandBundleDef, 1> OpBundles;
   CI->getOperandBundlesAsDefs(OpBundles);
@@ -1330,10 +1379,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 
   /// As we process loads, if we can't immediately update all uses of the load,
   /// keep track of what scalarized loads are inserted for a given load.
-  DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+  DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
   InsertedScalarizedValues[GV] = FieldGlobals;
 
-  std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+  std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;
 
   // Okay, the malloc site is completely handled.  All of the uses of GV are now
   // loads, and all uses of those loads are simple.  Rewrite them to use loads
@@ -1379,7 +1428,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
   }
 
   // Drop all inter-phi links and any loads that made it this far.
-  for (DenseMap<Value*, std::vector<Value*> >::iterator
+  for (DenseMap<Value *, std::vector<Value *>>::iterator
        I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
        I != E; ++I) {
     if (PHINode *PN = dyn_cast<PHINode>(I->first))
@@ -1389,7 +1438,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
   }
 
   // Delete all the phis and loads now that inter-references are dead.
-  for (DenseMap<Value*, std::vector<Value*> >::iterator
+  for (DenseMap<Value *, std::vector<Value *>>::iterator
        I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
        I != E; ++I) {
     if (PHINode *PN = dyn_cast<PHINode>(I->first))
@@ -1576,12 +1625,57 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
   assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
          "No reason to shrink to bool!");
 
+  SmallVector<DIGlobalVariableExpression *, 1> GVs;
+  GV->getDebugInfo(GVs);
+
   // If initialized to zero and storing one into the global, we can use a cast
   // instead of a select to synthesize the desired value.
   bool IsOneZero = false;
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+  bool EmitOneOrZero = true;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)){
     IsOneZero = InitVal->isNullValue() && CI->isOne();
 
+    if (ConstantInt *CIInit = dyn_cast<ConstantInt>(GV->getInitializer())){
+      uint64_t ValInit = CIInit->getZExtValue();
+      uint64_t ValOther = CI->getZExtValue();
+      uint64_t ValMinus = ValOther - ValInit;
+
+      for(auto *GVe : GVs){
+        DIGlobalVariable *DGV = GVe->getVariable();
+        DIExpression *E = GVe->getExpression();
+
+        // It is expected that the address of global optimized variable is on
+        // top of the stack. After optimization, value of that variable will
+        // be ether 0 for initial value or 1 for other value. The following
+        // expression should return constant integer value depending on the
+        // value at global object address:
+        // val * (ValOther - ValInit) + ValInit:
+        // DW_OP_deref DW_OP_constu <ValMinus>
+        // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
+        E = DIExpression::get(NewGV->getContext(),
+                             {dwarf::DW_OP_deref,
+                              dwarf::DW_OP_constu,
+                              ValMinus,
+                              dwarf::DW_OP_mul,
+                              dwarf::DW_OP_constu,
+                              ValInit,
+                              dwarf::DW_OP_plus,
+                              dwarf::DW_OP_stack_value});
+        DIGlobalVariableExpression *DGVE =
+          DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
+        NewGV->addDebugInfo(DGVE);
+     }
+     EmitOneOrZero = false;
+    }
+  }
+
+  if (EmitOneOrZero) {
+     // FIXME: This will only emit address for debugger on which will
+     // be written only 0 or 1.
+     for(auto *GV : GVs)
+       NewGV->addDebugInfo(GV);
+   }
+
   while (!GV->use_empty()) {
     Instruction *UI = cast<Instruction>(GV->user_back());
     if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
@@ -2202,7 +2296,7 @@ static void setUsedInitializer(GlobalVariable &V,
   // Type of pointer to the array of pointers.
   PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
 
-  SmallVector<llvm::Constant *, 8> UsedArray;
+  SmallVector<Constant *, 8> UsedArray;
   for (GlobalValue *GV : Init) {
     Constant *Cast
       = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
@@ -2215,14 +2309,15 @@ static void setUsedInitializer(GlobalVariable &V,
   Module *M = V.getParent();
   V.removeFromParent();
   GlobalVariable *NV =
-      new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage,
-                         llvm::ConstantArray::get(ATy, UsedArray), "");
+      new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
+                         ConstantArray::get(ATy, UsedArray), "");
   NV->takeName(&V);
   NV->setSection("llvm.metadata");
   delete &V;
 }
 
 namespace {
+
 /// An easy to access representation of llvm.used and llvm.compiler.used.
 class LLVMUsed {
   SmallPtrSet<GlobalValue *, 8> Used;
@@ -2235,25 +2330,34 @@ public:
     UsedV = collectUsedGlobalVariables(M, Used, false);
     CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
   }
-  typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator;
-  typedef iterator_range<iterator> used_iterator_range;
+
+  using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
+  using used_iterator_range = iterator_range<iterator>;
+
   iterator usedBegin() { return Used.begin(); }
   iterator usedEnd() { return Used.end(); }
+
   used_iterator_range used() {
     return used_iterator_range(usedBegin(), usedEnd());
   }
+
   iterator compilerUsedBegin() { return CompilerUsed.begin(); }
   iterator compilerUsedEnd() { return CompilerUsed.end(); }
+
   used_iterator_range compilerUsed() {
     return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
   }
+
   bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
+
   bool compilerUsedCount(GlobalValue *GV) const {
     return CompilerUsed.count(GV);
   }
+
   bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
   bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
   bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
+
   bool compilerUsedInsert(GlobalValue *GV) {
     return CompilerUsed.insert(GV).second;
   }
@@ -2265,7 +2369,8 @@ public:
       setUsedInitializer(*CompilerUsedV, CompilerUsed);
   }
 };
-}
+
+} // end anonymous namespace
 
 static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
   if (GA.use_empty()) // No use at all.
@@ -2580,8 +2685,10 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
 }
 
 namespace {
+
 struct GlobalOptLegacyPass : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
+
   GlobalOptLegacyPass() : ModulePass(ID) {
     initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
   }
@@ -2603,9 +2710,11 @@ struct GlobalOptLegacyPass : public ModulePass {
     AU.addRequired<DominatorTreeWrapperPass>();
   }
 };
-}
+
+} // end anonymous namespace
 
 char GlobalOptLegacyPass::ID = 0;
+
 INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
                       "Global Variable Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
diff --git a/lib/Transforms/IPO/GlobalSplit.cpp b/lib/Transforms/IPO/GlobalSplit.cpp
index e47d881d1127..792f4b3052a3 100644
--- a/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/lib/Transforms/IPO/GlobalSplit.cpp
@@ -15,22 +15,30 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Transforms/IPO.h"
-
-#include <set>
+#include <cstdint>
+#include <vector>
 
 using namespace llvm;
 
-namespace {
-
-bool splitGlobal(GlobalVariable &GV) {
+static bool splitGlobal(GlobalVariable &GV) {
   // If the address of the global is taken outside of the module, we cannot
   // apply this transformation.
   if (!GV.hasLocalLinkage())
@@ -130,7 +138,7 @@ bool splitGlobal(GlobalVariable &GV) {
   return true;
 }
 
-bool splitGlobals(Module &M) {
+static bool splitGlobals(Module &M) {
   // First, see if the module uses either of the llvm.type.test or
   // llvm.type.checked.load intrinsics, which indicates that splitting globals
   // may be beneficial.
@@ -151,12 +159,16 @@ bool splitGlobals(Module &M) {
   return Changed;
 }
 
+namespace {
+
 struct GlobalSplit : public ModulePass {
   static char ID;
+
   GlobalSplit() : ModulePass(ID) {
     initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
   }
-  bool runOnModule(Module &M) {
+
+  bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
 
@@ -164,11 +176,12 @@ struct GlobalSplit : public ModulePass {
   }
 };
 
-}
+} // end anonymous namespace
 
-INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
 char GlobalSplit::ID = 0;
 
+INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
+
 ModulePass *llvm::createGlobalSplitPass() {
   return new GlobalSplit;
 }
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index 5bb305ca84d0..d5d35ee89e0e 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -25,6 +25,7 @@ using namespace llvm;
 
 void llvm::initializeIPO(PassRegistry &Registry) {
   initializeArgPromotionPass(Registry);
+  initializeCalledValuePropagationLegacyPassPass(Registry);
   initializeConstantMergeLegacyPassPass(Registry);
   initializeCrossDSOCFIPass(Registry);
   initializeDAEPass(Registry);
@@ -67,6 +68,10 @@ void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createArgumentPromotionPass());
 }
 
+void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCalledValuePropagationPass());
+}
+
 void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createConstantMergePass());
 }
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 15d7515cc842..470f97b8ba61 100644
--- a/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 50e7cc89a3b3..b259a0abd63c 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -21,7 +20,6 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Transforms/IPO.h"
@@ -57,12 +55,23 @@ public:
   InlineCost getInlineCost(CallSite CS) override {
     Function *Callee = CS.getCalledFunction();
     TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+
+    bool RemarksEnabled = false;
+    const auto &BBs = CS.getCaller()->getBasicBlockList();
+    if (!BBs.empty()) {
+      auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
+      if (DI.isEnabled())
+        RemarksEnabled = true;
+    }
+    OptimizationRemarkEmitter ORE(CS.getCaller());
+
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&](Function &F) -> AssumptionCache & {
       return ACT->getAssumptionCache(F);
     };
     return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
-                               /*GetBFI=*/None, PSI);
+                               /*GetBFI=*/None, PSI,
+                               RemarksEnabled ? &ORE : nullptr);
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override;
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 317770d133b3..4449c87ddefa 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -14,29 +14,60 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <tuple>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "inline"
@@ -63,13 +94,16 @@ static cl::opt<bool>
                                 cl::init(false), cl::Hidden);
 
 namespace {
+
 enum class InlinerFunctionImportStatsOpts {
   No = 0,
   Basic = 1,
   Verbose = 2,
 };
 
-cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
+} // end anonymous namespace
+
+static cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
     "inliner-function-import-stats",
     cl::init(InlinerFunctionImportStatsOpts::No),
     cl::values(clEnumValN(InlinerFunctionImportStatsOpts::Basic, "basic",
@@ -77,10 +111,8 @@ cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
                clEnumValN(InlinerFunctionImportStatsOpts::Verbose, "verbose",
                           "printing of statistics for each inlined function")),
     cl::Hidden, cl::desc("Enable inliner stats for imported functions"));
-} // namespace
 
-LegacyInlinerBase::LegacyInlinerBase(char &ID)
-    : CallGraphSCCPass(ID), InsertLifetime(true) {}
+LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
 
 LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
     : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
@@ -96,7 +128,7 @@ void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
   CallGraphSCCPass::getAnalysisUsage(AU);
 }
 
-typedef DenseMap<ArrayType *, std::vector<AllocaInst *>> InlinedArrayAllocasTy;
+using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
 
 /// Look at all of the allocas that we inlined through this call site.  If we
 /// have already inlined other allocas through other calls into this function,
@@ -161,7 +193,6 @@ static void mergeInlinedArrayAllocas(
     // function.  Also, AllocasForType can be empty of course!
     bool MergedAwayAlloca = false;
     for (AllocaInst *AvailableAlloca : AllocasForType) {
-
       unsigned Align1 = AI->getAlignment(),
                Align2 = AvailableAlloca->getAlignment();
 
@@ -267,7 +298,6 @@ static bool
 shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
                  int &TotalSecondaryCost,
                  function_ref<InlineCost(CallSite CS)> GetInlineCost) {
-
   // For now we only handle local or inline functions.
   if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
     return false;
@@ -335,11 +365,14 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
   return false;
 }
 
-/// Return true if the inliner should attempt to inline at the given CallSite.
-static bool shouldInline(CallSite CS,
-                         function_ref<InlineCost(CallSite CS)> GetInlineCost,
-                         OptimizationRemarkEmitter &ORE) {
+/// Return the cost only if the inliner should attempt to inline at the given
+/// CallSite. If we return the cost, we will emit an optimisation remark later
+/// using that cost, so we won't do so from this function.
+static Optional<InlineCost>
+shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
+             OptimizationRemarkEmitter &ORE) {
   using namespace ore;
+
   InlineCost IC = GetInlineCost(CS);
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
@@ -348,32 +381,33 @@ static bool shouldInline(CallSite CS,
   if (IC.isAlways()) {
     DEBUG(dbgs() << "    Inlining: cost=always"
                  << ", Call: " << *CS.getInstruction() << "\n");
-    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", Callee)
-             << " should always be inlined (cost=always)");
-    return true;
+    return IC;
   }
 
   if (IC.isNever()) {
     DEBUG(dbgs() << "    NOT Inlining: cost=never"
                  << ", Call: " << *CS.getInstruction() << "\n");
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
              << NV("Callee", Callee) << " not inlined into "
              << NV("Caller", Caller)
-             << " because it should never be inlined (cost=never)");
-    return false;
+             << " because it should never be inlined (cost=never)";
+    });
+    return None;
   }
 
   if (!IC) {
     DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
-                 << ", thres=" << (IC.getCostDelta() + IC.getCost())
+                 << ", thres=" << IC.getThreshold()
                  << ", Call: " << *CS.getInstruction() << "\n");
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
              << NV("Callee", Callee) << " not inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
-             << NV("Cost", IC.getCost()) << ", threshold="
-             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
-    return false;
+             << NV("Cost", IC.getCost())
+             << ", threshold=" << NV("Threshold", IC.getThreshold()) << ")";
+    });
+    return None;
   }
 
   int TotalSecondaryCost = 0;
@@ -381,23 +415,23 @@ static bool shouldInline(CallSite CS,
     DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction()
                  << " Cost = " << IC.getCost()
                  << ", outer Cost = " << TotalSecondaryCost << '\n');
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
                                       Call)
              << "Not inlining. Cost of inlining " << NV("Callee", Callee)
              << " increases the cost of inlining " << NV("Caller", Caller)
-             << " in other contexts");
-    return false;
+             << " in other contexts";
+    });
+
+    // IC does not bool() to false, so get an InlineCost that will.
+    // This will not be inspected to make an error message.
+    return None;
   }
 
   DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
-               << ", thres=" << (IC.getCostDelta() + IC.getCost())
+               << ", thres=" << IC.getThreshold()
                << ", Call: " << *CS.getInstruction() << '\n');
-  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBeInlined", Call)
-           << NV("Callee", Callee) << " can be inlined into "
-           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
-           << " (threshold="
-           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
-  return true;
+  return IC;
 }
 
 /// Return true if the specified inline history ID
@@ -475,11 +509,14 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
         if (Function *Callee = CS.getCalledFunction())
           if (Callee->isDeclaration()) {
             using namespace ore;
-            ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+
+            ORE.emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
                      << NV("Callee", Callee) << " will not be inlined into "
                      << NV("Caller", CS.getCaller())
                      << " because its definition is unavailable"
-                     << setIsVerbose());
+                     << setIsVerbose();
+            });
             continue;
           }
 
@@ -545,9 +582,10 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
       // just become a regular analysis dependency.
       OptimizationRemarkEmitter ORE(Caller);
 
+      Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE);
       // If the policy determines that we should inline this function,
       // delete the call instead.
-      if (!shouldInline(CS, GetInlineCost, ORE))
+      if (!OIC)
         continue;
 
       // If this call site is dead and it is to a readonly function, we should
@@ -562,26 +600,40 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
         ++NumCallsDeleted;
       } else {
         // Get DebugLoc to report. CS will be invalid after Inliner.
-        DebugLoc DLoc = Instr->getDebugLoc();
+        DebugLoc DLoc = CS->getDebugLoc();
         BasicBlock *Block = CS.getParent();
 
         // Attempt to inline the function.
         using namespace ore;
+
         if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
                                   InlineHistoryID, InsertLifetime, AARGetter,
                                   ImportedFunctionsStats)) {
-          ORE.emit(
-              OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
-              << NV("Callee", Callee) << " will not be inlined into "
-              << NV("Caller", Caller));
+          ORE.emit([&]() {
+            return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
+                                            Block)
+                   << NV("Callee", Callee) << " will not be inlined into "
+                   << NV("Caller", Caller);
+          });
           continue;
         }
         ++NumInlined;
 
-        // Report the inline decision.
-        ORE.emit(OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block)
-                 << NV("Callee", Callee) << " inlined into "
-                 << NV("Caller", Caller));
+        ORE.emit([&]() {
+          bool AlwaysInline = OIC->isAlways();
+          StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined";
+          OptimizationRemark R(DEBUG_TYPE, RemarkName, DLoc, Block);
+          R << NV("Callee", Callee) << " inlined into ";
+          R << NV("Caller", Caller);
+          if (AlwaysInline)
+            R << " with cost=always";
+          else {
+            R << " with cost=" << NV("Cost", OIC->getCost());
+            R << " (threshold=" << NV("Threshold", OIC->getThreshold());
+            R << ")";
+          }
+          return R;
+        });
 
         // If inlining this function gave us any new call sites, throw them
         // onto our worklist to process.  They are useful inline candidates.
@@ -601,7 +653,6 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
       if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
           // TODO: Can remove if in SCC now.
           !SCCFunctions.count(Callee) &&
-
           // The function may be apparently dead, but if there are indirect
           // callgraph references to the node, we cannot delete it yet, this
           // could invalidate the CGSCC iterator.
@@ -840,6 +891,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     FunctionAnalysisManager &FAM =
         AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
             .getManager();
+
+    // Get the remarks emission analysis for the caller.
+    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&](Function &F) -> AssumptionCache & {
       return FAM.getResult<AssumptionAnalysis>(F);
@@ -852,12 +907,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       Function &Callee = *CS.getCalledFunction();
       auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
       return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
-                           PSI);
+                           PSI, &ORE);
     };
 
-    // Get the remarks emission analysis for the caller.
-    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
     // Now process as many calls as we have within this caller in the sequnece.
     // We bail out as soon as the caller has to change so we can update the
     // call graph and prepare the context of that new caller.
@@ -872,8 +924,22 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
           InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory))
         continue;
 
+      // Check if this inlining may repeat breaking an SCC apart that has
+      // already been split once before. In that case, inlining here may
+      // trigger infinite inlining, much like is prevented within the inliner
+      // itself by the InlineHistory above, but spread across CGSCC iterations
+      // and thus hidden from the full inline history.
+      if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+          UR.InlinedInternalEdges.count({&N, C})) {
+        DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+                        "previously split out of this SCC by inlining: "
+                     << F.getName() << " -> " << Callee.getName() << "\n");
+        continue;
+      }
+
+      Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE);
       // Check whether we want to inline this callsite.
-      if (!shouldInline(CS, GetInlineCost, ORE))
+      if (!OIC)
         continue;
 
       // Setup the data structure used to plumb customization into the
@@ -883,11 +949,39 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
           &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
           &FAM.getResult<BlockFrequencyAnalysis>(Callee));
 
-      if (!InlineFunction(CS, IFI))
+      // Get DebugLoc to report. CS will be invalid after Inliner.
+      DebugLoc DLoc = CS->getDebugLoc();
+      BasicBlock *Block = CS.getParent();
+
+      using namespace ore;
+
+      if (!InlineFunction(CS, IFI)) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+                 << NV("Callee", &Callee) << " will not be inlined into "
+                 << NV("Caller", &F);
+        });
         continue;
+      }
       DidInline = true;
       InlinedCallees.insert(&Callee);
 
+      ORE.emit([&]() {
+        bool AlwaysInline = OIC->isAlways();
+        StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined";
+        OptimizationRemark R(DEBUG_TYPE, RemarkName, DLoc, Block);
+        R << NV("Callee", &Callee) << " inlined into ";
+        R << NV("Caller", &F);
+        if (AlwaysInline)
+          R << " with cost=always";
+        else {
+          R << " with cost=" << NV("Cost", OIC->getCost());
+          R << " (threshold=" << NV("Threshold", OIC->getThreshold());
+          R << ")";
+        }
+        return R;
+      });
+
       // Add any new callsites to defined functions to the worklist.
       if (!IFI.InlinedCallSites.empty()) {
         int NewHistoryID = InlineHistory.size();
@@ -949,17 +1043,38 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       for (LazyCallGraph::Edge &E : *CalleeN)
         RC->insertTrivialRefEdge(N, E.getNode());
     }
-    InlinedCallees.clear();
 
     // At this point, since we have made changes we have at least removed
     // a call instruction. However, in the process we do some incremental
     // simplification of the surrounding code. This simplification can
     // essentially do all of the same things as a function pass and we can
     // re-use the exact same logic for updating the call graph to reflect the
-    // change..
+    // change.
+    LazyCallGraph::SCC *OldC = C;
     C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
     DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
     RC = &C->getOuterRefSCC();
+
+    // If this causes an SCC to split apart into multiple smaller SCCs, there
+    // is a subtle risk we need to prepare for. Other transformations may
+    // expose an "infinite inlining" opportunity later, and because of the SCC
+    // mutation, we will revisit this function and potentially re-inline. If we
+    // do, and that re-inlining also has the potentially to mutate the SCC
+    // structure, the infinite inlining problem can manifest through infinite
+    // SCC splits and merges. To avoid this, we capture the originating caller
+    // node and the SCC containing the call edge. This is a slight over
+    // approximation of the possible inlining decisions that must be avoided,
+    // but is relatively efficient to store.
+    // FIXME: This seems like a very heavyweight way of retaining the inline
+    // history, we should look for a more efficient way of tracking it.
+    if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+          return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+        })) {
+      DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+                      "retaining this to avoid infinite inlining.\n");
+      UR.InlinedInternalEdges.insert({&N, OldC});
+    }
+    InlinedCallees.clear();
   }
 
   // Now that we've finished inlining all of the calls across this SCC, delete
@@ -976,8 +1091,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     FunctionAnalysisManager &FAM =
         AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG)
             .getManager();
-    FAM.clear(*DeadF);
-    AM.clear(DeadC);
+    FAM.clear(*DeadF, DeadF->getName());
+    AM.clear(DeadC, DeadC.getName());
     auto &DeadRC = DeadC.getOuterRefSCC();
     CG.removeDeadFunction(*DeadF);
 
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index c74b0a35e296..36b6bdba2cd0 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -80,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
 //
 Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
 
-bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) {
+bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (skipLoop(L))
     return false;
 
@@ -143,7 +143,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) {
       Changed = true;
       // After extraction, the loop is replaced by a function call, so
       // we shouldn't try to run any more loop passes on it.
-      LI.markAsRemoved(L);
+      LPM.markLoopAsDeleted(*L);
+      LI.erase(L);
     }
     ++NumExtracted;
   }
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 693df5e7ba92..8db7e1e142d2 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1,4 +1,4 @@
-//===-- LowerTypeTests.cpp - type metadata lowering pass ------------------===//
+//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,32 +13,70 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/ModuleSummaryIndexYAML.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 using namespace lowertypetests;
@@ -197,6 +235,7 @@ struct ByteArrayInfo {
   uint64_t BitSize;
   GlobalVariable *ByteArray;
   GlobalVariable *MaskGlobal;
+  uint8_t *MaskPtr = nullptr;
 };
 
 /// A POD-like structure that we use to store a global reference together with
@@ -205,16 +244,19 @@ struct ByteArrayInfo {
 /// operation involving a map lookup; this data structure helps to reduce the
 /// number of times we need to do this lookup.
 class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+  friend TrailingObjects;
+
   GlobalObject *GO;
   size_t NTypes;
+
   // For functions: true if this is a definition (either in the merged module or
   // in one of the thinlto modules).
   bool IsDefinition;
+
   // For functions: true if this function is either defined or used in a thinlto
   // module and its jumptable entry needs to be exported to thinlto backends.
   bool IsExported;
 
-  friend TrailingObjects;
   size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
 
 public:
@@ -231,15 +273,19 @@ public:
                             GTM->getTrailingObjects<MDNode *>());
     return GTM;
   }
+
   GlobalObject *getGlobal() const {
     return GO;
   }
+
   bool isDefinition() const {
     return IsDefinition;
   }
+
   bool isExported() const {
     return IsExported;
   }
+
   ArrayRef<MDNode *> types() const {
     return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
   }
@@ -258,6 +304,7 @@ class LowerTypeTestsModule {
   IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
   IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
   PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+  ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
   IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
   PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
   IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
@@ -307,7 +354,8 @@ class LowerTypeTestsModule {
 
   Function *WeakInitializerFn = nullptr;
 
-  void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+  bool shouldExportConstantsAsAbsoluteSymbols();
+  uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
   TypeIdLowering importTypeId(StringRef TypeId);
   void importTypeTest(CallInst *CI);
   void importFunction(Function *F, bool isDefinition);
@@ -329,6 +377,7 @@ class LowerTypeTestsModule {
   unsigned getJumpTableEntrySize();
   Type *getJumpTableEntryType();
   void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+                            Triple::ArchType JumpTableArch,
                             SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
   void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
   void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
@@ -350,6 +399,7 @@ class LowerTypeTestsModule {
 public:
   LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
                        const ModuleSummaryIndex *ImportSummary);
+
   bool lower();
 
   // Lower the module using the action and summary passed as command line
@@ -385,11 +435,12 @@ struct LowerTypeTests : public ModulePass {
   }
 };
 
-} // anonymous namespace
+} // end anonymous namespace
+
+char LowerTypeTests::ID = 0;
 
 INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
                 false)
-char LowerTypeTests::ID = 0;
 
 ModulePass *
 llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
@@ -473,6 +524,8 @@ void LowerTypeTestsModule::allocateByteArrays() {
     BAI->MaskGlobal->replaceAllUsesWith(
         ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
     BAI->MaskGlobal->eraseFromParent();
+    if (BAI->MaskPtr)
+      *BAI->MaskPtr = Mask;
   }
 
   Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
@@ -634,6 +687,10 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
                            Br->getMetadata(LLVMContext::MD_prof));
         ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
 
+        // Update phis in Else resulting from InitialBB being split
+        for (auto &Phi : Else->phis())
+          Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB);
+
         IRBuilder<> ThenB(CI);
         return createBitSetTest(ThenB, TIL, BitOffset);
       }
@@ -720,13 +777,21 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
   }
 }
 
+bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() {
+  return (Arch == Triple::x86 || Arch == Triple::x86_64) &&
+         ObjectFormat == Triple::ELF;
+}
+
 /// Export the given type identifier so that ThinLTO backends may import it.
 /// Type identifiers are exported by adding coarse-grained information about how
 /// to test the type identifier to the summary, and creating symbols in the
 /// object file (aliases and absolute symbols) containing fine-grained
 /// information about the type identifier.
-void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
-                                        const TypeIdLowering &TIL) {
+///
+/// Returns a pointer to the location in which to store the bitmask, if
+/// applicable.
+uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+                                            const TypeIdLowering &TIL) {
   TypeTestResolution &TTRes =
       ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
   TTRes.TheKind = TIL.TheKind;
@@ -738,14 +803,21 @@ void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
     GA->setVisibility(GlobalValue::HiddenVisibility);
   };
 
+  auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) {
+    if (shouldExportConstantsAsAbsoluteSymbols())
+      ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy));
+    else
+      Storage = cast<ConstantInt>(C)->getZExtValue();
+  };
+
   if (TIL.TheKind != TypeTestResolution::Unsat)
     ExportGlobal("global_addr", TIL.OffsetedGlobal);
 
   if (TIL.TheKind == TypeTestResolution::ByteArray ||
       TIL.TheKind == TypeTestResolution::Inline ||
       TIL.TheKind == TypeTestResolution::AllOnes) {
-    ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy));
-    ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy));
+    ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2);
+    ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1);
 
     uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
     if (TIL.TheKind == TypeTestResolution::Inline)
@@ -756,12 +828,16 @@ void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
 
   if (TIL.TheKind == TypeTestResolution::ByteArray) {
     ExportGlobal("byte_array", TIL.TheByteArray);
-    ExportGlobal("bit_mask", TIL.BitMask);
+    if (shouldExportConstantsAsAbsoluteSymbols())
+      ExportGlobal("bit_mask", TIL.BitMask);
+    else
+      return &TTRes.BitMask;
   }
 
   if (TIL.TheKind == TypeTestResolution::Inline)
-    ExportGlobal("inline_bits",
-                 ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy));
+    ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits);
+
+  return nullptr;
 }
 
 LowerTypeTestsModule::TypeIdLowering
@@ -774,16 +850,34 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) {
   TypeIdLowering TIL;
   TIL.TheKind = TTRes.TheKind;
 
-  auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) {
-    Constant *C =
-        M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty);
-    auto *GV = dyn_cast<GlobalVariable>(C);
-    // We only need to set metadata if the global is newly created, in which
-    // case it would not have hidden visibility.
-    if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+  auto ImportGlobal = [&](StringRef Name) {
+    // Give the global a type of length 0 so that it is not assumed not to alias
+    // with any other global.
+    Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(),
+                                      Int8Arr0Ty);
+    if (auto *GV = dyn_cast<GlobalVariable>(C))
+      GV->setVisibility(GlobalValue::HiddenVisibility);
+    C = ConstantExpr::getBitCast(C, Int8PtrTy);
+    return C;
+  };
+
+  auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth,
+                            Type *Ty) {
+    if (!shouldExportConstantsAsAbsoluteSymbols()) {
+      Constant *C =
+          ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const);
+      if (!isa<IntegerType>(Ty))
+        C = ConstantExpr::getIntToPtr(C, Ty);
+      return C;
+    }
+
+    Constant *C = ImportGlobal(Name);
+    auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+    if (isa<IntegerType>(Ty))
+      C = ConstantExpr::getPtrToInt(C, Ty);
+    if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
       return C;
 
-    GV->setVisibility(GlobalValue::HiddenVisibility);
     auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
       auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
       auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
@@ -792,30 +886,30 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) {
     };
     if (AbsWidth == IntPtrTy->getBitWidth())
       SetAbsRange(~0ull, ~0ull); // Full set.
-    else if (AbsWidth)
+    else
       SetAbsRange(0, 1ull << AbsWidth);
     return C;
   };
 
   if (TIL.TheKind != TypeTestResolution::Unsat)
-    TIL.OffsetedGlobal = ImportGlobal("global_addr", 0);
+    TIL.OffsetedGlobal = ImportGlobal("global_addr");
 
   if (TIL.TheKind == TypeTestResolution::ByteArray ||
       TIL.TheKind == TypeTestResolution::Inline ||
       TIL.TheKind == TypeTestResolution::AllOnes) {
-    TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty);
-    TIL.SizeM1 = ConstantExpr::getPtrToInt(
-        ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy);
+    TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty);
+    TIL.SizeM1 =
+        ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy);
   }
 
   if (TIL.TheKind == TypeTestResolution::ByteArray) {
-    TIL.TheByteArray = ImportGlobal("byte_array", 0);
-    TIL.BitMask = ImportGlobal("bit_mask", 8);
+    TIL.TheByteArray = ImportGlobal("byte_array");
+    TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy);
   }
 
   if (TIL.TheKind == TypeTestResolution::Inline)
-    TIL.InlineBits = ConstantExpr::getPtrToInt(
-        ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth),
+    TIL.InlineBits = ImportConstant(
+        "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth,
         TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
 
   return TIL;
@@ -894,6 +988,7 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
       BSI.print(dbgs());
     });
 
+    ByteArrayInfo *BAI = nullptr;
     TypeIdLowering TIL;
     TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
         Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
@@ -915,15 +1010,18 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
     } else {
       TIL.TheKind = TypeTestResolution::ByteArray;
       ++NumByteArraysCreated;
-      ByteArrayInfo *BAI = createByteArray(BSI);
+      BAI = createByteArray(BSI);
       TIL.TheByteArray = BAI->ByteArray;
       TIL.BitMask = BAI->MaskGlobal;
     }
 
     TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
 
-    if (TIUI.IsExported)
-      exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+    if (TIUI.IsExported) {
+      uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+      if (BAI)
+        BAI->MaskPtr = MaskPtr;
+    }
 
     // Lower each call to llvm.type.test for this type identifier.
     for (CallInst *CI : TIUI.CallSites) {
@@ -979,15 +1077,16 @@ unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
 // constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
 void LowerTypeTestsModule::createJumpTableEntry(
     raw_ostream &AsmOS, raw_ostream &ConstraintOS,
-    SmallVectorImpl<Value *> &AsmArgs, Function *Dest) {
+    Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs,
+    Function *Dest) {
   unsigned ArgIndex = AsmArgs.size();
 
-  if (Arch == Triple::x86 || Arch == Triple::x86_64) {
+  if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) {
     AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
     AsmOS << "int3\nint3\nint3\n";
-  } else if (Arch == Triple::arm || Arch == Triple::aarch64) {
+  } else if (JumpTableArch == Triple::arm || JumpTableArch == Triple::aarch64) {
     AsmOS << "b $" << ArgIndex << "\n";
-  } else if (Arch == Triple::thumb) {
+  } else if (JumpTableArch == Triple::thumb) {
     AsmOS << "b.w $" << ArgIndex << "\n";
   } else {
     report_fatal_error("Unsupported architecture for jump tables");
@@ -1074,6 +1173,46 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
   PlaceholderFn->eraseFromParent();
 }
 
+static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) {
+  Attribute TFAttr = F->getFnAttribute("target-features");
+  if (!TFAttr.hasAttribute(Attribute::None)) {
+    SmallVector<StringRef, 6> Features;
+    TFAttr.getValueAsString().split(Features, ',');
+    for (StringRef Feature : Features) {
+      if (Feature == "-thumb-mode")
+        return false;
+      else if (Feature == "+thumb-mode")
+        return true;
+    }
+  }
+
+  return ModuleArch == Triple::thumb;
+}
+
+// Each jump table must be either ARM or Thumb as a whole for the bit-test math
+// to work. Pick one that matches the majority of members to minimize interop
+// veneers inserted by the linker.
+static Triple::ArchType
+selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions,
+                           Triple::ArchType ModuleArch) {
+  if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb)
+    return ModuleArch;
+
+  unsigned ArmCount = 0, ThumbCount = 0;
+  for (const auto GTM : Functions) {
+    if (!GTM->isDefinition()) {
+      // PLT stubs are always ARM.
+      ++ArmCount;
+      continue;
+    }
+
+    Function *F = cast<Function>(GTM->getGlobal());
+    ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount);
+  }
+
+  return ArmCount > ThumbCount ? Triple::arm : Triple::thumb;
+}
+
 void LowerTypeTestsModule::createJumpTable(
     Function *F, ArrayRef<GlobalTypeMember *> Functions) {
   std::string AsmStr, ConstraintStr;
@@ -1081,8 +1220,10 @@ void LowerTypeTestsModule::createJumpTable(
   SmallVector<Value *, 16> AsmArgs;
   AsmArgs.reserve(Functions.size() * 2);
 
+  Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch);
+
   for (unsigned I = 0; I != Functions.size(); ++I)
-    createJumpTableEntry(AsmOS, ConstraintOS, AsmArgs,
+    createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
                          cast<Function>(Functions[I]->getGlobal()));
 
   // Try to emit the jump table at the end of the text segment.
@@ -1098,11 +1239,15 @@ void LowerTypeTestsModule::createJumpTable(
   // Luckily, this function does not get any prologue even without the
   // attribute.
   if (OS != Triple::Win32)
-    F->addFnAttr(llvm::Attribute::Naked);
-  // Thumb jump table assembly needs Thumb2. The following attribute is added by
-  // Clang for -march=armv7.
-  if (Arch == Triple::thumb)
+    F->addFnAttr(Attribute::Naked);
+  if (JumpTableArch == Triple::arm)
+    F->addFnAttr("target-features", "-thumb-mode");
+  if (JumpTableArch == Triple::thumb) {
+    F->addFnAttr("target-features", "+thumb-mode");
+    // Thumb jump table assembly needs Thumb2. The following attribute is added
+    // by Clang for -march=armv7.
     F->addFnAttr("target-cpu", "cortex-a8");
+  }
 
   BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
   IRBuilder<> IRB(BB);
@@ -1256,7 +1401,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
       FAlias->takeName(F);
       if (FAlias->hasName())
         F->setName(FAlias->getName() + ".cfi");
-      F->replaceAllUsesWith(FAlias);
+      F->replaceUsesExceptBlockAddr(FAlias);
     }
     if (!F->isDeclarationForLinker())
       F->setLinkage(GlobalValue::InternalLinkage);
@@ -1303,7 +1448,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
 
 void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
     ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
-  llvm::DenseMap<Metadata *, uint64_t> TypeIdIndices;
+  DenseMap<Metadata *, uint64_t> TypeIdIndices;
   for (unsigned I = 0; I != TypeIds.size(); ++I)
     TypeIdIndices[TypeIds[I]] = I;
 
@@ -1335,38 +1480,25 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
   for (auto &&MemSet : TypeMembers)
     GLB.addFragment(MemSet);
 
-  // Build the bitsets from this disjoint set.
-  if (Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal())) {
-    // Build a vector of global variables with the computed layout.
-    std::vector<GlobalTypeMember *> OrderedGVs(Globals.size());
-    auto OGI = OrderedGVs.begin();
-    for (auto &&F : GLB.Fragments) {
-      for (auto &&Offset : F) {
-        auto GV = dyn_cast<GlobalVariable>(Globals[Offset]->getGlobal());
-        if (!GV)
-          report_fatal_error("Type identifier may not contain both global "
-                             "variables and functions");
-        *OGI++ = Globals[Offset];
-      }
-    }
-
-    buildBitSetsFromGlobalVariables(TypeIds, OrderedGVs);
-  } else {
-    // Build a vector of functions with the computed layout.
-    std::vector<GlobalTypeMember *> OrderedFns(Globals.size());
-    auto OFI = OrderedFns.begin();
-    for (auto &&F : GLB.Fragments) {
-      for (auto &&Offset : F) {
-        auto Fn = dyn_cast<Function>(Globals[Offset]->getGlobal());
-        if (!Fn)
-          report_fatal_error("Type identifier may not contain both global "
-                             "variables and functions");
-        *OFI++ = Globals[Offset];
-      }
+  // Build a vector of globals with the computed layout.
+  bool IsGlobalSet =
+      Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal());
+  std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size());
+  auto OGTMI = OrderedGTMs.begin();
+  for (auto &&F : GLB.Fragments) {
+    for (auto &&Offset : F) {
+      if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal()))
+        report_fatal_error("Type identifier may not contain both global "
+                           "variables and functions");
+      *OGTMI++ = Globals[Offset];
     }
-
-    buildBitSetsFromFunctions(TypeIds, OrderedFns);
   }
+
+  // Build the bitsets from this disjoint set.
+  if (IsGlobalSet)
+    buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs);
+  else
+    buildBitSetsFromFunctions(TypeIds, OrderedGTMs);
 }
 
 /// Lower all type tests in this module.
@@ -1457,8 +1589,8 @@ bool LowerTypeTestsModule::lower() {
   // Equivalence class set containing type identifiers and the globals that
   // reference them. This is used to partition the set of type identifiers in
   // the module into disjoint sets.
-  typedef EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>
-      GlobalClassesTy;
+  using GlobalClassesTy =
+      EquivalenceClasses<PointerUnion<GlobalTypeMember *, Metadata *>>;
   GlobalClassesTy GlobalClasses;
 
   // Verify the type metadata and build a few data structures to let us
@@ -1473,7 +1605,7 @@ bool LowerTypeTestsModule::lower() {
     unsigned Index;
     std::vector<GlobalTypeMember *> RefGlobals;
   };
-  llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
+  DenseMap<Metadata *, TIInfo> TypeIdInfo;
   unsigned I = 0;
   SmallVector<MDNode *, 2> Types;
 
@@ -1512,14 +1644,27 @@ bool LowerTypeTestsModule::lower() {
               FunctionType::get(Type::getVoidTy(M.getContext()), false),
               GlobalVariable::ExternalLinkage, FunctionName, &M);
 
-        if (Linkage == CFL_Definition)
-          F->eraseMetadata(LLVMContext::MD_type);
+        // If the function is available_externally, remove its definition so
+        // that it is handled the same way as a declaration. Later we will try
+        // to create an alias using this function's linkage, which will fail if
+        // the linkage is available_externally. This will also result in us
+        // following the code path below to replace the type metadata.
+        if (F->hasAvailableExternallyLinkage()) {
+          F->setLinkage(GlobalValue::ExternalLinkage);
+          F->deleteBody();
+          F->setComdat(nullptr);
+          F->clearMetadata();
+        }
 
+        // If the function in the full LTO module is a declaration, replace its
+        // type metadata with the type metadata we found in cfi.functions. That
+        // metadata is presumed to be more accurate than the metadata attached
+        // to the declaration.
         if (F->isDeclaration()) {
           if (Linkage == CFL_WeakDeclaration)
             F->setLinkage(GlobalValue::ExternalWeakLinkage);
 
-          SmallVector<MDNode *, 2> Types;
+          F->eraseMetadata(LLVMContext::MD_type);
           for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
             F->addMetadata(LLVMContext::MD_type,
                            *cast<MDNode>(FuncMD->getOperand(I).get()));
@@ -1548,7 +1693,7 @@ bool LowerTypeTestsModule::lower() {
         GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
     for (MDNode *Type : Types) {
       verifyTypeMDNode(&GO, Type);
-      auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
+      auto &Info = TypeIdInfo[Type->getOperand(1)];
       Info.Index = ++I;
       Info.RefGlobals.push_back(GTM);
     }
@@ -1596,12 +1741,12 @@ bool LowerTypeTestsModule::lower() {
 
     for (auto &P : *ExportSummary) {
       for (auto &S : P.second.SummaryList) {
-        auto *FS = dyn_cast<FunctionSummary>(S.get());
-        if (!FS || !ExportSummary->isGlobalValueLive(FS))
+        if (!ExportSummary->isGlobalValueLive(S.get()))
           continue;
-        for (GlobalValue::GUID G : FS->type_tests())
-          for (Metadata *MD : MetadataByGUID[G])
-            AddTypeIdUse(MD).IsExported = true;
+        if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
+          for (GlobalValue::GUID G : FS->type_tests())
+            for (Metadata *MD : MetadataByGUID[G])
+              AddTypeIdUse(MD).IsExported = true;
       }
     }
   }
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 0e478ba607be..76b90391fbb1 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -89,28 +89,45 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueMap.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/FunctionComparator.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <set>
+#include <utility>
 #include <vector>
 
 using namespace llvm;
@@ -119,7 +136,6 @@ using namespace llvm;
 
 STATISTIC(NumFunctionsMerged, "Number of functions merged");
 STATISTIC(NumThunksWritten, "Number of thunks generated");
-STATISTIC(NumAliasesWritten, "Number of aliases generated");
 STATISTIC(NumDoubleWeak, "Number of new functions created");
 
 static cl::opt<unsigned> NumFunctionsForSanityCheck(
@@ -154,10 +170,12 @@ namespace {
 class FunctionNode {
   mutable AssertingVH<Function> F;
   FunctionComparator::FunctionHash Hash;
+
 public:
   // Note the hash is recalculated potentially multiple times, but it is cheap.
   FunctionNode(Function *F)
     : F(F), Hash(FunctionComparator::functionHash(*F))  {}
+
   Function *getFunc() const { return F; }
   FunctionComparator::FunctionHash getHash() const { return Hash; }
 
@@ -174,13 +192,12 @@ public:
 /// by considering all pointer types to be equivalent. Once identified,
 /// MergeFunctions will fold them by replacing a call to one to a call to a
 /// bitcast of the other.
-///
 class MergeFunctions : public ModulePass {
 public:
   static char ID;
+
   MergeFunctions()
-    : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(),
-      HasGlobalAliases(false) {
+    : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)) {
     initializeMergeFunctionsPass(*PassRegistry::getPassRegistry());
   }
 
@@ -191,8 +208,10 @@ private:
   // not need to become larger with another pointer.
   class FunctionNodeCmp {
     GlobalNumberState* GlobalNumbers;
+
   public:
     FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
+
     bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
       // Order first by hashes, then full function comparison.
       if (LHS.getHash() != RHS.getHash())
@@ -201,7 +220,7 @@ private:
       return FCmp.compare() == -1;
     }
   };
-  typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType;
+  using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>;
 
   GlobalNumberState GlobalNumbers;
 
@@ -209,9 +228,9 @@ private:
   /// analyzed again.
   std::vector<WeakTrackingVH> Deferred;
 
+#ifndef NDEBUG
   /// Checks the rules of order relation introduced among functions set.
   /// Returns true, if sanity check has been passed, and false if failed.
-#ifndef NDEBUG
   bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
 #endif
 
@@ -236,9 +255,6 @@ private:
   /// again.
   void mergeTwoFunctions(Function *F, Function *G);
 
-  /// Replace G with a thunk or an alias to F. Deletes G.
-  void writeThunkOrAlias(Function *F, Function *G);
-
   /// Fill PDIUnrelatedWL with instructions from the entry block that are
   /// unrelated to parameter related debug info.
   void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
@@ -256,29 +272,25 @@ private:
   /// delete G.
   void writeThunk(Function *F, Function *G);
 
-  /// Replace G with an alias to F. Deletes G.
-  void writeAlias(Function *F, Function *G);
-
   /// Replace function F with function G in the function tree.
   void replaceFunctionInTree(const FunctionNode &FN, Function *G);
 
   /// The set of all distinct functions. Use the insert() and remove() methods
   /// to modify it. The map allows efficient lookup and deferring of Functions.
   FnTreeType FnTree;
+
   // Map functions to the iterators of the FunctionNode which contains them
   // in the FnTree. This must be updated carefully whenever the FnTree is
   // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
   // dangling iterators into FnTree. The invariant that preserves this is that
   // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
   ValueMap<Function*, FnTreeType::iterator> FNodesInTree;
-
-  /// Whether or not the target supports global aliases.
-  bool HasGlobalAliases;
 };
 
 } // end anonymous namespace
 
 char MergeFunctions::ID = 0;
+
 INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false)
 
 ModulePass *llvm::createMergeFunctionsPass() {
@@ -454,19 +466,6 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
   }
 }
 
-// Replace G with an alias to F if possible, or else a thunk to F. Deletes G.
-void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
-  if (HasGlobalAliases && G->hasGlobalUnnamedAddr()) {
-    if (G->hasExternalLinkage() || G->hasLocalLinkage() ||
-        G->hasWeakLinkage()) {
-      writeAlias(F, G);
-      return;
-    }
-  }
-
-  writeThunk(F, G);
-}
-
 // Helper for writeThunk,
 // Selects proper bitcast operation,
 // but a bit simpler then CastInst::getCastOpcode.
@@ -499,7 +498,6 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
 // parameter debug info, from the entry block.
 void MergeFunctions::eraseInstsUnrelatedToPDI(
     std::vector<Instruction *> &PDIUnrelatedWL) {
-
   DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
                   "entry block) unrelated to parameter debug info from entry "
                   "block: {\n");
@@ -517,7 +515,6 @@ void MergeFunctions::eraseInstsUnrelatedToPDI(
 
 // Reduce G to its entry block.
 void MergeFunctions::eraseTail(Function *G) {
-
   std::vector<BasicBlock *> WorklistBB;
   for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
        BBI != BBE; ++BBI) {
@@ -542,7 +539,6 @@ void MergeFunctions::eraseTail(Function *G) {
 // PDIUnrelatedWL with such instructions.
 void MergeFunctions::filterInstsUnrelatedToPDI(
     BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
-
   std::set<Instruction *> PDIRelated;
   for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
        BI != BIE; ++BI) {
@@ -652,9 +648,18 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
 // call sites to point to F even when within the same translation unit.
 void MergeFunctions::writeThunk(Function *F, Function *G) {
   if (!G->isInterposable() && !MergeFunctionsPDI) {
-    // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
-    // above).
-    replaceDirectCallers(G, F);
+    if (G->hasGlobalUnnamedAddr()) {
+      // G might have been a key in our GlobalNumberState, and it's illegal
+      // to replace a key in ValueMap<GlobalValue *> with a non-global.
+      GlobalNumbers.erase(G);
+      // If G's address is not significant, replace it entirely.
+      Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+      G->replaceAllUsesWith(BitcastF);
+    } else {
+      // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+      // above).
+      replaceDirectCallers(G, F);
+    }
   }
 
   // If G was internal then we may have replaced all uses of G with F. If so,
@@ -665,6 +670,16 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
     return;
   }
 
+  // Don't merge tiny functions using a thunk, since it can just end up
+  // making the function larger.
+  if (F->size() == 1) {
+    if (F->front().size() <= 2) {
+      DEBUG(dbgs() << "writeThunk: " << F->getName()
+                   << " is too small to bother creating a thunk for\n");
+      return;
+    }
+  }
+
   BasicBlock *GEntryBlock = nullptr;
   std::vector<Instruction *> PDIUnrelatedWL;
   BasicBlock *BB = nullptr;
@@ -691,7 +706,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   SmallVector<Value *, 16> Args;
   unsigned i = 0;
   FunctionType *FFTy = F->getFunctionType();
-  for (Argument & AI : H->args()) {
+  for (Argument &AI : H->args()) {
     Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
     ++i;
   }
@@ -734,20 +749,6 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   ++NumThunksWritten;
 }
 
-// Replace G with an alias to F and delete G.
-void MergeFunctions::writeAlias(Function *F, Function *G) {
-  auto *GA = GlobalAlias::create(G->getLinkage(), "", F);
-  F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
-  GA->takeName(G);
-  GA->setVisibility(G->getVisibility());
-  removeUsers(G);
-  G->replaceAllUsesWith(GA);
-  G->eraseFromParent();
-
-  DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
-  ++NumAliasesWritten;
-}
-
 // Merge two equivalent functions. Upon completion, Function G is deleted.
 void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
   if (F->isInterposable()) {
@@ -763,19 +764,14 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
 
     unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment());
 
-    if (HasGlobalAliases) {
-      writeAlias(F, G);
-      writeAlias(F, H);
-    } else {
-      writeThunk(F, G);
-      writeThunk(F, H);
-    }
+    writeThunk(F, G);
+    writeThunk(F, H);
 
     F->setAlignment(MaxAlignment);
     F->setLinkage(GlobalValue::PrivateLinkage);
     ++NumDoubleWeak;
   } else {
-    writeThunkOrAlias(F, G);
+    writeThunk(F, G);
   }
 
   ++NumFunctionsMerged;
@@ -816,18 +812,6 @@ bool MergeFunctions::insert(Function *NewFunction) {
 
   const FunctionNode &OldF = *Result.first;
 
-  // Don't merge tiny functions, since it can just end up making the function
-  // larger.
-  // FIXME: Should still merge them if they are unnamed_addr and produce an
-  // alias.
-  if (NewFunction->size() == 1) {
-    if (NewFunction->front().size() <= 2) {
-      DEBUG(dbgs() << NewFunction->getName()
-                   << " is to small to bother merging\n");
-      return false;
-    }
-  }
-
   // Impose a total order (by name) on the replacement of functions. This is
   // important when operating on more than one module independently to prevent
   // cycles of thunks calling each other when the modules are linked together.
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 8840435af642..683655f1f68b 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -13,45 +13,126 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <tuple>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "partial-inlining"
 
 STATISTIC(NumPartialInlined,
           "Number of callsites functions partially inlined into.");
+STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
+                                        "cold outlined regions were partially "
+                                        "inlined into its caller(s).");
+STATISTIC(NumColdRegionsFound,
+           "Number of cold single entry/exit regions found.");
+STATISTIC(NumColdRegionsOutlined,
+           "Number of cold single entry/exit regions outlined.");
 
 // Command line option to disable partial-inlining. The default is false:
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
-                           cl::Hidden, cl::desc("Disable partial ininling"));
+                           cl::Hidden, cl::desc("Disable partial inlining"));
+// Command line option to disable multi-region partial-inlining. The default is
+// false:
+static cl::opt<bool> DisableMultiRegionPartialInline(
+    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
+    cl::desc("Disable multi-region partial inlining"));
+
+// Command line option to force outlining in regions with live exit variables.
+// The default is false:
+static cl::opt<bool>
+    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
+               cl::desc("Force outline regions with live exits"));
+
+// Command line option to enable marking outline functions with Cold Calling
+// Convention. The default is false:
+static cl::opt<bool>
+    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
+                       cl::desc("Mark outline function calls with ColdCC"));
+
+#ifndef NDEBUG
+// Command line option to debug partial-inlining. The default is none:
+static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
+                                          cl::init(false), cl::Hidden,
+                                          cl::desc("Trace partial inlining."));
+#endif
+
 // This is an option used by testing:
 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                       cl::init(false), cl::ZeroOrMore,
                                       cl::ReallyHidden,
                                       cl::desc("Skip Cost Analysis"));
+// Used to determine if a cold region is worth outlining based on
+// its inlining cost compared to the original function.  Default is set at 10%.
+// ie. if the cold region reduces the inlining cost of the original function by
+// at least 10%.
+static cl::opt<float> MinRegionSizeRatio(
+    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
+    cl::desc("Minimum ratio comparing relative sizes of each "
+             "outline candidate and original function"));
+// Used to tune the minimum number of execution counts needed in the predecessor
+// block to the cold edge. ie. confidence interval.
+static cl::opt<unsigned>
+    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
+                             cl::desc("Minimum block executions to consider "
+                                      "its BranchProbabilityInfo valid"));
+// Used to determine when an edge is considered cold. Default is set to 10%. ie.
+// if the branch probability is 10% or less, then it is deemed as 'cold'.
+static cl::opt<float> ColdBranchRatio(
+    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
+    cl::desc("Minimum BranchProbability to consider a region cold."));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
-    cl::desc("Max Number of Blocks  To be Partially Inlined"));
+    cl::desc("Max number of blocks to be partially inlined"));
 
 // Command line option to set the maximum number of partial inlining allowed
 // for the module. The default value of -1 means no limit.
@@ -75,9 +156,8 @@ static cl::opt<unsigned> ExtraOutliningPenalty(
 namespace {
 
 struct FunctionOutliningInfo {
-  FunctionOutliningInfo()
-      : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
-        ReturnBlockPreds() {}
+  FunctionOutliningInfo() = default;
+
   // Returns the number of blocks to be inlined including all blocks
   // in Entries and one return block.
   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
@@ -85,30 +165,69 @@ struct FunctionOutliningInfo {
   // A set of blocks including the function entry that guard
   // the region to be outlined.
   SmallVector<BasicBlock *, 4> Entries;
+
   // The return block that is not included in the outlined region.
-  BasicBlock *ReturnBlock;
+  BasicBlock *ReturnBlock = nullptr;
+
   // The dominating block of the region to be outlined.
-  BasicBlock *NonReturnBlock;
+  BasicBlock *NonReturnBlock = nullptr;
+
   // The set of blocks in Entries that that are predecessors to ReturnBlock
   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
 };
 
+struct FunctionOutliningMultiRegionInfo {
+  FunctionOutliningMultiRegionInfo()
+      : ORI() {}
+
+  // Container for outline regions
+  struct OutlineRegionInfo {
+    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
+                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+                      BasicBlock *ReturnBlock)
+        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
+          ReturnBlock(ReturnBlock) {}
+    SmallVector<BasicBlock *, 8> Region;
+    BasicBlock *EntryBlock;
+    BasicBlock *ExitBlock;
+    BasicBlock *ReturnBlock;
+  };
+
+  SmallVector<OutlineRegionInfo, 4> ORI;
+};
+
 struct PartialInlinerImpl {
+
   PartialInlinerImpl(
       std::function<AssumptionCache &(Function &)> *GetAC,
       std::function<TargetTransformInfo &(Function &)> *GTTI,
       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
-      ProfileSummaryInfo *ProfSI)
-      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
+      ProfileSummaryInfo *ProfSI,
+      std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
+      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI),
+        GetORE(GORE) {}
+
   bool run(Module &M);
-  Function *unswitchFunction(Function *F);
+  // Main part of the transformation that calls helper functions to find
+  // outlining candidates, clone & outline the function, and attempt to
+  // partially inline the resulting function. Returns true if
+  // inlining was successful, false otherwise.  Also returns the outline
+  // function (only if we partially inlined early returns) as there is a
+  // possibility to further "peel" early return statements that were left in the
+  // outline function due to code size.
+  std::pair<bool, Function *> unswitchFunction(Function *F);
 
   // This class speculatively clones the the function to be partial inlined.
   // At the end of partial inlining, the remaining callsites to the cloned
   // function that are not partially inlined will be fixed up to reference
   // the original function, and the cloned function will be erased.
   struct FunctionCloner {
-    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+    // Two constructors, one for single region outlining, the other for
+    // multi-region outlining.
+    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
+                   OptimizationRemarkEmitter &ORE);
+    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
+                   OptimizationRemarkEmitter &ORE);
     ~FunctionCloner();
 
     // Prepare for function outlining: making sure there is only
@@ -116,20 +235,34 @@ struct PartialInlinerImpl {
     // the return block.
     void NormalizeReturnBlock();
 
-    // Do function outlining:
-    Function *doFunctionOutlining();
+    // Do function outlining for cold regions.
+    bool doMultiRegionFunctionOutlining();
+    // Do function outlining for region after early return block(s).
+    // NOTE: For vararg functions that do the vararg handling in the outlined
+    //       function, we temporarily generate IR that does not properly
+    //       forward varargs to the outlined function. Calling InlineFunction
+    //       will update calls to the outlined functions to properly forward
+    //       the varargs.
+    Function *doSingleRegionFunctionOutlining();
 
     Function *OrigFunc = nullptr;
     Function *ClonedFunc = nullptr;
-    Function *OutlinedFunc = nullptr;
-    BasicBlock *OutliningCallBB = nullptr;
+
+    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
+    // Keep track of Outlined Functions and the basic block they're called from.
+    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
+
     // ClonedFunc is inlined in one of its callers after function
     // outlining.
     bool IsFunctionInlined = false;
     // The cost of the region to be outlined.
     int OutlinedRegionCost = 0;
+    // ClonedOI is specific to outlining non-early return blocks.
     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+    // ClonedOMRI is specific to outlining cold regions.
+    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+    OptimizationRemarkEmitter &ORE;
   };
 
 private:
@@ -138,6 +271,7 @@ private:
   std::function<TargetTransformInfo &(Function &)> *GetTTI;
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
+  std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
 
   // Return the frequency of the OutlininingBB relative to F's entry point.
   // The result is no larger than 1 and is represented using BP.
@@ -148,8 +282,7 @@ private:
   // Return true if the callee of CS should be partially inlined with
   // profit.
   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
-                           BlockFrequency WeightedOutliningRcost,
-                           OptimizationRemarkEmitter &ORE);
+                           BlockFrequency WeightedOutliningRcost);
 
   // Try to inline DuplicateFunction (cloned from F with call to
   // the OutlinedFunction into its callers. Return true
@@ -196,17 +329,20 @@ private:
   // - The second value is the estimated size of the new call sequence in
   //   basic block Cloner.OutliningCallBB;
   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
+
   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
   // approximate both the size and runtime cost (Note that in the current
   // inline cost analysis, there is no clear distinction there either).
   static int computeBBInlineCost(BasicBlock *BB);
 
   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
+  std::unique_ptr<FunctionOutliningMultiRegionInfo>
+  computeOutliningColdRegionsInfo(Function *F);
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
+
   PartialInlinerLegacyPass() : ModulePass(ID) {
     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
   }
@@ -216,6 +352,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
     AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
   }
+
   bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
@@ -225,6 +362,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
         &getAnalysis<TargetTransformInfoWrapperPass>();
     ProfileSummaryInfo *PSI =
         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+    std::unique_ptr<OptimizationRemarkEmitter> UPORE;
 
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&ACT](Function &F) -> AssumptionCache & {
@@ -236,9 +374,185 @@ struct PartialInlinerLegacyPass : public ModulePass {
       return TTIWP->getTTI(F);
     };
 
-    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
+    std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+        [&UPORE](Function &F) -> OptimizationRemarkEmitter & {
+      UPORE.reset(new OptimizationRemarkEmitter(&F));
+      return *UPORE.get();
+    };
+
+    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI,
+                              &GetORE)
+        .run(M);
   }
 };
+
+} // end anonymous namespace
+
+std::unique_ptr<FunctionOutliningMultiRegionInfo>
+PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
+  BasicBlock *EntryBlock = &F->front();
+
+  DominatorTree DT(*F);
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*F, LI);
+  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
+  BlockFrequencyInfo *BFI;
+  if (!GetBFI) {
+    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
+    BFI = ScopedBFI.get();
+  } else
+    BFI = &(*GetBFI)(*F);
+
+  auto &ORE = (*GetORE)(*F);
+
+  // Return if we don't have profiling information.
+  if (!PSI->hasInstrumentationProfile())
+    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
+
+  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
+      llvm::make_unique<FunctionOutliningMultiRegionInfo>();
+
+  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
+    BasicBlock *Dom = BlockList.front();
+    return BlockList.size() > 1 &&
+           std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
+  };
+
+  auto IsSingleExit =
+      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
+    BasicBlock *ExitBlock = nullptr;
+    for (auto *Block : BlockList) {
+      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
+        if (!is_contained(BlockList, *SI)) {
+          if (ExitBlock) {
+            ORE.emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
+                                              &SI->front())
+                     << "Region dominated by "
+                     << ore::NV("Block", BlockList.front()->getName())
+                     << " has more than one region exit edge.";
+            });
+            return nullptr;
+          } else
+            ExitBlock = Block;
+        }
+      }
+    }
+    return ExitBlock;
+  };
+
+  auto BBProfileCount = [BFI](BasicBlock *BB) {
+    return BFI->getBlockProfileCount(BB)
+               ? BFI->getBlockProfileCount(BB).getValue()
+               : 0;
+  };
+
+  // Use the same computeBBInlineCost function to compute the cost savings of
+  // the outlining the candidate region.
+  int OverallFunctionCost = 0;
+  for (auto &BB : *F)
+    OverallFunctionCost += computeBBInlineCost(&BB);
+
+#ifndef NDEBUG
+  if (TracePartialInlining)
+    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
+#endif
+  int MinOutlineRegionCost =
+      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
+  BranchProbability MinBranchProbability(
+      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
+      MinBlockCounterExecution);
+  bool ColdCandidateFound = false;
+  BasicBlock *CurrEntry = EntryBlock;
+  std::vector<BasicBlock *> DFS;
+  DenseMap<BasicBlock *, bool> VisitedMap;
+  DFS.push_back(CurrEntry);
+  VisitedMap[CurrEntry] = true;
+  // Use Depth First Search on the basic blocks to find CFG edges that are
+  // considered cold.
+  // Cold regions considered must also have its inline cost compared to the
+  // overall inline cost of the original function.  The region is outlined only
+  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
+  // more.
+  while (!DFS.empty()) {
+    auto *thisBB = DFS.back();
+    DFS.pop_back();
+    // Only consider regions with predecessor blocks that are considered
+    // not-cold (default: part of the top 99.99% of all block counters)
+    // AND greater than our minimum block execution count (default: 100).
+    if (PSI->isColdBB(thisBB, BFI) ||
+        BBProfileCount(thisBB) < MinBlockCounterExecution)
+      continue;
+    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
+      if (VisitedMap[*SI])
+        continue;
+      VisitedMap[*SI] = true;
+      DFS.push_back(*SI);
+      // If branch isn't cold, we skip to the next one.
+      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
+      if (SuccProb > MinBranchProbability)
+        continue;
+#ifndef NDEBUG
+      if (TracePartialInlining) {
+        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
+               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
+               << "\n";
+      }
+#endif
+      SmallVector<BasicBlock *, 8> DominateVector;
+      DT.getDescendants(*SI, DominateVector);
+      // We can only outline single entry regions (for now).
+      if (!IsSingleEntry(DominateVector))
+        continue;
+      BasicBlock *ExitBlock = nullptr;
+      // We can only outline single exit regions (for now).
+      if (!(ExitBlock = IsSingleExit(DominateVector)))
+        continue;
+      int OutlineRegionCost = 0;
+      for (auto *BB : DominateVector)
+        OutlineRegionCost += computeBBInlineCost(BB);
+
+#ifndef NDEBUG
+      if (TracePartialInlining)
+        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
+#endif
+
+      if (OutlineRegionCost < MinOutlineRegionCost) {
+        ORE.emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
+                                            &SI->front())
+                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
+                 << ore::NV("Cost", MinOutlineRegionCost);
+        });
+        continue;
+      }
+      // For now, ignore blocks that belong to a SISE region that is a
+      // candidate for outlining.  In the future, we may want to look
+      // at inner regions because the outer region may have live-exit
+      // variables.
+      for (auto *BB : DominateVector)
+        VisitedMap[BB] = true;
+      // ReturnBlock here means the block after the outline call
+      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
+      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
+      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
+          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
+      RegInfo.Region = DominateVector;
+      OutliningInfo->ORI.push_back(RegInfo);
+#ifndef NDEBUG
+      if (TracePartialInlining) {
+        dbgs() << "Found Cold Candidate starting at block: "
+               << DominateVector.front()->getName() << "\n";
+      }
+#endif
+      ColdCandidateFound = true;
+      NumColdRegionsFound++;
+    }
+  }
+  if (ColdCandidateFound)
+    return OutliningInfo;
+  else
+    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
 }
 
 std::unique_ptr<FunctionOutliningInfo>
@@ -319,7 +633,6 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
 
     OutliningInfo->Entries.push_back(CurrEntry);
     CurrEntry = OtherSucc;
-
   } while (true);
 
   if (!CandidateFound)
@@ -413,15 +726,19 @@ static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
 
 BranchProbability
 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
-
+  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
   auto EntryFreq =
       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
   auto OutliningCallFreq =
-      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
-
-  auto OutlineRegionRelFreq =
-      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
-                                              EntryFreq.getFrequency());
+      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
+  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
+  // we outlined any regions, so we may encounter situations where the
+  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
+  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
+    OutliningCallFreq = EntryFreq;
+  }
+  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
+      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
 
   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
     return OutlineRegionRelFreq;
@@ -448,10 +765,10 @@ PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
 }
 
 bool PartialInlinerImpl::shouldPartialInline(
-    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
-    OptimizationRemarkEmitter &ORE) {
-
+    CallSite CS, FunctionCloner &Cloner,
+    BlockFrequency WeightedOutliningRcost) {
   using namespace ore;
+
   if (SkipCostAnalysis)
     return true;
 
@@ -461,30 +778,37 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   Function *Caller = CS.getCaller();
   auto &CalleeTTI = (*GetTTI)(*Callee);
+  auto &ORE = (*GetORE)(*Caller);
   InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
-                                *GetAssumptionCache, GetBFI, PSI);
+                                *GetAssumptionCache, GetBFI, PSI, &ORE);
 
   if (IC.isAlways()) {
-    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+    ORE.emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
              << NV("Callee", Cloner.OrigFunc)
-             << " should always be fully inlined, not partially");
+             << " should always be fully inlined, not partially";
+    });
     return false;
   }
 
   if (IC.isNever()) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller)
-             << " because it should never be inlined (cost=never)");
+             << " because it should never be inlined (cost=never)";
+    });
     return false;
   }
 
   if (!IC) {
-    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+    ORE.emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
-             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+    });
     return false;
   }
   const DataLayout &DL = Caller->getParent()->getDataLayout();
@@ -495,23 +819,28 @@ bool PartialInlinerImpl::shouldPartialInline(
 
   // Weighted saving is smaller than weighted cost, return false
   if (NormWeightedSavings < WeightedOutliningRcost) {
-    ORE.emit(
-        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
-        << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
-        << NV("Caller", Caller) << " runtime overhead (overhead="
-        << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
-        << ", savings="
-        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
-        << " of making the outlined call is too high");
+    ORE.emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
+                                        Call)
+             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+             << NV("Caller", Caller) << " runtime overhead (overhead="
+             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
+             << ", savings="
+             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
+             << ")"
+             << " of making the outlined call is too high";
+    });
 
     return false;
   }
 
-  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
+  ORE.emit([&]() {
+    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
            << " (threshold="
-           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+  });
   return true;
 }
 
@@ -566,23 +895,26 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
 
 std::tuple<int, int>
 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
+  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
+  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
+    Function *OutlinedFunc = FuncBBPair.first;
+    BasicBlock* OutliningCallBB = FuncBBPair.second;
+    // Now compute the cost of the call sequence to the outlined function
+    // 'OutlinedFunction' in BB 'OutliningCallBB':
+    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
 
-  // Now compute the cost of the call sequence to the outlined function
-  // 'OutlinedFunction' in BB 'OutliningCallBB':
-  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
-
-  // Now compute the cost of the extracted/outlined function itself:
-  int OutlinedFunctionCost = 0;
-  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
-    OutlinedFunctionCost += computeBBInlineCost(&BB);
+    // Now compute the cost of the extracted/outlined function itself:
+    for (BasicBlock &BB : *OutlinedFunc)
+      OutlinedFunctionCost += computeBBInlineCost(&BB);
   }
-
   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
          "Outlined function cost should be no less than the outlined region");
+
   // The code extractor introduces a new root and exit stub blocks with
   // additional unconditional branches. Those branches will be eliminated
   // later with bb layout. The cost should be adjusted accordingly:
-  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
+  OutlinedFunctionCost -=
+      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
 
   int OutliningRuntimeOverhead =
       OutliningFuncCallCost +
@@ -636,9 +968,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
   }
 }
 
-PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
-                                                   FunctionOutliningInfo *OI)
-    : OrigFunc(F) {
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
+    : OrigFunc(F), ORE(ORE) {
   ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
 
   // Clone the function, so that we can hack away on it.
@@ -659,8 +991,39 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
   F->replaceAllUsesWith(ClonedFunc);
 }
 
-void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+    Function *F, FunctionOutliningMultiRegionInfo *OI,
+    OptimizationRemarkEmitter &ORE)
+    : OrigFunc(F), ORE(ORE) {
+  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();
+
+  // Clone the function, so that we can hack away on it.
+  ValueToValueMapTy VMap;
+  ClonedFunc = CloneFunction(F, VMap);
+
+  // Go through all Outline Candidate Regions and update all BasicBlock
+  // information.
+  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+       OI->ORI) {
+    SmallVector<BasicBlock *, 8> Region;
+    for (BasicBlock *BB : RegionInfo.Region) {
+      Region.push_back(cast<BasicBlock>(VMap[BB]));
+    }
+    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
+    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
+    BasicBlock *NewReturnBlock = nullptr;
+    if (RegionInfo.ReturnBlock)
+      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
+    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
+        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
+    ClonedOMRI->ORI.push_back(MappedRegionInfo);
+  }
+  // Go ahead and update all uses to the duplicate, so that we can just
+  // use the inliner functionality when we're done hacking.
+  F->replaceAllUsesWith(ClonedFunc);
+}
 
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
   auto getFirstPHI = [](BasicBlock *BB) {
     BasicBlock::iterator I = BB->begin();
     PHINode *FirstPhi = nullptr;
@@ -676,6 +1039,11 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
     return FirstPhi;
   };
 
+  // Shouldn't need to normalize PHIs if we're not outlining non-early return
+  // blocks.
+  if (!ClonedOI)
+    return;
+
   // Special hackery is needed with PHI nodes that have inputs from more than
   // one extracted block.  For simplicity, just split the PHIs into a two-level
   // sequence of PHIs, some of which will go in the extracted region, and some
@@ -726,16 +1094,90 @@ void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
       DeadPhis.push_back(OldPhi);
     }
     ++I;
-    }
-    for (auto *DP : DeadPhis)
-      DP->eraseFromParent();
+  }
+  for (auto *DP : DeadPhis)
+    DP->eraseFromParent();
+
+  for (auto E : ClonedOI->ReturnBlockPreds) {
+    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+  }
+}
+
+bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
+
+  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
+    int Cost = 0;
+    for (BasicBlock* BB : Region)
+      Cost += computeBBInlineCost(BB);
+    return Cost;
+  };
+
+  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
+
+  if (ClonedOMRI->ORI.empty())
+    return false;
+
+  // The CodeExtractor needs a dominator tree.
+  DominatorTree DT;
+  DT.recalculate(*ClonedFunc);
+
+  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*ClonedFunc, LI);
+  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
 
-    for (auto E : ClonedOI->ReturnBlockPreds) {
-      E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+  SetVector<Value *> Inputs, Outputs, Sinks;
+  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+       ClonedOMRI->ORI) {
+    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
+
+    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
+                     ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);
+
+    CE.findInputsOutputs(Inputs, Outputs, Sinks);
+
+#ifndef NDEBUG
+    if (TracePartialInlining) {
+      dbgs() << "inputs: " << Inputs.size() << "\n";
+      dbgs() << "outputs: " << Outputs.size() << "\n";
+      for (Value *value : Inputs)
+        dbgs() << "value used in func: " << *value << "\n";
+      for (Value *output : Outputs)
+        dbgs() << "instr used in func: " << *output << "\n";
     }
+#endif
+    // Do not extract regions that have live exit variables.
+    if (Outputs.size() > 0 && !ForceLiveExit)
+      continue;
+
+    Function *OutlinedFunc = CE.extractCodeRegion();
+
+    if (OutlinedFunc) {
+      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
+      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
+      assert(OutliningCallBB->getParent() == ClonedFunc);
+      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
+      NumColdRegionsOutlined++;
+      OutlinedRegionCost += CurrentOutlinedRegionCost;
+
+      if (MarkOutlinedColdCC) {
+        OutlinedFunc->setCallingConv(CallingConv::Cold);
+        OCS.setCallingConv(CallingConv::Cold);
+      }
+    } else
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+                                        &RegionInfo.Region.front()->front())
+               << "Failed to extract region at block "
+               << ore::NV("Block", RegionInfo.Region.front());
+      });
+  }
+
+  return !OutlinedFunctions.empty();
 }
 
-Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
+Function *
+PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
   // Returns true if the block is to be partial inlined into the caller
   // (i.e. not to be extracted to the out of line function)
   auto ToBeInlined = [&, this](BasicBlock *BB) {
@@ -744,6 +1186,16 @@ Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
             ClonedOI->Entries.end());
   };
 
+  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
+  // The CodeExtractor needs a dominator tree.
+  DominatorTree DT;
+  DT.recalculate(*ClonedFunc);
+
+  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*ClonedFunc, LI);
+  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
   // Gather up the blocks that we're going to extract.
   std::vector<BasicBlock *> ToExtract;
   ToExtract.push_back(ClonedOI->NonReturnBlock);
@@ -759,26 +1211,27 @@ Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
       OutlinedRegionCost += computeBBInlineCost(&BB);
     }
 
-  // The CodeExtractor needs a dominator tree.
-  DominatorTree DT;
-  DT.recalculate(*ClonedFunc);
-
-  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
-  LoopInfo LI(DT);
-  BranchProbabilityInfo BPI(*ClonedFunc, LI);
-  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
-
   // Extract the body of the if.
-  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
-                               ClonedFuncBFI.get(), &BPI)
-                     .extractCodeRegion();
+  Function *OutlinedFunc =
+      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+                    ClonedFuncBFI.get(), &BPI,
+                    /* AllowVarargs */ true)
+          .extractCodeRegion();
 
   if (OutlinedFunc) {
-    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
-        .getInstruction()
-        ->getParent();
+    BasicBlock *OutliningCallBB =
+        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+            .getInstruction()
+            ->getParent();
     assert(OutliningCallBB->getParent() == ClonedFunc);
-  }
+    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
+  } else
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+                                      &ToExtract.front()->front())
+             << "Failed to extract region at block "
+             << ore::NV("Block", ToExtract.front());
+    });
 
   return OutlinedFunc;
 }
@@ -789,76 +1242,133 @@ PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
   ClonedFunc->replaceAllUsesWith(OrigFunc);
   ClonedFunc->eraseFromParent();
   if (!IsFunctionInlined) {
-    // Remove the function that is speculatively created if there is no
+    // Remove each function that was speculatively created if there is no
     // reference.
-    if (OutlinedFunc)
-      OutlinedFunc->eraseFromParent();
+    for (auto FuncBBPair : OutlinedFunctions) {
+      Function *Func = FuncBBPair.first;
+      Func->eraseFromParent();
+    }
   }
 }
 
-Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
 
   if (F->hasAddressTaken())
-    return nullptr;
+    return {false, nullptr};
 
   // Let inliner handle it
   if (F->hasFnAttribute(Attribute::AlwaysInline))
-    return nullptr;
+    return {false, nullptr};
 
   if (F->hasFnAttribute(Attribute::NoInline))
-    return nullptr;
+    return {false, nullptr};
 
   if (PSI->isFunctionEntryCold(F))
-    return nullptr;
+    return {false, nullptr};
 
   if (F->user_begin() == F->user_end())
-    return nullptr;
+    return {false, nullptr};
 
-  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+  auto &ORE = (*GetORE)(*F);
+
+  // Only try to outline cold regions if we have a profile summary, which
+  // implies we have profiling information.
+  if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() &&
+      !DisableMultiRegionPartialInline) {
+    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
+        computeOutliningColdRegionsInfo(F);
+    if (OMRI) {
+      FunctionCloner Cloner(F, OMRI.get(), ORE);
+
+#ifndef NDEBUG
+      if (TracePartialInlining) {
+        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
+        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
+               << "\n";
+      }
+#endif
+      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
+
+      if (DidOutline) {
+#ifndef NDEBUG
+        if (TracePartialInlining) {
+          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
+          Cloner.ClonedFunc->print(dbgs());
+          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
+        }
+#endif
 
+        if (tryPartialInline(Cloner))
+          return {true, nullptr};
+      }
+    }
+  }
+
+  // Fall-thru to regular partial inlining if we:
+  //    i) can't find any cold regions to outline, or
+  //   ii) can't inline the outlined function anywhere.
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
   if (!OI)
-    return nullptr;
+    return {false, nullptr};
 
-  FunctionCloner Cloner(F, OI.get());
+  FunctionCloner Cloner(F, OI.get(), ORE);
   Cloner.NormalizeReturnBlock();
-  Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
+
+  if (!OutlinedFunction)
+    return {false, nullptr};
 
   bool AnyInline = tryPartialInline(Cloner);
 
   if (AnyInline)
-    return OutlinedFunction;
+    return {true, OutlinedFunction};
 
-  return nullptr;
+  return {false, nullptr};
 }
 
 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
-  int NonWeightedRcost;
-  int SizeCost;
-
-  if (Cloner.OutlinedFunc == nullptr)
+  if (Cloner.OutlinedFunctions.empty())
     return false;
 
+  int SizeCost = 0;
+  BlockFrequency WeightedRcost;
+  int NonWeightedRcost;
   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
 
-  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
-  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
+  // Only calculate RelativeToEntryFreq when we are doing single region
+  // outlining.
+  BranchProbability RelativeToEntryFreq;
+  if (Cloner.ClonedOI) {
+    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+  } else
+    // RelativeToEntryFreq doesn't make sense when we have more than one
+    // outlined call because each call will have a different relative frequency
+    // to the entry block.  We can consider using the average, but the
+    // usefulness of that information is questionable. For now, assume we never
+    // execute the calls to outlined functions.
+    RelativeToEntryFreq = BranchProbability(0, 1);
+
+  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
 
-  // The call sequence to the outlined function is larger than the original
-  // outlined region size, it does not increase the chances of inlining
-  // the function with outlining (The inliner usies the size increase to
+  // The call sequence(s) to the outlined function(s) are larger than the sum of
+  // the original outlined region size(s), it does not increase the chances of
+  // inlining the function with outlining (The inliner uses the size increase to
   // model the cost of inlining a callee).
   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
-    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
+    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
     DebugLoc DLoc;
     BasicBlock *Block;
     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
-    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+    ORE.emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                         DLoc, Block)
              << ore::NV("Function", Cloner.OrigFunc)
              << " not partially inlined into callers (Original Size = "
              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
              << ", Size of call sequence to outlined function = "
-             << ore::NV("NewSize", SizeCost) << ")");
+             << ore::NV("NewSize", SizeCost) << ")";
+    });
     return false;
   }
 
@@ -882,18 +1392,26 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     if (IsLimitReached())
       continue;
 
-    OptimizationRemarkEmitter ORE(CS.getCaller());
 
-    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
+    if (!shouldPartialInline(CS, Cloner, WeightedRcost))
       continue;
 
-    ORE.emit(
-        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
-        << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
-        << ore::NV("Caller", CS.getCaller()));
+    auto &ORE = (*GetORE)(*CS.getCaller());
+    // Construct remark before doing the inlining, as after successful inlining
+    // the callsite is removed.
+    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
+    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+       << ore::NV("Caller", CS.getCaller());
 
     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
-    InlineFunction(CS, IFI);
+    // We can only forward varargs when we outlined a single region, else we
+    // bail on vararg functions.
+    if (!InlineFunction(CS, IFI, nullptr, true,
+                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
+                                         : nullptr)))
+      continue;
+
+    ORE.emit(OR);
 
     // Now update the entry count:
     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
@@ -904,13 +1422,23 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     AnyInline = true;
     NumPartialInlining++;
     // Update the stats
-    NumPartialInlined++;
+    if (Cloner.ClonedOI)
+      NumPartialInlined++;
+    else
+      NumColdOutlinePartialInlined++;
+
   }
 
   if (AnyInline) {
     Cloner.IsFunctionInlined = true;
     if (CalleeEntryCount)
       Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
+             << "Partially inlined into at least one caller";
+    });
+
   }
 
   return AnyInline;
@@ -944,8 +1472,10 @@ bool PartialInlinerImpl::run(Module &M) {
     if (Recursive)
       continue;
 
-    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
-      Worklist.push_back(NewFunc);
+    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
+    if (Result.second)
+      Worklist.push_back(Result.second);
+    if (Result.first) {
       Changed = true;
     }
   }
@@ -954,6 +1484,7 @@ bool PartialInlinerImpl::run(Module &M) {
 }
 
 char PartialInlinerLegacyPass::ID = 0;
+
 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                       "Partial Inliner", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
@@ -985,9 +1516,15 @@ PreservedAnalyses PartialInlinerPass::run(Module &M,
     return FAM.getResult<TargetIRAnalysis>(F);
   };
 
+  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+      [&FAM](Function &F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  };
+
   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
 
-  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
+  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE)
+          .run(M))
     return PreservedAnalyses::none();
   return PreservedAnalyses::all();
 }
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 0b319f6a488b..3855e6245d8e 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -26,11 +26,9 @@
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
@@ -94,15 +92,6 @@ static cl::opt<bool> EnableLoopInterchange(
     "enable-loopinterchange", cl::init(false), cl::Hidden,
     cl::desc("Enable the new, experimental LoopInterchange Pass"));
 
-static cl::opt<bool> EnableNonLTOGlobalsModRef(
-    "enable-non-lto-gmr", cl::init(true), cl::Hidden,
-    cl::desc(
-        "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline."));
-
-static cl::opt<bool> EnableLoopLoadElim(
-    "enable-loop-load-elim", cl::init(true), cl::Hidden,
-    cl::desc("Enable the LoopLoadElimination Pass"));
-
 static cl::opt<bool>
     EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
                             cl::desc("Enable preparation for ThinLTO."));
@@ -160,7 +149,6 @@ PassManagerBuilder::PassManagerBuilder() {
     SizeLevel = 0;
     LibraryInfo = nullptr;
     Inliner = nullptr;
-    DisableUnitAtATime = false;
     DisableUnrollLoops = false;
     SLPVectorize = RunSLPVectorization;
     LoopVectorize = RunLoopVectorization;
@@ -251,6 +239,7 @@ void PassManagerBuilder::addInstructionCombiningPass(
 void PassManagerBuilder::populateFunctionPassManager(
     legacy::FunctionPassManager &FPM) {
   addExtensionsToPM(EP_EarlyAsPossible, FPM);
+  FPM.add(createEntryExitInstrumenterPass());
 
   // Add LibraryInfo if we have some.
   if (LibraryInfo)
@@ -428,6 +417,14 @@ void PassManagerBuilder::populateModulePassManager(
     else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
       MPM.add(createBarrierNoopPass());
 
+    if (PerformThinLTO) {
+      // Drop available_externally and unreferenced globals. This is necessary
+      // with ThinLTO in order to avoid leaving undefined references to dead
+      // globals in the object file.
+      MPM.add(createEliminateAvailableExternallyPass());
+      MPM.add(createGlobalDCEPass());
+    }
+
     addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
 
     // Rename anon globals to be able to export them in the summary.
@@ -464,23 +461,25 @@ void PassManagerBuilder::populateModulePassManager(
   if (PrepareForThinLTOUsingPGOSampleProfile)
     DisableUnrollLoops = true;
 
-  if (!DisableUnitAtATime) {
-    // Infer attributes about declarations if possible.
-    MPM.add(createInferFunctionAttrsLegacyPass());
+  // Infer attributes about declarations if possible.
+  MPM.add(createInferFunctionAttrsLegacyPass());
 
-    addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+  addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
-    MPM.add(createIPSCCPPass());          // IP SCCP
-    MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
-    // Promote any localized global vars.
-    MPM.add(createPromoteMemoryToRegisterPass());
+  if (OptLevel > 2)
+    MPM.add(createCallSiteSplittingPass());
 
-    MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+  MPM.add(createIPSCCPPass());          // IP SCCP
+  MPM.add(createCalledValuePropagationPass());
+  MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+  // Promote any localized global vars.
+  MPM.add(createPromoteMemoryToRegisterPass());
 
-    addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE
-    addExtensionsToPM(EP_Peephole, MPM);
-    MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
-  }
+  MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+  addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE
+  addExtensionsToPM(EP_Peephole, MPM);
+  MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
 
   // For SamplePGO in ThinLTO compile phase, we do not want to do indirect
   // call promotion as it will change the CFG too much to make the 2nd
@@ -490,21 +489,21 @@ void PassManagerBuilder::populateModulePassManager(
   if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile)
     addPGOInstrPasses(MPM);
 
-  if (EnableNonLTOGlobalsModRef)
-    // We add a module alias analysis pass here. In part due to bugs in the
-    // analysis infrastructure this "works" in that the analysis stays alive
-    // for the entire SCC pass run below.
-    MPM.add(createGlobalsAAWrapperPass());
+  // We add a module alias analysis pass here. In part due to bugs in the
+  // analysis infrastructure this "works" in that the analysis stays alive
+  // for the entire SCC pass run below.
+  MPM.add(createGlobalsAAWrapperPass());
 
   // Start of CallGraph SCC passes.
-  if (!DisableUnitAtATime)
-    MPM.add(createPruneEHPass()); // Remove dead EH info
+  MPM.add(createPruneEHPass()); // Remove dead EH info
+  bool RunInliner = false;
   if (Inliner) {
     MPM.add(Inliner);
     Inliner = nullptr;
+    RunInliner = true;
   }
-  if (!DisableUnitAtATime)
-    MPM.add(createPostOrderFunctionAttrsLegacyPass());
+
+  MPM.add(createPostOrderFunctionAttrsLegacyPass());
   if (OptLevel > 2)
     MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
 
@@ -515,11 +514,11 @@ void PassManagerBuilder::populateModulePassManager(
   // pass manager that we are specifically trying to avoid. To prevent this
   // we must insert a no-op module pass to reset the pass manager.
   MPM.add(createBarrierNoopPass());
+
   if (RunPartialInlining)
     MPM.add(createPartialInliningPass());
 
-  if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO &&
-      !PrepareForThinLTO)
+  if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
     // Remove avail extern fns and globals definitions if we aren't
     // compiling an object file for later LTO. For LTO we want to preserve
     // these so they are eligible for inlining at link-time. Note if they
@@ -531,15 +530,26 @@ void PassManagerBuilder::populateModulePassManager(
     // and saves running remaining passes on the eliminated functions.
     MPM.add(createEliminateAvailableExternallyPass());
 
-  if (!DisableUnitAtATime)
-    MPM.add(createReversePostOrderFunctionAttrsPass());
+  MPM.add(createReversePostOrderFunctionAttrsPass());
+
+  // The inliner performs some kind of dead code elimination as it goes,
+  // but there are cases that are not really caught by it. We might
+  // at some point consider teaching the inliner about them, but it
+  // is OK for now to run GlobalOpt + GlobalDCE in tandem as their
+  // benefits generally outweight the cost, making the whole pipeline
+  // faster.
+  if (RunInliner) {
+    MPM.add(createGlobalOptimizerPass());
+    MPM.add(createGlobalDCEPass());
+  }
 
   // If we are planning to perform ThinLTO later, let's not bloat the code with
   // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
   // during ThinLTO and perform the rest of the optimizations afterward.
   if (PrepareForThinLTO) {
-    // Reduce the size of the IR as much as possible.
-    MPM.add(createGlobalOptimizerPass());
+    // Ensure we perform any last passes, but do so before renaming anonymous
+    // globals in case the passes add any.
+    addExtensionsToPM(EP_OptimizerLast, MPM);
     // Rename anon globals to be able to export them in the summary.
     MPM.add(createNameAnonGlobalPass());
     return;
@@ -560,23 +570,22 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLICMPass());                  // Hoist loop invariants
   }
 
-  if (EnableNonLTOGlobalsModRef)
-    // We add a fresh GlobalsModRef run at this point. This is particularly
-    // useful as the above will have inlined, DCE'ed, and function-attr
-    // propagated everything. We should at this point have a reasonably minimal
-    // and richly annotated call graph. By computing aliasing and mod/ref
-    // information for all local globals here, the late loop passes and notably
-    // the vectorizer will be able to use them to help recognize vectorizable
-    // memory operations.
-    //
-    // Note that this relies on a bug in the pass manager which preserves
-    // a module analysis into a function pass pipeline (and throughout it) so
-    // long as the first function pass doesn't invalidate the module analysis.
-    // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
-    // this to work. Fortunately, it is trivial to preserve AliasAnalysis
-    // (doing nothing preserves it as it is required to be conservatively
-    // correct in the face of IR changes).
-    MPM.add(createGlobalsAAWrapperPass());
+  // We add a fresh GlobalsModRef run at this point. This is particularly
+  // useful as the above will have inlined, DCE'ed, and function-attr
+  // propagated everything. We should at this point have a reasonably minimal
+  // and richly annotated call graph. By computing aliasing and mod/ref
+  // information for all local globals here, the late loop passes and notably
+  // the vectorizer will be able to use them to help recognize vectorizable
+  // memory operations.
+  //
+  // Note that this relies on a bug in the pass manager which preserves
+  // a module analysis into a function pass pipeline (and throughout it) so
+  // long as the first function pass doesn't invalidate the module analysis.
+  // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
+  // this to work. Fortunately, it is trivial to preserve AliasAnalysis
+  // (doing nothing preserves it as it is required to be conservatively
+  // correct in the face of IR changes).
+  MPM.add(createGlobalsAAWrapperPass());
 
   MPM.add(createFloat2IntPass());
 
@@ -597,8 +606,7 @@ void PassManagerBuilder::populateModulePassManager(
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
-  if (EnableLoopLoadElim)
-    MPM.add(createLoopLoadEliminationPass());
+  MPM.add(createLoopLoadEliminationPass());
 
   // FIXME: Because of #pragma vectorize enable, the passes below are always
   // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
@@ -622,6 +630,13 @@ void PassManagerBuilder::populateModulePassManager(
     addInstructionCombiningPass(MPM);
   }
 
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  MPM.add(createCFGSimplificationPass(1, true, true, false, true));
+
   if (RunSLPAfterLoopVectorization && SLPVectorize) {
     MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
     if (OptLevel > 1 && ExtraVectorizerPasses) {
@@ -630,7 +645,6 @@ void PassManagerBuilder::populateModulePassManager(
   }
 
   addExtensionsToPM(EP_Peephole, MPM);
-  MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables
   addInstructionCombiningPass(MPM);
 
   if (!DisableUnrollLoops) {
@@ -650,16 +664,14 @@ void PassManagerBuilder::populateModulePassManager(
   // about pointer alignments.
   MPM.add(createAlignmentFromAssumptionsPass());
 
-  if (!DisableUnitAtATime) {
-    // FIXME: We shouldn't bother with this anymore.
-    MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+  // FIXME: We shouldn't bother with this anymore.
+  MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
 
-    // GlobalOpt already deletes dead functions and globals, at -O2 try a
-    // late pass of GlobalDCE.  It is capable of deleting dead cycles.
-    if (OptLevel > 1) {
-      MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
-      MPM.add(createConstantMergePass());     // Merge dup global constants
-    }
+  // GlobalOpt already deletes dead functions and globals, at -O2 try a
+  // late pass of GlobalDCE.  It is capable of deleting dead cycles.
+  if (OptLevel > 1) {
+    MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+    MPM.add(createConstantMergePass());     // Merge dup global constants
   }
 
   if (MergeFunctions)
@@ -673,6 +685,11 @@ void PassManagerBuilder::populateModulePassManager(
   // Get rid of LCSSA nodes.
   MPM.add(createInstructionSimplifierPass());
 
+  // This hoists/decomposes div/rem ops. It should run after other sink/hoist
+  // passes to avoid re-sinking, but before SimplifyCFG because it can allow
+  // flattening of blocks.
+  MPM.add(createDivRemPairsPass());
+
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
   MPM.add(createCFGSimplificationPass());
@@ -695,6 +712,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createInferFunctionAttrsLegacyPass());
 
   if (OptLevel > 1) {
+    // Split call-site with more constrained arguments.
+    PM.add(createCallSiteSplittingPass());
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
@@ -706,6 +726,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
     // opens opportunities for globalopt (and inlining) by substituting function
     // pointers passed as arguments to direct uses of functions.
     PM.add(createIPSCCPPass());
+
+    // Attach metadata to indirect call sites indicating the set of functions
+    // they may target at run-time. This should follow IPSCCP.
+    PM.add(createCalledValuePropagationPass());
   }
 
   // Infer attributes about definitions. The readnone attribute in particular is
@@ -936,8 +960,7 @@ LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
 void
 LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
                                             LLVMBool Value) {
-  PassManagerBuilder *Builder = unwrap(PMB);
-  Builder->DisableUnitAtATime = Value;
+  // NOTE: The DisableUnitAtATime switch has been removed.
 }
 
 void
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 3fd59847a005..46b088189040 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -24,7 +24,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 6baada2c1ae1..f0e781b9d923 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -23,39 +23,65 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/SampleProfile.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Pass.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
 #include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/Format.h"
+#include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include <cctype>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 using namespace sampleprof;
@@ -67,34 +93,39 @@ using namespace sampleprof;
 static cl::opt<std::string> SampleProfileFile(
     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
 static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
     "sample-profile-max-propagate-iterations", cl::init(100),
     cl::desc("Maximum number of iterations to go through when propagating "
              "sample block/edge weights through the CFG."));
+
 static cl::opt<unsigned> SampleProfileRecordCoverage(
     "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
     cl::desc("Emit a warning if less than N% of records in the input profile "
              "are matched to the IR."));
+
 static cl::opt<unsigned> SampleProfileSampleCoverage(
     "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
     cl::desc("Emit a warning if less than N% of samples in the input profile "
              "are matched to the IR."));
+
 static cl::opt<double> SampleProfileHotThreshold(
     "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
     cl::desc("Inlined functions that account for more than N% of all samples "
              "collected in the parent function, will be inlined again."));
 
 namespace {
-typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap;
-typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap;
-typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
-typedef DenseMap<Edge, uint64_t> EdgeWeightMap;
-typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>
-    BlockEdgeMap;
+
+using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
+using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
+using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+using BlockEdgeMap =
+    DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
 
 class SampleCoverageTracker {
 public:
-  SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {}
+  SampleCoverageTracker() = default;
 
   bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
                        uint32_t Discriminator, uint64_t Samples);
@@ -103,15 +134,16 @@ public:
   unsigned countBodyRecords(const FunctionSamples *FS) const;
   uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
   uint64_t countBodySamples(const FunctionSamples *FS) const;
+
   void clear() {
     SampleCoverage.clear();
     TotalUsedSamples = 0;
   }
 
 private:
-  typedef std::map<LineLocation, unsigned> BodySampleCoverageMap;
-  typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap>
-      FunctionSamplesCoverageMap;
+  using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
+  using FunctionSamplesCoverageMap =
+      DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
 
   /// Coverage map for sampling records.
   ///
@@ -135,7 +167,7 @@ private:
   /// and all the inlined callsites. Strictly, we should have a map of counters
   /// keyed by FunctionSamples pointers, but these stats are cleared after
   /// every function, so we just need to keep a single counter.
-  uint64_t TotalUsedSamples;
+  uint64_t TotalUsedSamples = 0;
 };
 
 /// \brief Sample profile pass.
@@ -145,29 +177,31 @@ private:
 /// profile information found in that file.
 class SampleProfileLoader {
 public:
-  SampleProfileLoader(StringRef Name = SampleProfileFile)
-      : DT(nullptr), PDT(nullptr), LI(nullptr), ACT(nullptr), Reader(),
-        Samples(nullptr), Filename(Name), ProfileIsValid(false),
-        TotalCollectedSamples(0) {}
+  SampleProfileLoader(
+      StringRef Name, bool IsThinLTOPreLink,
+      std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+      std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
+      : GetAC(GetAssumptionCache), GetTTI(GetTargetTransformInfo),
+        Filename(Name), IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   bool doInitialization(Module &M);
-  bool runOnModule(Module &M);
-  void setACT(AssumptionCacheTracker *A) { ACT = A; }
+  bool runOnModule(Module &M, ModuleAnalysisManager *AM);
 
   void dump() { Reader->dump(); }
 
 protected:
-  bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
   unsigned getFunctionLoc(Function &F);
   bool emitAnnotations(Function &F);
   ErrorOr<uint64_t> getInstWeight(const Instruction &I);
   ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
   const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
   std::vector<const FunctionSamples *>
-  findIndirectCallFunctionSamples(const Instruction &I) const;
+  findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
+  bool inlineCallInstruction(Instruction *I);
   bool inlineHotFunctions(Function &F,
-                          DenseSet<GlobalValue::GUID> &ImportGUIDs);
+                          DenseSet<GlobalValue::GUID> &InlinedGUIDs);
   void printEdgeWeight(raw_ostream &OS, Edge E);
   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -222,7 +256,8 @@ protected:
   std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
   std::unique_ptr<LoopInfo> LI;
 
-  AssumptionCacheTracker *ACT;
+  std::function<AssumptionCache &(Function &)> GetAC;
+  std::function<TargetTransformInfo &(Function &)> GetTTI;
 
   /// \brief Predecessors for each basic block in the CFG.
   BlockEdgeMap Predecessors;
@@ -236,19 +271,28 @@ protected:
   std::unique_ptr<SampleProfileReader> Reader;
 
   /// \brief Samples collected for the body of this function.
-  FunctionSamples *Samples;
+  FunctionSamples *Samples = nullptr;
 
   /// \brief Name of the profile file to load.
   std::string Filename;
 
   /// \brief Flag indicating whether the profile input loaded successfully.
-  bool ProfileIsValid;
+  bool ProfileIsValid = false;
+
+  /// \brief Flag indicating if the pass is invoked in ThinLTO compile phase.
+  ///
+  /// In this phase, in annotation, we should not promote indirect calls.
+  /// Instead, we will mark GUIDs that needs to be annotated to the function.
+  bool IsThinLTOPreLink;
 
   /// \brief Total number of samples collected in this profile.
   ///
   /// This is the sum of all the samples collected in all the functions executed
   /// at runtime.
-  uint64_t TotalCollectedSamples;
+  uint64_t TotalCollectedSamples = 0;
+
+  /// \brief Optimization Remark Emitter used to emit diagnostic remarks.
+  OptimizationRemarkEmitter *ORE = nullptr;
 };
 
 class SampleProfileLoaderLegacyPass : public ModulePass {
@@ -256,8 +300,15 @@ public:
   // Class identification, replacement for typeinfo
   static char ID;
 
-  SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile)
-      : ModulePass(ID), SampleLoader(Name) {
+  SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
+                                bool IsThinLTOPreLink = false)
+      : ModulePass(ID), SampleLoader(Name, IsThinLTOPreLink,
+                                     [&](Function &F) -> AssumptionCache & {
+                                       return ACT->getAssumptionCache(F);
+                                     },
+                                     [&](Function &F) -> TargetTransformInfo & {
+                                       return TTIWP->getTTI(F);
+                                     }) {
     initializeSampleProfileLoaderLegacyPassPass(
         *PassRegistry::getPassRegistry());
   }
@@ -267,17 +318,23 @@ public:
   bool doInitialization(Module &M) override {
     return SampleLoader.doInitialization(M);
   }
+
   StringRef getPassName() const override { return "Sample profile pass"; }
   bool runOnModule(Module &M) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
   }
 
 private:
   SampleProfileLoader SampleLoader;
+  AssumptionCacheTracker *ACT = nullptr;
+  TargetTransformInfoWrapperPass *TTIWP = nullptr;
 };
 
+} // end anonymous namespace
+
 /// Return true if the given callsite is hot wrt to its caller.
 ///
 /// Functions that were inlined in the original binary will be represented
@@ -292,8 +349,8 @@ private:
 ///
 /// If that fraction is larger than the default given by
 /// SampleProfileHotThreshold, the callsite will be inlined again.
-bool callsiteIsHot(const FunctionSamples *CallerFS,
-                   const FunctionSamples *CallsiteFS) {
+static bool callsiteIsHot(const FunctionSamples *CallerFS,
+                          const FunctionSamples *CallsiteFS) {
   if (!CallsiteFS)
     return false; // The callsite was not inlined in the original binary.
 
@@ -309,7 +366,6 @@ bool callsiteIsHot(const FunctionSamples *CallerFS,
       (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
   return PercentSamples >= SampleProfileHotThreshold;
 }
-}
 
 /// Mark as used the sample record for the given function samples at
 /// (LineOffset, Discriminator).
@@ -423,6 +479,7 @@ unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
          0xffff;
 }
 
+#ifndef NDEBUG
 /// \brief Print the weight of edge \p E on stream \p OS.
 ///
 /// \param OS  Stream to emit the output to.
@@ -453,6 +510,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
   uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
   OS << "weight[" << BB->getName() << "]: " << W << "\n";
 }
+#endif
 
 /// \brief Get the weight for an instruction.
 ///
@@ -480,10 +538,12 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
   if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst))
     return std::error_code();
 
-  // If a call/invoke instruction is inlined in profile, but not inlined here,
+  // If a direct call/invoke instruction is inlined in profile
+  // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
   // it means that the inlined callsite has no sample, thus the call
   // instruction should have 0 count.
   if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+      !ImmutableCallSite(&Inst).isIndirectCall() &&
       findCalleeFunctionSamples(Inst))
     return 0;
 
@@ -495,13 +555,18 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
     bool FirstMark =
         CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
     if (FirstMark) {
-      const Function *F = Inst.getParent()->getParent();
-      LLVMContext &Ctx = F->getContext();
-      emitOptimizationRemark(
-          Ctx, DEBUG_TYPE, *F, DLoc,
-          Twine("Applied ") + Twine(*R) +
-              " samples from profile (offset: " + Twine(LineOffset) +
-              ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")");
+      ORE->emit([&]() {
+        OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
+        Remark << "Applied " << ore::NV("NumSamples", *R);
+        Remark << " samples from profile (offset: ";
+        Remark << ore::NV("LineOffset", LineOffset);
+        if (Discriminator) {
+          Remark << ".";
+          Remark << ore::NV("Discriminator", Discriminator);
+        }
+        Remark << ")";
+        return Remark;
+      });
     }
     DEBUG(dbgs() << "    " << DLoc.getLine() << "."
                  << DIL->getBaseDiscriminator() << ":" << Inst
@@ -588,10 +653,11 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
 }
 
 /// Returns a vector of FunctionSamples that are the indirect call targets
-/// of \p Inst. The vector is sorted by the total number of samples.
+/// of \p Inst. The vector is sorted by the total number of samples. Stores
+/// the total call count of the indirect call in \p Sum.
 std::vector<const FunctionSamples *>
 SampleProfileLoader::findIndirectCallFunctionSamples(
-    const Instruction &Inst) const {
+    const Instruction &Inst, uint64_t &Sum) const {
   const DILocation *DIL = Inst.getDebugLoc();
   std::vector<const FunctionSamples *> R;
 
@@ -603,16 +669,25 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
   if (FS == nullptr)
     return R;
 
+  uint32_t LineOffset = getOffset(DIL);
+  uint32_t Discriminator = DIL->getBaseDiscriminator();
+
+  auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
+  Sum = 0;
+  if (T)
+    for (const auto &T_C : T.get())
+      Sum += T_C.second;
   if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
           LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
-    if (M->size() == 0)
+    if (M->empty())
       return R;
     for (const auto &NameFS : *M) {
+      Sum += NameFS.second.getEntrySamples();
       R.push_back(&NameFS.second);
     }
     std::sort(R.begin(), R.end(),
               [](const FunctionSamples *L, const FunctionSamples *R) {
-                return L->getTotalSamples() > R->getTotalSamples();
+                return L->getEntrySamples() > R->getEntrySamples();
               });
   }
   return R;
@@ -650,6 +725,39 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
   return FS;
 }
 
+bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
+  assert(isa<CallInst>(I) || isa<InvokeInst>(I));
+  CallSite CS(I);
+  Function *CalledFunction = CS.getCalledFunction();
+  assert(CalledFunction);
+  DebugLoc DLoc = I->getDebugLoc();
+  BasicBlock *BB = I->getParent();
+  InlineParams Params = getInlineParams();
+  Params.ComputeFullInlineCost = true;
+  // Checks if there is anything in the reachable portion of the callee at
+  // this callsite that makes this inlining potentially illegal. Need to
+  // set ComputeFullInlineCost, otherwise getInlineCost may return early
+  // when cost exceeds threshold without checking all IRs in the callee.
+  // The acutal cost does not matter because we only checks isNever() to
+  // see if it is legal to inline the callsite.
+  InlineCost Cost = getInlineCost(CS, Params, GetTTI(*CalledFunction), GetAC,
+                                  None, nullptr, nullptr);
+  if (Cost.isNever()) {
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB)
+              << "incompatible inlining");
+    return false;
+  }
+  InlineFunctionInfo IFI(nullptr, &GetAC);
+  if (InlineFunction(CS, IFI)) {
+    // The call to InlineFunction erases I, so we can't pass it here.
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB)
+              << "inlined hot callee '" << ore::NV("Callee", CalledFunction)
+              << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
+    return true;
+  }
+  return false;
+}
+
 /// \brief Iteratively inline hot callsites of a function.
 ///
 /// Iteratively traverse all callsites of the function \p F, and find if
@@ -659,17 +767,14 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
 /// it to direct call. Each indirect call is limited with a single target.
 ///
 /// \param F function to perform iterative inlining.
-/// \param ImportGUIDs a set to be updated to include all GUIDs that come
-///     from a different module but inlined in the profiled binary.
+/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
+///     inlined in the profiled binary.
 ///
 /// \returns True if there is any inline happened.
 bool SampleProfileLoader::inlineHotFunctions(
-    Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
+    Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
   DenseSet<Instruction *> PromotedInsns;
   bool Changed = false;
-  LLVMContext &Ctx = F.getContext();
-  std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
-      Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); };
   while (true) {
     bool LocalChanged = false;
     SmallVector<Instruction *, 10> CIS;
@@ -690,57 +795,59 @@ bool SampleProfileLoader::inlineHotFunctions(
       }
     }
     for (auto I : CIS) {
-      InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
       Function *CalledFunction = CallSite(I).getCalledFunction();
       // Do not inline recursive calls.
       if (CalledFunction == &F)
         continue;
-      Instruction *DI = I;
-      if (!CalledFunction && !PromotedInsns.count(I) &&
-          CallSite(I).isIndirectCall())
-        for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
+      if (CallSite(I).isIndirectCall()) {
+        if (PromotedInsns.count(I))
+          continue;
+        uint64_t Sum;
+        for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
+          if (IsThinLTOPreLink) {
+            FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+                                     Samples->getTotalSamples() *
+                                         SampleProfileHotThreshold / 100);
+            continue;
+          }
           auto CalleeFunctionName = FS->getName();
           // If it is a recursive call, we do not inline it as it could bloat
           // the code exponentially. There is way to better handle this, e.g.
           // clone the caller first, and inline the cloned caller if it is
-          // recursive. As llvm does not inline recursive calls, we will simply
-          // ignore it instead of handling it explicitly.
+          // recursive. As llvm does not inline recursive calls, we will
+          // simply ignore it instead of handling it explicitly.
           if (CalleeFunctionName == F.getName())
             continue;
+
           const char *Reason = "Callee function not available";
           auto R = SymbolMap.find(CalleeFunctionName);
-          if (R == SymbolMap.end())
-            continue;
-          CalledFunction = R->getValue();
-          if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
-            // The indirect target was promoted and inlined in the profile, as a
-            // result, we do not have profile info for the branch probability.
-            // We set the probability to 80% taken to indicate that the static
-            // call is likely taken.
-            DI = dyn_cast<Instruction>(
-                promoteIndirectCall(I, CalledFunction, 80, 100, false)
-                    ->stripPointerCasts());
+          if (R != SymbolMap.end() && R->getValue() &&
+              !R->getValue()->isDeclaration() &&
+              R->getValue()->getSubprogram() &&
+              isLegalToPromote(CallSite(I), R->getValue(), &Reason)) {
+            uint64_t C = FS->getEntrySamples();
+            Instruction *DI =
+                pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE);
+            Sum -= C;
             PromotedInsns.insert(I);
+            // If profile mismatches, we should not attempt to inline DI.
+            if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
+                inlineCallInstruction(DI))
+              LocalChanged = true;
           } else {
-            DEBUG(dbgs() << "\nFailed to promote indirect call to "
-                         << CalleeFunctionName << " because " << Reason
-                         << "\n");
-            continue;
+            DEBUG(dbgs()
+                  << "\nFailed to promote indirect call to "
+                  << CalleeFunctionName << " because " << Reason << "\n");
           }
         }
-      if (!CalledFunction || !CalledFunction->getSubprogram()) {
-        findCalleeFunctionSamples(*I)->findImportedFunctions(
-            ImportGUIDs, F.getParent(),
+      } else if (CalledFunction && CalledFunction->getSubprogram() &&
+                 !CalledFunction->isDeclaration()) {
+        if (inlineCallInstruction(I))
+          LocalChanged = true;
+      } else if (IsThinLTOPreLink) {
+        findCalleeFunctionSamples(*I)->findInlinedFunctions(
+            InlinedGUIDs, F.getParent(),
             Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
-        continue;
-      }
-      DebugLoc DLoc = I->getDebugLoc();
-      if (InlineFunction(CallSite(DI), IFI)) {
-        LocalChanged = true;
-        emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
-                               Twine("inlined hot callee '") +
-                                   CalledFunction->getName() + "' into '" +
-                                   F.getName() + "'");
       }
     }
     if (LocalChanged) {
@@ -1076,24 +1183,20 @@ void SampleProfileLoader::buildEdges(Function &F) {
   }
 }
 
-/// Sorts the CallTargetMap \p M by count in descending order and stores the
-/// sorted result in \p Sorted. Returns the total counts.
-static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
-                                const SampleRecord::CallTargetMap &M) {
-  Sorted.clear();
-  uint64_t Sum = 0;
-  for (auto I = M.begin(); I != M.end(); ++I) {
-    Sum += I->getValue();
-    Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
-  }
-  std::sort(Sorted.begin(), Sorted.end(),
+/// Returns the sorted CallTargetMap \p M by count in descending order.
+static SmallVector<InstrProfValueData, 2> SortCallTargets(
+    const SampleRecord::CallTargetMap &M) {
+  SmallVector<InstrProfValueData, 2> R;
+  for (auto I = M.begin(); I != M.end(); ++I)
+    R.push_back({Function::getGUID(I->getKey()), I->getValue()});
+  std::sort(R.begin(), R.end(),
             [](const InstrProfValueData &L, const InstrProfValueData &R) {
               if (L.Count == R.Count)
                 return L.Value > R.Value;
               else
                 return L.Count > R.Count;
             });
-  return Sum;
+  return R;
 }
 
 /// \brief Propagate weights into edges
@@ -1184,10 +1287,12 @@ void SampleProfileLoader::propagateWeights(Function &F) {
           if (!FS)
             continue;
           auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
-          if (!T || T.get().size() == 0)
+          if (!T || T.get().empty())
             continue;
-          SmallVector<InstrProfValueData, 2> SortedCallTargets;
-          uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+          SmallVector<InstrProfValueData, 2> SortedCallTargets =
+              SortCallTargets(T.get());
+          uint64_t Sum;
+          findIndirectCallFunctionSamples(I, Sum);
           annotateValueSite(*I.getParent()->getParent()->getParent(), I,
                             SortedCallTargets, Sum, IPVK_IndirectCallTarget,
                             SortedCallTargets.size());
@@ -1211,7 +1316,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
                  << ".\n");
     SmallVector<uint32_t, 4> Weights;
     uint32_t MaxWeight = 0;
-    DebugLoc MaxDestLoc;
+    Instruction *MaxDestInst;
     for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
       BasicBlock *Succ = TI->getSuccessor(I);
       Edge E = std::make_pair(BB, Succ);
@@ -1230,7 +1335,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
       if (Weight != 0) {
         if (Weight > MaxWeight) {
           MaxWeight = Weight;
-          MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc();
+          MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
         }
       }
     }
@@ -1243,15 +1348,13 @@ void SampleProfileLoader::propagateWeights(Function &F) {
     // weights, the second pass does not need to set it.
     if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
       DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
-      TI->setMetadata(llvm::LLVMContext::MD_prof,
+      TI->setMetadata(LLVMContext::MD_prof,
                       MDB.createBranchWeights(Weights));
-      emitOptimizationRemark(
-          Ctx, DEBUG_TYPE, F, MaxDestLoc,
-          Twine("most popular destination for conditional branches at ") +
-              ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" +
-                                   Twine(BranchLoc.getLine()) + ":" +
-                                   Twine(BranchLoc.getCol()))
-                           : Twine("<UNKNOWN LOCATION>")));
+      ORE->emit([&]() {
+        return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
+               << "most popular destination for conditional branches at "
+               << ore::NV("CondBranchesLoc", BranchLoc);
+      });
     } else {
       DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
     }
@@ -1351,18 +1454,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
   DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
                << ": " << getFunctionLoc(F) << "\n");
 
-  DenseSet<GlobalValue::GUID> ImportGUIDs;
-  Changed |= inlineHotFunctions(F, ImportGUIDs);
+  DenseSet<GlobalValue::GUID> InlinedGUIDs;
+  Changed |= inlineHotFunctions(F, InlinedGUIDs);
 
   // Compute basic block weights.
   Changed |= computeBlockWeights(F);
 
   if (Changed) {
     // Add an entry count to the function using the samples gathered at the
-    // function entry. Also sets the GUIDs that comes from a different
-    // module but inlined in the profiled binary. This is aiming at making
-    // the IR match the profiled binary before annotation.
-    F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
+    // function entry.
+    // Sets the GUIDs that are inlined in the profiled binary. This is used
+    // for ThinLink to make correct liveness analysis, and also make the IR
+    // match the profiled binary before annotation.
+    F.setEntryCount(Samples->getHeadSamples() + 1, &InlinedGUIDs);
 
     // Compute dominance and loop info needed for propagation.
     computeDominanceAndLoopInfo(F);
@@ -1404,9 +1508,11 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
 }
 
 char SampleProfileLoaderLegacyPass::ID = 0;
+
 INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
                       "Sample Profile loader", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
                     "Sample Profile loader", false, false)
 
@@ -1431,7 +1537,7 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
   return new SampleProfileLoaderLegacyPass(Name);
 }
 
-bool SampleProfileLoader::runOnModule(Module &M) {
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
   if (!ProfileIsValid)
     return false;
 
@@ -1463,7 +1569,7 @@ bool SampleProfileLoader::runOnModule(Module &M) {
   for (auto &F : M)
     if (!F.isDeclaration()) {
       clearFunctionData();
-      retval |= runOnFunction(F);
+      retval |= runOnFunction(F, AM);
     }
   if (M.getProfileSummary() == nullptr)
     M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
@@ -1471,13 +1577,23 @@ bool SampleProfileLoader::runOnModule(Module &M) {
 }
 
 bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
-  // FIXME: pass in AssumptionCache correctly for the new pass manager.
-  SampleLoader.setACT(&getAnalysis<AssumptionCacheTracker>());
-  return SampleLoader.runOnModule(M);
+  ACT = &getAnalysis<AssumptionCacheTracker>();
+  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+  return SampleLoader.runOnModule(M, nullptr);
 }
 
-bool SampleProfileLoader::runOnFunction(Function &F) {
+bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
   F.setEntryCount(0);
+  std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+  if (AM) {
+    auto &FAM =
+        AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
+            .getManager();
+    ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  } else {
+    OwnedORE = make_unique<OptimizationRemarkEmitter>(&F);
+    ORE = OwnedORE.get();
+  }
   Samples = Reader->getSamplesFor(F);
   if (Samples && !Samples->empty())
     return emitAnnotations(F);
@@ -1486,13 +1602,23 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
 
 PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
                                                ModuleAnalysisManager &AM) {
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+    return FAM.getResult<AssumptionAnalysis>(F);
+  };
+  auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
+    return FAM.getResult<TargetIRAnalysis>(F);
+  };
 
   SampleProfileLoader SampleLoader(
-      ProfileFileName.empty() ? SampleProfileFile : ProfileFileName);
+      ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
+      IsThinLTOPreLink, GetAssumptionCache, GetTTI);
 
   SampleLoader.doInitialization(M);
 
-  if (!SampleLoader.runOnModule(M))
+  if (!SampleLoader.runOnModule(M, &AM))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 8ef6bb652309..945133074059 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -19,7 +19,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
@@ -40,9 +39,17 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
       continue;
 
     auto Name = ExportGV.getName();
-    GlobalValue *ImportGV = ImportM.getNamedValue(Name);
-    if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
-      continue;
+    GlobalValue *ImportGV = nullptr;
+    if (!PromoteExtra.count(&ExportGV)) {
+      ImportGV = ImportM.getNamedValue(Name);
+      if (!ImportGV)
+        continue;
+      ImportGV->removeDeadConstantUsers();
+      if (ImportGV->use_empty()) {
+        ImportGV->eraseFromParent();
+        continue;
+      }
+    }
 
     std::string NewName = (Name + ModuleId).str();
 
@@ -141,7 +148,9 @@ void simplifyExternals(Module &M) {
       continue;
     }
 
-    if (!F.isDeclaration() || F.getFunctionType() == EmptyFT)
+    if (!F.isDeclaration() || F.getFunctionType() == EmptyFT ||
+        // Changing the type of an intrinsic may invalidate the IR.
+        F.getName().startswith("llvm."))
       continue;
 
     Function *NewF =
@@ -376,15 +385,14 @@ void splitAndWriteThinLTOBitcode(
   W.writeStrtab();
   OS << Buffer;
 
-  // If a minimized bitcode module was requested for the thin link,
-  // strip the debug info (the merged module was already stripped above)
-  // and write it to the given OS.
+  // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by thin link will be written in the
+  // given OS (the merged module will be written as usual).
   if (ThinLinkOS) {
     Buffer.clear();
     BitcodeWriter W2(Buffer);
     StripDebugInfo(M);
-    W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
-                   /*GenerateHash=*/false, &ModHash);
+    W2.writeThinLinkBitcode(&M, Index, ModHash);
     W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
                    &MergedMIndex);
     W2.writeSymtab();
@@ -420,14 +428,11 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
   ModuleHash ModHash = {{0}};
   WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
                      /*GenerateHash=*/true, &ModHash);
-  // If a minimized bitcode module was requested for the thin link,
-  // strip the debug info and write it to the given OS.
-  if (ThinLinkOS) {
-    StripDebugInfo(M);
-    WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
-                       Index,
-                       /*GenerateHash=*/false, &ModHash);
-  }
+  // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by thin link will be written in the
+  // given OS.
+  if (ThinLinkOS && Index)
+    WriteThinLinkBitcodeToFile(&M, *ThinLinkOS, *Index, ModHash);
 }
 
 class WriteThinLTOBitcode : public ModulePass {
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 00769cd63229..ec56f0cde25d 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -51,14 +51,13 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -275,18 +274,39 @@ struct VirtualCallSite {
   // of that field for details.
   unsigned *NumUnsafeUses;
 
-  void emitRemark(const Twine &OptName, const Twine &TargetName) {
+  void
+  emitRemark(const StringRef OptName, const StringRef TargetName,
+             function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
     Function *F = CS.getCaller();
-    emitOptimizationRemark(
-        F->getContext(), DEBUG_TYPE, *F,
-        CS.getInstruction()->getDebugLoc(),
-        OptName + ": devirtualized a call to " + TargetName);
+    DebugLoc DLoc = CS->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+
+    // In the new pass manager, we can request the optimization
+    // remark emitter pass on a per-function-basis, which the
+    // OREGetter will do for us.
+    // In the old pass manager, this is harder, so we just build
+    // a optimization remark emitter on the fly, when we need it.
+    std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+    OptimizationRemarkEmitter *ORE;
+    if (OREGetter)
+      ORE = &OREGetter(F);
+    else {
+      OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
+      ORE = OwnedORE.get();
+    }
+
+    using namespace ore;
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+              << NV("Optimization", OptName) << ": devirtualized a call to "
+              << NV("FunctionName", TargetName));
   }
 
-  void replaceAndErase(const Twine &OptName, const Twine &TargetName,
-                       bool RemarksEnabled, Value *New) {
+  void replaceAndErase(
+      const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+      Value *New) {
     if (RemarksEnabled)
-      emitRemark(OptName, TargetName);
+      emitRemark(OptName, TargetName, OREGetter);
     CS->replaceAllUsesWith(New);
     if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) {
       BranchInst::Create(II->getNormalDest(), CS.getInstruction());
@@ -383,6 +403,7 @@ struct DevirtModule {
   IntegerType *IntPtrTy;
 
   bool RemarksEnabled;
+  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
 
   MapVector<VTableSlot, VTableSlotInfo> CallSlots;
 
@@ -397,6 +418,7 @@ struct DevirtModule {
   std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
 
   DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+               function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
                ModuleSummaryIndex *ExportSummary,
                const ModuleSummaryIndex *ImportSummary)
       : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
@@ -405,7 +427,7 @@ struct DevirtModule {
         Int32Ty(Type::getInt32Ty(M.getContext())),
         Int64Ty(Type::getInt64Ty(M.getContext())),
         IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
-        RemarksEnabled(areRemarksEnabled()) {
+        RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
     assert(!(ExportSummary && ImportSummary));
   }
 
@@ -444,16 +466,23 @@ struct DevirtModule {
   std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
                             StringRef Name);
 
+  bool shouldExportConstantsAsAbsoluteSymbols();
+
   // This function is called during the export phase to create a symbol
   // definition containing information about the given vtable slot and list of
   // arguments.
   void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
                     Constant *C);
+  void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+                      uint32_t Const, uint32_t &Storage);
 
   // This function is called during the import phase to create a reference to
   // the symbol definition created during the export phase.
   Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
-                         StringRef Name, unsigned AbsWidth = 0);
+                         StringRef Name);
+  Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                           StringRef Name, IntegerType *IntTy,
+                           uint32_t Storage);
 
   void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
                             Constant *UniqueMemberAddr);
@@ -482,8 +511,9 @@ struct DevirtModule {
 
   // Lower the module using the action and summary passed as command line
   // arguments. For testing purposes only.
-  static bool runForTesting(Module &M,
-                            function_ref<AAResults &(Function &)> AARGetter);
+  static bool runForTesting(
+      Module &M, function_ref<AAResults &(Function &)> AARGetter,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 };
 
 struct WholeProgramDevirt : public ModulePass {
@@ -508,9 +538,14 @@ struct WholeProgramDevirt : public ModulePass {
   bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
+
+    auto OREGetter = function_ref<OptimizationRemarkEmitter &(Function *)>();
+
     if (UseCommandLine)
-      return DevirtModule::runForTesting(M, LegacyAARGetter(*this));
-    return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary)
+      return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter);
+
+    return DevirtModule(M, LegacyAARGetter(*this), OREGetter, ExportSummary,
+                        ImportSummary)
         .run();
   }
 
@@ -542,13 +577,17 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
   auto AARGetter = [&](Function &F) -> AAResults & {
     return FAM.getResult<AAManager>(F);
   };
-  if (!DevirtModule(M, AARGetter, nullptr, nullptr).run())
+  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  };
+  if (!DevirtModule(M, AARGetter, OREGetter, nullptr, nullptr).run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
 
 bool DevirtModule::runForTesting(
-    Module &M, function_ref<AAResults &(Function &)> AARGetter) {
+    Module &M, function_ref<AAResults &(Function &)> AARGetter,
+    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
   ModuleSummaryIndex Summary;
 
   // Handle the command-line summary arguments. This code is for testing
@@ -566,7 +605,7 @@ bool DevirtModule::runForTesting(
 
   bool Changed =
       DevirtModule(
-          M, AARGetter,
+          M, AARGetter, OREGetter,
           ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
           ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
           .run();
@@ -684,7 +723,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
   auto Apply = [&](CallSiteInfo &CSInfo) {
     for (auto &&VCallSite : CSInfo.CallSites) {
       if (RemarksEnabled)
-        VCallSite.emitRemark("single-impl", TheFn->getName());
+        VCallSite.emitRemark("single-impl", TheFn->getName(), OREGetter);
       VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
           TheFn, VCallSite.CS.getCalledValue()->getType()));
       // This use is no longer unsafe.
@@ -724,9 +763,24 @@ bool DevirtModule::trySingleImplDevirt(
   // to make it visible to thin LTO objects. We can only get here during the
   // ThinLTO export phase.
   if (TheFn->hasLocalLinkage()) {
+    std::string NewName = (TheFn->getName() + "$merged").str();
+
+    // Since we are renaming the function, any comdats with the same name must
+    // also be renamed. This is required when targeting COFF, as the comdat name
+    // must match one of the names of the symbols in the comdat.
+    if (Comdat *C = TheFn->getComdat()) {
+      if (C->getName() == TheFn->getName()) {
+        Comdat *NewC = M.getOrInsertComdat(NewName);
+        NewC->setSelectionKind(C->getSelectionKind());
+        for (GlobalObject &GO : M.global_objects())
+          if (GO.getComdat() == C)
+            GO.setComdat(NewC);
+      }
+    }
+
     TheFn->setLinkage(GlobalValue::ExternalLinkage);
     TheFn->setVisibility(GlobalValue::HiddenVisibility);
-    TheFn->setName(TheFn->getName() + "$merged");
+    TheFn->setName(NewName);
   }
 
   Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
@@ -769,7 +823,7 @@ void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
                                          uint64_t TheRetVal) {
   for (auto Call : CSInfo.CallSites)
     Call.replaceAndErase(
-        "uniform-ret-val", FnName, RemarksEnabled,
+        "uniform-ret-val", FnName, RemarksEnabled, OREGetter,
         ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
   CSInfo.markDevirt();
 }
@@ -808,6 +862,12 @@ std::string DevirtModule::getGlobalName(VTableSlot Slot,
   return OS.str();
 }
 
+bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() {
+  Triple T(M.getTargetTriple());
+  return (T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) &&
+         T.getObjectFormat() == Triple::ELF;
+}
+
 void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
                                 StringRef Name, Constant *C) {
   GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
@@ -815,27 +875,55 @@ void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
   GA->setVisibility(GlobalValue::HiddenVisibility);
 }
 
+void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                                  StringRef Name, uint32_t Const,
+                                  uint32_t &Storage) {
+  if (shouldExportConstantsAsAbsoluteSymbols()) {
+    exportGlobal(
+        Slot, Args, Name,
+        ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy));
+    return;
+  }
+
+  Storage = Const;
+}
+
 Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
-                                     StringRef Name, unsigned AbsWidth) {
+                                     StringRef Name) {
   Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
   auto *GV = dyn_cast<GlobalVariable>(C);
+  if (GV)
+    GV->setVisibility(GlobalValue::HiddenVisibility);
+  return C;
+}
+
+Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                                       StringRef Name, IntegerType *IntTy,
+                                       uint32_t Storage) {
+  if (!shouldExportConstantsAsAbsoluteSymbols())
+    return ConstantInt::get(IntTy, Storage);
+
+  Constant *C = importGlobal(Slot, Args, Name);
+  auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+  C = ConstantExpr::getPtrToInt(C, IntTy);
+
   // We only need to set metadata if the global is newly created, in which
   // case it would not have hidden visibility.
-  if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+  if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
     return C;
 
-  GV->setVisibility(GlobalValue::HiddenVisibility);
   auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
     auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
     auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
     GV->setMetadata(LLVMContext::MD_absolute_symbol,
                     MDNode::get(M.getContext(), {MinC, MaxC}));
   };
+  unsigned AbsWidth = IntTy->getBitWidth();
   if (AbsWidth == IntPtrTy->getBitWidth())
     SetAbsRange(~0ull, ~0ull); // Full set.
-  else if (AbsWidth)
+  else
     SetAbsRange(0, 1ull << AbsWidth);
-  return GV;
+  return C;
 }
 
 void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
@@ -843,10 +931,12 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
                                         Constant *UniqueMemberAddr) {
   for (auto &&Call : CSInfo.CallSites) {
     IRBuilder<> B(Call.CS.getInstruction());
-    Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
-                              Call.VTable, UniqueMemberAddr);
+    Value *Cmp =
+        B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+                     B.CreateBitCast(Call.VTable, Int8PtrTy), UniqueMemberAddr);
     Cmp = B.CreateZExt(Cmp, Call.CS->getType());
-    Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp);
+    Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
+                         Cmp);
   }
   CSInfo.markDevirt();
 }
@@ -909,17 +999,19 @@ void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
   for (auto Call : CSInfo.CallSites) {
     auto *RetType = cast<IntegerType>(Call.CS.getType());
     IRBuilder<> B(Call.CS.getInstruction());
-    Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
+    Value *Addr =
+        B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
     if (RetType->getBitWidth() == 1) {
       Value *Bits = B.CreateLoad(Addr);
       Value *BitsAndBit = B.CreateAnd(Bits, Bit);
       auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
       Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
-                           IsBitSet);
+                           OREGetter, IsBitSet);
     } else {
       Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
       Value *Val = B.CreateLoad(RetType, ValAddr);
-      Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val);
+      Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled,
+                           OREGetter, Val);
     }
   }
   CSInfo.markDevirt();
@@ -1007,18 +1099,18 @@ bool DevirtModule::tryVirtualConstProp(
       for (auto &&Target : TargetsForSlot)
         Target.WasDevirt = true;
 
-    Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
-    Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
 
     if (CSByConstantArg.second.isExported()) {
       ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
-      exportGlobal(Slot, CSByConstantArg.first, "byte",
-                   ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy));
-      exportGlobal(Slot, CSByConstantArg.first, "bit",
-                   ConstantExpr::getIntToPtr(BitConst, Int8PtrTy));
+      exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte,
+                     ResByArg->Byte);
+      exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit,
+                     ResByArg->Bit);
     }
 
     // Rewrite each call to a load from OffsetByte/OffsetBit.
+    Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+    Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
     applyVirtualConstProp(CSByConstantArg.second,
                           TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
   }
@@ -1112,8 +1204,7 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
       Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
       if (SeenPtrs.insert(Ptr).second) {
         for (DevirtCallSite Call : DevirtCalls) {
-          CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0),
-                                                       Call.CS, nullptr);
+          CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, nullptr);
         }
       }
     }
@@ -1250,10 +1341,10 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
       break;
     }
     case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
-      Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32);
-      Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty);
-      Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8);
-      Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty);
+      Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte",
+                                      Int32Ty, ResByArg.Byte);
+      Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
+                                     ResByArg.Bit);
       applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
     }
     default:
@@ -1406,9 +1497,24 @@ bool DevirtModule::run() {
     // Generate remarks for each devirtualized function.
     for (const auto &DT : DevirtTargets) {
       Function *F = DT.second;
-      DISubprogram *SP = F->getSubprogram();
-      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP,
-                             Twine("devirtualized ") + F->getName());
+
+      // In the new pass manager, we can request the optimization
+      // remark emitter pass on a per-function-basis, which the
+      // OREGetter will do for us.
+      // In the old pass manager, this is harder, so we just build
+      // a optimization remark emitter on the fly, when we need it.
+      std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+      OptimizationRemarkEmitter *ORE;
+      if (OREGetter)
+        ORE = &OREGetter(F);
+      else {
+        OwnedORE = make_unique<OptimizationRemarkEmitter>(F);
+        ORE = OwnedORE.get();
+      }
+
+      using namespace ore;
+      ORE->emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+                << "devirtualized " << NV("FunctionName", F->getName()));
     }
   }
author	Dimitry Andric <dim@FreeBSD.org>	2017-12-18 20:10:56 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2017-12-18 20:10:56 +0000
commit	044eb2f6afba375a914ac9d8024f8f5142bb912e (patch)
tree	1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/IPO
parent	eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b (diff)