diff options
Diffstat (limited to 'llvm/include')
54 files changed, 1597 insertions, 422 deletions
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index d170eff17951..f2183ff52bfb 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -2377,10 +2377,21 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit); * * @{ */ + +/** Deprecated: Use LLVMAddAlias2 instead. */ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name); /** + * Add a GlobalAlias with the given value type, address space and aliasee. + * + * @see llvm::GlobalAlias::create() + */ +LLVMValueRef LLVMAddAlias2(LLVMModuleRef M, LLVMTypeRef ValueTy, + unsigned AddrSpace, LLVMValueRef Aliasee, + const char *Name); + +/** * Obtain a GlobalAlias value from a Module by its name. * * The returned value corresponds to a llvm::GlobalAlias value. diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 595cd94b6b8f..c2660502a419 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -1458,10 +1458,8 @@ public: /// uint64_t. The bitwidth must be <= 64 or the value must fit within a /// uint64_t. Otherwise an assertion will result. 
uint64_t getZExtValue() const { - if (isSingleWord()) { - assert(BitWidth && "zero width values not allowed"); + if (isSingleWord()) return U.VAL; - } assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); return U.pVal[0]; } diff --git a/llvm/include/llvm/ADT/SCCIterator.h b/llvm/include/llvm/ADT/SCCIterator.h index 8a7c0a78a0fc..ad35e09f0f74 100644 --- a/llvm/include/llvm/ADT/SCCIterator.h +++ b/llvm/include/llvm/ADT/SCCIterator.h @@ -28,6 +28,10 @@ #include <cassert> #include <cstddef> #include <iterator> +#include <queue> +#include <set> +#include <unordered_map> +#include <unordered_set> #include <vector> namespace llvm { @@ -234,6 +238,135 @@ template <class T> scc_iterator<T> scc_end(const T &G) { return scc_iterator<T>::end(G); } +/// Sort the nodes of a directed SCC in the decreasing order of the edge +/// weights. The instantiating GraphT type should have weighted edge type +/// declared in its graph traits in order to use this iterator. +/// +/// This is implemented using Kruskal's minimal spanning tree algorithm followed +/// by a BFS walk. First a maximum spanning tree (forest) is built based on all +/// edges within the SCC collection. Then a BFS walk is initiated on tree nodes +/// that do not have a predecessor. Finally, the BFS order computed is the +/// traversal order of the nodes of the SCC. Such order ensures that +/// high-weighted edges are visited first during the traversal. +template <class GraphT, class GT = GraphTraits<GraphT>> +class scc_member_iterator { + using NodeType = typename GT::NodeType; + using EdgeType = typename GT::EdgeType; + using NodesType = std::vector<NodeType *>; + + // Auxiliary node information used during the MST calculation. + struct NodeInfo { + NodeInfo *Group = this; + uint32_t Rank = 0; + bool Visited = true; + }; + + // Find the root group of the node and compress the path from node to the + // root. 
+ NodeInfo *find(NodeInfo *Node) { + if (Node->Group != Node) + Node->Group = find(Node->Group); + return Node->Group; + } + + // Union the source and target node into the same group and return true. + // Returns false if they are already in the same group. + bool unionGroups(const EdgeType *Edge) { + NodeInfo *G1 = find(&NodeInfoMap[Edge->Source]); + NodeInfo *G2 = find(&NodeInfoMap[Edge->Target]); + + // If the edge forms a cycle, do not add it to MST + if (G1 == G2) + return false; + + // Make the smaller rank tree a direct child of the root of the high rank tree. + if (G1->Rank < G2->Rank) + G1->Group = G2; + else { + G2->Group = G1; + // If the ranks are the same, increment root of one tree by one. + if (G1->Rank == G2->Rank) + G2->Rank++; + } + return true; + } + + std::unordered_map<NodeType *, NodeInfo> NodeInfoMap; + NodesType Nodes; + +public: + scc_member_iterator(const NodesType &InputNodes); + + NodesType &operator*() { return Nodes; } +}; + +template <class GraphT, class GT> +scc_member_iterator<GraphT, GT>::scc_member_iterator( + const NodesType &InputNodes) { + if (InputNodes.size() <= 1) { + Nodes = InputNodes; + return; + } + + // Initialize auxiliary node information. + NodeInfoMap.clear(); + for (auto *Node : InputNodes) { + // This is specifically used to construct a `NodeInfo` object in place. An + // insert operation will involve a copy construction which invalidates the + // initial value of the `Group` field which should be `this`. + (void)NodeInfoMap[Node].Group; + } + + // Sort edges by weights. + struct EdgeComparer { + bool operator()(const EdgeType *L, const EdgeType *R) const { + return L->Weight > R->Weight; + } + }; + + std::multiset<const EdgeType *, EdgeComparer> SortedEdges; + for (auto *Node : InputNodes) { + for (auto &Edge : Node->Edges) { + if (NodeInfoMap.count(Edge.Target)) + SortedEdges.insert(&Edge); + } + } + + // Traverse all the edges and compute the Maximum Weight Spanning Tree + // using Kruskal's algorithm. 
+ std::unordered_set<const EdgeType *> MSTEdges; + for (auto *Edge : SortedEdges) { + if (unionGroups(Edge)) + MSTEdges.insert(Edge); + } + + // Do BFS on MST, starting from nodes that have no incoming edge. These nodes + // are "roots" of the MST forest. This ensures that nodes are visited before + // their descendants are, thus ensuring hot edges are processed before cold + // edges, based on how MST is computed. + for (const auto *Edge : MSTEdges) + NodeInfoMap[Edge->Target].Visited = false; + + std::queue<NodeType *> Queue; + for (auto &Node : NodeInfoMap) + if (Node.second.Visited) + Queue.push(Node.first); + + while (!Queue.empty()) { + auto *Node = Queue.front(); + Queue.pop(); + Nodes.push_back(Node); + for (auto &Edge : Node->Edges) { + if (MSTEdges.count(&Edge) && !NodeInfoMap[Edge.Target].Visited) { + NodeInfoMap[Edge.Target].Visited = true; + Queue.push(Edge.Target); + } + } + } + + assert(InputNodes.size() == Nodes.size() && "missing nodes in MST"); + std::reverse(Nodes.begin(), Nodes.end()); +} } // end namespace llvm #endif // LLVM_ADT_SCCITERATOR_H diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 48f15b02283a..f9b658ca960a 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1016,20 +1016,39 @@ public: private: std::tuple<RangeTs...> Ranges; - template <size_t... Ns> iterator begin_impl(std::index_sequence<Ns...>) { + template <size_t... Ns> + iterator begin_impl(std::index_sequence<Ns...>) { + return iterator(std::get<Ns>(Ranges)...); + } + template <size_t... Ns> + iterator begin_impl(std::index_sequence<Ns...>) const { + return iterator(std::get<Ns>(Ranges)...); + } template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) { return iterator(make_range(std::end(std::get<Ns>(Ranges)), std::end(std::get<Ns>(Ranges)))...); } + template <size_t... 
Ns> iterator end_impl(std::index_sequence<Ns...>) const { + return iterator(make_range(std::end(std::get<Ns>(Ranges)), + std::end(std::get<Ns>(Ranges)))...); + } public: concat_range(RangeTs &&... Ranges) : Ranges(std::forward<RangeTs>(Ranges)...) {} - iterator begin() { return begin_impl(std::index_sequence_for<RangeTs...>{}); } - iterator end() { return end_impl(std::index_sequence_for<RangeTs...>{}); } + iterator begin() { + return begin_impl(std::index_sequence_for<RangeTs...>{}); + } + iterator begin() const { + return begin_impl(std::index_sequence_for<RangeTs...>{}); + } + iterator end() { + return end_impl(std::index_sequence_for<RangeTs...>{}); + } + iterator end() const { + return end_impl(std::index_sequence_for<RangeTs...>{}); + } }; } // end namespace detail @@ -1977,10 +1996,16 @@ public: enumerator_iter<R> begin() { return enumerator_iter<R>(0, std::begin(TheRange)); } + enumerator_iter<R> begin() const { + return enumerator_iter<R>(0, std::begin(TheRange)); + } enumerator_iter<R> end() { return enumerator_iter<R>(std::end(TheRange)); } + enumerator_iter<R> end() const { + return enumerator_iter<R>(std::end(TheRange)); + } private: R TheRange; diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index c26dbc457949..ea4c0312e073 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -20,6 +20,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" @@ -50,6 +51,7 @@ enum class RecurKind { FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). + FMulAdd, ///< Fused multiply-add of floats (a * b + c). 
SelectICmp, ///< Integer select(icmp(),x,y) where one of (x,y) is loop ///< invariant SelectFCmp ///< Integer select(fcmp(),x,y) where one of (x,y) is loop @@ -260,6 +262,12 @@ public: SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi, Loop *L) const; + /// Returns true if the instruction is a call to the llvm.fmuladd intrinsic. + static bool isFMulAddIntrinsic(Instruction *I) { + return isa<IntrinsicInst>(I) && + cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd; + } + private: // The starting value of the recurrence. // It does not have to be zero! diff --git a/llvm/include/llvm/Analysis/Lint.h b/llvm/include/llvm/Analysis/Lint.h index 6eb637e72782..4ceae2d29f16 100644 --- a/llvm/include/llvm/Analysis/Lint.h +++ b/llvm/include/llvm/Analysis/Lint.h @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file defines lint interfaces that can be used for some sanity checking -// of input to the system, and for checking that transformations -// haven't done something bad. In contrast to the Verifier, the Lint checker -// checks for undefined behavior or constructions with likely unintended -// behavior. +// This file defines lint interfaces that can be used for some validation of +// input to the system, and for checking that transformations haven't done +// something bad. In contrast to the Verifier, the Lint checker checks for +// undefined behavior or constructions with likely unintended behavior. // // To see what specifically is checked, look at Lint.cpp // diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index a2260688e3d6..df50611832ce 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1378,6 +1378,8 @@ private: /// includes an exact count and a maximum count. 
/// class BackedgeTakenInfo { + friend class ScalarEvolution; + /// A list of computable exits and their not-taken counts. Loops almost /// never have more than one computable exit. SmallVector<ExitNotTakenInfo, 1> ExitNotTaken; @@ -1398,9 +1400,6 @@ private: /// True iff the backedge is taken either exactly Max or zero times. bool MaxOrZero = false; - /// SCEV expressions used in any of the ExitNotTakenInfo counts. - SmallPtrSet<const SCEV *, 4> Operands; - bool isComplete() const { return IsComplete; } const SCEV *getConstantMax() const { return ConstantMax; } @@ -1466,10 +1465,6 @@ private: /// Return true if the number of times this backedge is taken is either the /// value returned by getConstantMax or zero. bool isConstantMaxOrZero(ScalarEvolution *SE) const; - - /// Return true if any backedge taken count expressions refer to the given - /// subexpression. - bool hasOperand(const SCEV *S) const; }; /// Cache the backedge-taken count of the loops for this function as they @@ -1480,6 +1475,10 @@ private: /// function as they are computed. DenseMap<const Loop *, BackedgeTakenInfo> PredicatedBackedgeTakenCounts; + /// Loops whose backedge taken counts directly use this non-constant SCEV. + DenseMap<const SCEV *, SmallPtrSet<PointerIntPair<const Loop *, 1, bool>, 4>> + BECountUsers; + /// This map contains entries for all of the PHI instructions that we /// attempt to compute constant evolutions for. This allows us to avoid /// potentially expensive recomputation of these properties. An instruction @@ -1492,6 +1491,11 @@ private: DenseMap<const SCEV *, SmallVector<std::pair<const Loop *, const SCEV *>, 2>> ValuesAtScopes; + /// Reverse map for invalidation purposes: Stores of which SCEV and which + /// loop this is the value-at-scope of. + DenseMap<const SCEV *, SmallVector<std::pair<const Loop *, const SCEV *>, 2>> + ValuesAtScopesUsers; + /// Memoized computeLoopDisposition results. 
DenseMap<const SCEV *, SmallVector<PointerIntPair<const Loop *, 2, LoopDisposition>, 2>> @@ -1616,11 +1620,6 @@ private: /// SCEV+Loop pair. const SCEV *computeSCEVAtScope(const SCEV *S, const Loop *L); - /// This looks up computed SCEV values for all instructions that depend on - /// the given instruction and removes them from the ValueExprMap map if they - /// reference SymName. This is used during PHI resolution. - void forgetSymbolicName(Instruction *I, const SCEV *SymName); - /// Return the BackedgeTakenInfo for the given loop, lazily computing new /// values if the loop hasn't been analyzed yet. The returned result is /// guaranteed not to be predicated. @@ -1911,6 +1910,9 @@ private: bool splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags); + /// Forget predicated/non-predicated backedge taken counts for the given loop. + void forgetBackedgeTakenCounts(const Loop *L, bool Predicated); + /// Drop memoized information for all \p SCEVs. void forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs); @@ -1923,6 +1925,9 @@ private: /// Erase Value from ValueExprMap and ExprValueMap. void eraseValueFromMap(Value *V); + /// Insert V to S mapping into ValueExprMap and ExprValueMap. + void insertValueToMap(Value *V, const SCEV *S); + /// Return false iff given SCEV contains a SCEVUnknown with NULL value- /// pointer. 
bool checkValidity(const SCEV *S) const; diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index ded53617b304..9c1abef33b28 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1048,6 +1048,12 @@ TLI_DEFINE_STRING_INTERNAL("memset") /// void memset_pattern16(void *b, const void *pattern16, size_t len); TLI_DEFINE_ENUM_INTERNAL(memset_pattern16) TLI_DEFINE_STRING_INTERNAL("memset_pattern16") +/// void memset_pattern4(void *b, const void *pattern4, size_t len); +TLI_DEFINE_ENUM_INTERNAL(memset_pattern4) +TLI_DEFINE_STRING_INTERNAL("memset_pattern4") +/// void memset_pattern8(void *b, const void *pattern8, size_t len); +TLI_DEFINE_ENUM_INTERNAL(memset_pattern8) +TLI_DEFINE_STRING_INTERNAL("memset_pattern8") /// int mkdir(const char *path, mode_t mode); TLI_DEFINE_ENUM_INTERNAL(mkdir) TLI_DEFINE_STRING_INTERNAL("mkdir") diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 24e2318de48b..751c88a4ecbb 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -115,7 +115,7 @@ struct VFShape { return {EC, Parameters}; } - /// Sanity check on the Parameters in the VFShape. + /// Validation check on the Parameters in the VFShape. bool hasValidParameterList() const; }; diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index a270fd399aeb..c199e933116a 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1602,6 +1602,13 @@ enum { NT_FREEBSD_PROCSTAT_AUXV = 16, }; +// NetBSD core note types. +enum { + NT_NETBSDCORE_PROCINFO = 1, + NT_NETBSDCORE_AUXV = 2, + NT_NETBSDCORE_LWPSTATUS = 24, +}; + // OpenBSD core note types. 
enum { NT_OPENBSD_PROCINFO = 10, diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index ed3cd54df272..73d39fecc268 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -130,6 +130,7 @@ bool getEnableMachineFunctionSplitter(); bool getEnableDebugEntryValues(); bool getValueTrackingVariableLocations(); +Optional<bool> getExplicitValueTrackingVariableLocations(); bool getForceDwarfFrameSection(); @@ -170,6 +171,10 @@ void setFunctionAttributes(StringRef CPU, StringRef Features, Function &F); /// Set function attributes of functions in Module M based on CPU, /// Features, and command line flags. void setFunctionAttributes(StringRef CPU, StringRef Features, Module &M); + +/// Should value-tracking variable locations / instruction referencing be +/// enabled by default for this triple? +bool getDefaultValueTrackingVariableLocations(const llvm::Triple &T); } // namespace codegen } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index ff4ad4b72636..f3fa652b0175 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -564,6 +564,7 @@ public: /// This variant does not erase \p MI after calling the build function. 
void applyBuildFnNoErase(MachineInstr &MI, BuildFnTy &MatchInfo); + bool matchOrShiftToFunnelShift(MachineInstr &MI, BuildFnTy &MatchInfo); bool matchFunnelShiftToRotate(MachineInstr &MI); void applyFunnelShiftToRotate(MachineInstr &MI); bool matchRotateOutOfRange(MachineInstr &MI); @@ -648,6 +649,54 @@ public: /// (fma fneg(x), fneg(y), z) -> (fma x, y, z) bool matchRedundantNegOperands(MachineInstr &MI, BuildFnTy &MatchInfo); + bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally, + bool &HasFMAD, bool &Aggressive, + bool CanReassociate = false); + + /// Transform (fadd (fmul x, y), z) -> (fma x, y, z) + /// (fadd (fmul x, y), z) -> (fmad x, y, z) + bool matchCombineFAddFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo); + + /// Transform (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + /// (fadd (fpext (fmul x, y)), z) -> (fmad (fpext x), (fpext y), z) + bool matchCombineFAddFpExtFMulToFMadOrFMA(MachineInstr &MI, + BuildFnTy &MatchInfo); + + /// Transform (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z)) + /// (fadd (fmad x, y, (fmul u, v)), z) -> (fmad x, y, (fmad u, v, z)) + bool matchCombineFAddFMAFMulToFMadOrFMA(MachineInstr &MI, + BuildFnTy &MatchInfo); + + // Transform (fadd (fma x, y, (fpext (fmul u, v))), z) + // -> (fma x, y, (fma (fpext u), (fpext v), z)) + // (fadd (fmad x, y, (fpext (fmul u, v))), z) + // -> (fmad x, y, (fmad (fpext u), (fpext v), z)) + bool matchCombineFAddFpExtFMulToFMadOrFMAAggressive(MachineInstr &MI, + BuildFnTy &MatchInfo); + + /// Transform (fsub (fmul x, y), z) -> (fma x, y, -z) + /// (fsub (fmul x, y), z) -> (fmad x, y, -z) + bool matchCombineFSubFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo); + + /// Transform (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) + /// (fsub (fneg (fmul, x, y)), z) -> (fmad (fneg x), y, (fneg z)) + bool matchCombineFSubFNegFMulToFMadOrFMA(MachineInstr &MI, + BuildFnTy &MatchInfo); + + /// Transform (fsub (fpext (fmul x, y)), 
z) + /// -> (fma (fpext x), (fpext y), (fneg z)) + /// (fsub (fpext (fmul x, y)), z) + /// -> (fmad (fpext x), (fpext y), (fneg z)) + bool matchCombineFSubFpExtFMulToFMadOrFMA(MachineInstr &MI, + BuildFnTy &MatchInfo); + + /// Transform (fsub (fpext (fneg (fmul x, y))), z) + /// -> (fneg (fma (fpext x), (fpext y), z)) + /// (fsub (fpext (fneg (fmul x, y))), z) + /// -> (fneg (fmad (fpext x), (fpext y), z)) + bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI, + BuildFnTy &MatchInfo); + private: /// Given a non-indexed load or store instruction \p MI, find an offset that /// can be usefully and legally folded into it as a post-indexing operation. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index e813d030eec3..a41166bb4c6b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -129,6 +129,43 @@ inline SpecificConstantMatch m_SpecificICst(int64_t RequestedValue) { return SpecificConstantMatch(RequestedValue); } +/// Matcher for a specific constant splat. +struct SpecificConstantSplatMatch { + int64_t RequestedVal; + SpecificConstantSplatMatch(int64_t RequestedVal) + : RequestedVal(RequestedVal) {} + bool match(const MachineRegisterInfo &MRI, Register Reg) { + return isBuildVectorConstantSplat(Reg, MRI, RequestedVal, + /* AllowUndef */ false); + } +}; + +/// Matches a constant splat of \p RequestedValue. +inline SpecificConstantSplatMatch m_SpecificICstSplat(int64_t RequestedValue) { + return SpecificConstantSplatMatch(RequestedValue); +} + +/// Matcher for a specific constant or constant splat. 
+struct SpecificConstantOrSplatMatch { + int64_t RequestedVal; + SpecificConstantOrSplatMatch(int64_t RequestedVal) + : RequestedVal(RequestedVal) {} + bool match(const MachineRegisterInfo &MRI, Register Reg) { + int64_t MatchedVal; + if (mi_match(Reg, MRI, m_ICst(MatchedVal)) && MatchedVal == RequestedVal) + return true; + return isBuildVectorConstantSplat(Reg, MRI, RequestedVal, + /* AllowUndef */ false); + } +}; + +/// Matches a \p RequestedValue constant or a constant splat of \p +/// RequestedValue. +inline SpecificConstantOrSplatMatch +m_SpecificICstOrSplat(int64_t RequestedValue) { + return SpecificConstantOrSplatMatch(RequestedValue); +} + ///{ /// Convenience matchers for specific integer values. inline SpecificConstantMatch m_ZeroInt() { return SpecificConstantMatch(0); } @@ -489,6 +526,11 @@ inline UnaryOp_match<SrcTy, TargetOpcode::COPY> m_Copy(SrcTy &&Src) { return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src)); } +template <typename SrcTy> +inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) { + return UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>(Src); +} + // General helper for generic MI compares, i.e. G_ICMP and G_FCMP // TODO: Allow checking a specific predicate. template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode> diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 86545b976b8d..4126e2ac7b8f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -378,6 +378,18 @@ Optional<FPValueAndVReg> getFConstantSplat(Register VReg, const MachineRegisterInfo &MRI, bool AllowUndef = true); +/// Return true if the specified register is defined by G_BUILD_VECTOR or +/// G_BUILD_VECTOR_TRUNC where all of the elements are \p SplatValue or undef. 
+bool isBuildVectorConstantSplat(const Register Reg, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef); + +/// Return true if the specified instruction is a G_BUILD_VECTOR or +/// G_BUILD_VECTOR_TRUNC where all of the elements are \p SplatValue or undef. +bool isBuildVectorConstantSplat(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + int64_t SplatValue, bool AllowUndef); + /// Return true if the specified instruction is a G_BUILD_VECTOR or /// G_BUILD_VECTOR_TRUNC where all of the elements are 0 or undef. bool isBuildVectorAllZeros(const MachineInstr &MI, diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index dcbd19ac6b5a..ec23dde0c6c0 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -938,7 +938,8 @@ public: int64_t Offset, LLT Ty); MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size) { - return getMachineMemOperand(MMO, Offset, LLT::scalar(8 * Size)); + return getMachineMemOperand( + MMO, Offset, Size == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * Size)); } /// getMachineMemOperand - Allocate a new MachineMemOperand by copying diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index fa22ca6a98ac..a855a0797723 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -139,10 +139,13 @@ public: /// int getOffsetOfLocalArea() const { return LocalAreaOffset; } - /// isFPCloseToIncomingSP - Return true if the frame pointer is close to - /// the incoming stack pointer, false if it is close to the post-prologue - /// stack pointer. - virtual bool isFPCloseToIncomingSP() const { return true; } + /// Control the placement of special register scavenging spill slots when + /// allocating a stack frame. 
+ /// + /// If this returns true, the frame indexes used by the RegScavenger will be + /// allocated closest to the incoming stack pointer. + virtual bool allocateScavengingFrameIndexesNearIncomingSP( + const MachineFunction &MF) const; /// assignCalleeSavedSpillSlots - Allows target to override spill slot /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should @@ -220,6 +223,9 @@ public: virtual void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {} + /// Does the stack probe function call return with a modified stack pointer? + virtual bool stackProbeFunctionModifiesSP() const { return false; } + /// Adjust the prologue to have the function use segmented stacks. This works /// by adding a check even before the "normal" function prologue. virtual void adjustForSegmentedStacks(MachineFunction &MF, diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 8bc730a3eda5..d43dd9fac85d 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1913,6 +1913,12 @@ public: "Target didn't implement TargetInstrInfo::getOutliningCandidateInfo!"); } + /// Optional target hook to create the LLVM IR attributes for the outlined + /// function. If overridden, the overriding function must call the default + /// implementation. + virtual void mergeOutliningCandidateAttributes( + Function &F, std::vector<outliner::Candidate> &Candidates) const; + /// Returns how or if \p MI should be outlined. 
virtual outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 87f5168ec48f..d862701c37d7 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -425,6 +425,12 @@ public: return true; } + /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded + /// using generic code in SelectionDAGBuilder. + virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const { + return true; + } + /// Return true if it is profitable to convert a select of FP constants into /// a constant pool load whose address depends on the select condition. The /// parameter may be used to differentiate a select with FP compare from @@ -806,9 +812,12 @@ public: /// Return true if target always benefits from combining into FMA for a /// given value type. This must typically return false on targets where FMA /// takes more cycles to execute than FADD. - virtual bool enableAggressiveFMAFusion(EVT VT) const { - return false; - } + virtual bool enableAggressiveFMAFusion(EVT VT) const { return false; } + + /// Return true if target always benefits from combining into FMA for a + /// given value type. This must typically return false on targets where FMA + /// takes more cycles to execute than FADD. + virtual bool enableAggressiveFMAFusion(LLT Ty) const { return false; } /// Return the ValueType of the result of SETCC operations. virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -2710,6 +2719,14 @@ public: /// Return true if an fpext operation input to an \p Opcode operation is free /// (for instance, because half-precision floating-point numbers are /// implicitly extended to float-precision) for an FMA instruction. 
+ virtual bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return false; + } + + /// Return true if an fpext operation input to an \p Opcode operation is free + /// (for instance, because half-precision floating-point numbers are + /// implicitly extended to float-precision) for an FMA instruction. virtual bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && @@ -2748,11 +2765,47 @@ public: return false; } + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. + /// + /// NOTE: This may be called before legalization on types for which FMAs are + /// not legal, but should return true if those types will eventually legalize + /// to types that support FMAs. After legalization, it will only be called on + /// types that support FMAs (via Legal or Custom actions) + virtual bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + LLT) const { + return false; + } + /// IR version virtual bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *) const { return false; } + /// Returns true if \p MI can be combined with another instruction to + /// form TargetOpcode::G_FMAD. \p N may be an TargetOpcode::G_FADD, + /// TargetOpcode::G_FSUB, or an TargetOpcode::G_FMUL which will be + /// distributed into an fadd/fsub. 
+ virtual bool isFMADLegal(const MachineInstr &MI, LLT Ty) const { + assert((MI.getOpcode() == TargetOpcode::G_FADD || + MI.getOpcode() == TargetOpcode::G_FSUB || + MI.getOpcode() == TargetOpcode::G_FMUL) && + "unexpected node in FMAD forming combine"); + switch (Ty.getScalarSizeInBits()) { + case 16: + return isOperationLegal(TargetOpcode::G_FMAD, MVT::f16); + case 32: + return isOperationLegal(TargetOpcode::G_FMAD, MVT::f32); + case 64: + return isOperationLegal(TargetOpcode::G_FMAD, MVT::f64); + default: + break; + } + + return false; + } + /// Returns true if be combined with to form an ISD::FMAD. \p N may be an /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an /// fadd/fsub. @@ -2852,6 +2905,12 @@ public: /// passed to the fp16 to fp conversion library function. virtual bool shouldKeepZExtForFP16Conv() const { return false; } + /// Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT + /// from min(max(fptoi)) saturation patterns. + virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const { + return isOperationLegalOrCustom(Op, VT); + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h index 902973ff5722..ae1afeb668be 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -156,6 +156,11 @@ public: NormalUnits.getNumInfoUnits()); } + const DWARFUnitVector &getNormalUnitsVector() { + parseNormalUnits(); + return NormalUnits; + } + /// Get units from .debug_types in this context. 
unit_iterator_range types_section_units() { parseNormalUnits(); diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index d471b80c7fe1..505686bfbf59 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -14,6 +14,7 @@ #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include <cstdint> #include <map> #include <set> @@ -153,8 +154,8 @@ private: /// \param SectionKind The object-file section kind that S comes from. /// /// \returns The number of errors that occurred during verification. - unsigned verifyUnitSection(const DWARFSection &S, - DWARFSectionKind SectionKind); + unsigned verifyUnitSection(const DWARFSection &S); + unsigned verifyUnits(const DWARFUnitVector &Units); /// Verifies that a call site entry is nested within a subprogram with a /// DW_AT_call attribute. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 362e8ab8e296..2180be3341e1 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -519,6 +519,7 @@ private: /// symbols of an error. class MaterializationResponsibility { friend class ExecutionSession; + friend class JITDylib; public: MaterializationResponsibility(MaterializationResponsibility &&) = delete; @@ -535,10 +536,10 @@ public: /// Returns the target JITDylib that these symbols are being materialized /// into. - JITDylib &getTargetJITDylib() const { return *JD; } + JITDylib &getTargetJITDylib() const { return JD; } /// Returns the ExecutionSession for this instance. - ExecutionSession &getExecutionSession(); + ExecutionSession &getExecutionSession() const; /// Returns the symbol flags map for this responsibility instance. 
/// Note: The returned flags may have transient flags (Lazy, Materializing) @@ -640,15 +641,16 @@ public: private: /// Create a MaterializationResponsibility for the given JITDylib and /// initial symbols. - MaterializationResponsibility(JITDylibSP JD, SymbolFlagsMap SymbolFlags, + MaterializationResponsibility(ResourceTrackerSP RT, + SymbolFlagsMap SymbolFlags, SymbolStringPtr InitSymbol) - : JD(std::move(JD)), SymbolFlags(std::move(SymbolFlags)), - InitSymbol(std::move(InitSymbol)) { - assert(this->JD && "Cannot initialize with null JITDylib"); + : JD(RT->getJITDylib()), RT(std::move(RT)), + SymbolFlags(std::move(SymbolFlags)), InitSymbol(std::move(InitSymbol)) { assert(!this->SymbolFlags.empty() && "Materializing nothing?"); } - JITDylibSP JD; + JITDylib &JD; + ResourceTrackerSP RT; SymbolFlagsMap SymbolFlags; SymbolStringPtr InitSymbol; }; @@ -913,12 +915,26 @@ public: const SymbolLookupSet &LookupSet) = 0; }; -/// A symbol table that supports asynchoronous symbol queries. +/// Represents a JIT'd dynamic library. +/// +/// This class aims to mimic the behavior of a regular dylib or shared object, +/// but without requiring the contained program representations to be compiled +/// up-front. The JITDylib's content is defined by adding MaterializationUnits, +/// and contained MaterializationUnits will typically rely on the JITDylib's +/// links-against order to resolve external references (similar to a regular +/// dylib). +/// +/// The JITDylib object is a thin wrapper that references state held by the +/// ExecutionSession. JITDylibs can be removed, clearing this underlying state +/// and leaving the JITDylib object in a defunct state. In this state the +/// JITDylib's name is guaranteed to remain accessible. If the ExecutionSession +/// is still alive then other operations are callable but will return an Error +/// or null result (depending on the API). 
It is illegal to call any operation +/// other than getName on a JITDylib after the ExecutionSession has been torn +/// down. /// -/// Represents a virtual shared object. Instances can not be copied or moved, so -/// their addresses may be used as keys for resource management. -/// JITDylib state changes must be made via an ExecutionSession to guarantee -/// that they are synchronized with respect to other JITDylib operations. +/// JITDylibs cannot be moved or copied. Their address is stable, and useful as +/// a key in some JIT data structures. class JITDylib : public ThreadSafeRefCountedBase<JITDylib>, public jitlink::JITLinkDylib { friend class AsynchronousSymbolQuery; @@ -931,10 +947,21 @@ public: JITDylib &operator=(const JITDylib &) = delete; JITDylib(JITDylib &&) = delete; JITDylib &operator=(JITDylib &&) = delete; + ~JITDylib(); /// Get a reference to the ExecutionSession for this JITDylib. + /// + /// It is legal to call this method on a defunct JITDylib, however the result + /// will only be usable if the ExecutionSession is still alive. If this JITDylib + /// is held by an error that may have torn down the JIT then the result + /// should not be used. ExecutionSession &getExecutionSession() const { return ES; } + /// Dump current JITDylib state to OS. + /// + /// It is legal to call this method on a defunct JITDylib. + void dump(raw_ostream &OS); + /// Calls remove on all trackers currently associated with this JITDylib. /// Does not run static deinits. /// @@ -942,12 +969,21 @@ public: /// added concurrently while the clear is underway, and the newly added /// code will *not* be cleared. Adding new code concurrently with a clear /// is usually a bug and should be avoided. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. Error clear(); /// Get the default resource tracker for this JITDylib. 
+ /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. ResourceTrackerSP getDefaultResourceTracker(); /// Create a resource tracker for this JITDylib. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. ResourceTrackerSP createResourceTracker(); /// Adds a definition generator to this JITDylib and returns a reference to @@ -956,6 +992,9 @@ public: /// When JITDylibs are searched during lookup, if no existing definition of /// a symbol is found, then any generators that have been added are run (in /// the order that they were added) to potentially generate a definition. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. template <typename GeneratorT> GeneratorT &addGenerator(std::unique_ptr<GeneratorT> DefGenerator); @@ -963,6 +1002,9 @@ public: /// /// The given generator must exist in this JITDylib's generators list (i.e. /// have been added and not yet removed). + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. void removeGenerator(DefinitionGenerator &G); /// Set the link order to be used when fixing up definitions in JITDylib. @@ -983,26 +1025,41 @@ public: /// as the first in the link order (instead of this dylib) ensures that /// definitions within this dylib resolve to the lazy-compiling stubs, /// rather than immediately materializing the definitions in this dylib. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. void setLinkOrder(JITDylibSearchOrder NewSearchOrder, bool LinkAgainstThisJITDylibFirst = true); /// Add the given JITDylib to the link order for definitions in this /// JITDylib. 
+ /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. void addToLinkOrder(JITDylib &JD, JITDylibLookupFlags JDLookupFlags = JITDylibLookupFlags::MatchExportedSymbolsOnly); /// Replace OldJD with NewJD in the link order if OldJD is present. /// Otherwise this operation is a no-op. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. void replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD, JITDylibLookupFlags JDLookupFlags = JITDylibLookupFlags::MatchExportedSymbolsOnly); /// Remove the given JITDylib from the link order for this JITDylib if it is /// present. Otherwise this operation is a no-op. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. void removeFromLinkOrder(JITDylib &JD); /// Do something with the link order (run under the session lock). + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. template <typename Func> auto withLinkOrderDo(Func &&F) -> decltype(F(std::declval<const JITDylibSearchOrder &>())); @@ -1014,6 +1071,9 @@ public: /// /// This overload always takes ownership of the MaterializationUnit. If any /// errors occur, the MaterializationUnit consumed. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. template <typename MaterializationUnitType> Error define(std::unique_ptr<MaterializationUnitType> &&MU, ResourceTrackerSP RT = nullptr); @@ -1025,6 +1085,9 @@ public: /// generated. If an error occurs, ownership remains with the caller. This /// may allow the caller to modify the MaterializationUnit to correct the /// issue, then re-call define. 
+ /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. template <typename MaterializationUnitType> Error define(std::unique_ptr<MaterializationUnitType> &MU, ResourceTrackerSP RT = nullptr); @@ -1039,28 +1102,40 @@ public: /// /// On success, all symbols are removed. On failure, the JITDylib state is /// left unmodified (no symbols are removed). + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. Error remove(const SymbolNameSet &Names); - /// Dump current JITDylib state to OS. - void dump(raw_ostream &OS); - /// Returns the given JITDylibs and all of their transitive dependencies in /// DFS order (based on linkage relationships). Each JITDylib will appear /// only once. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. static std::vector<JITDylibSP> getDFSLinkOrder(ArrayRef<JITDylibSP> JDs); /// Returns the given JITDylibs and all of their transitive dependencies in /// reverse DFS order (based on linkage relationships). Each JITDylib will /// appear only once. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. static std::vector<JITDylibSP> getReverseDFSLinkOrder(ArrayRef<JITDylibSP> JDs); /// Return this JITDylib and its transitive dependencies in DFS order /// based on linkage relationships. + /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. std::vector<JITDylibSP> getDFSLinkOrder(); /// Return this JITDylib and its transitive dependencies in reverse DFS order /// based on linkage relationships. 
+ /// + /// It is illegal to call this method on a defunct JITDylib and the client + /// is responsible for ensuring that they do not do so. std::vector<JITDylibSP> getReverseDFSLinkOrder(); private: @@ -1151,7 +1226,6 @@ private: JITDylib(ExecutionSession &ES, std::string Name); - ResourceTrackerSP getTracker(MaterializationResponsibility &MR); std::pair<AsynchronousSymbolQuerySet, std::shared_ptr<SymbolDependenceMap>> removeTracker(ResourceTracker &RT); @@ -1197,8 +1271,8 @@ private: failSymbols(FailedSymbolsWorklist); ExecutionSession &ES; + enum { Open, Closing, Closed } State = Open; std::mutex GeneratorsMutex; - bool Open = true; SymbolTable Symbols; UnmaterializedInfosMap UnmaterializedInfos; MaterializingInfosMap MaterializingInfos; @@ -1208,7 +1282,8 @@ private: // Map trackers to sets of symbols tracked. DenseMap<ResourceTracker *, SymbolNameVector> TrackerSymbols; - DenseMap<MaterializationResponsibility *, ResourceTracker *> MRTrackers; + DenseMap<ResourceTracker *, DenseSet<MaterializationResponsibility *>> + TrackerMRs; }; /// Platforms set up standard symbols and mediate interactions between dynamic @@ -1363,6 +1438,18 @@ public: /// If no Platform is attached this call is equivalent to createBareJITDylib. Expected<JITDylib &> createJITDylib(std::string Name); + /// Closes the given JITDylib. + /// + /// This method clears all resources held for the JITDylib, puts it in the + /// closed state, and clears all references held by the ExecutionSession and + /// other JITDylibs. No further code can be added to the JITDylib, and the + /// object will be freed once any remaining JITDylibSPs to it are destroyed. + /// + /// This method does *not* run static destructors. + /// + /// This method can only be called once for each JITDylib. + Error removeJITDylib(JITDylib &JD); + /// Set the error reporter function. 
ExecutionSession &setErrorReporter(ErrorReporter ReportError) { this->ReportError = std::move(ReportError); @@ -1574,9 +1661,9 @@ private: SymbolStringPtr InitSymbol) { auto &JD = RT.getJITDylib(); std::unique_ptr<MaterializationResponsibility> MR( - new MaterializationResponsibility(&JD, std::move(Symbols), + new MaterializationResponsibility(&RT, std::move(Symbols), std::move(InitSymbol))); - JD.MRTrackers[MR.get()] = &RT; + JD.TrackerMRs[&RT].insert(MR.get()); return MR; } @@ -1660,18 +1747,17 @@ private: JITDispatchHandlers; }; -inline ExecutionSession &MaterializationResponsibility::getExecutionSession() { - return JD->getExecutionSession(); +inline ExecutionSession & +MaterializationResponsibility::getExecutionSession() const { + return JD.getExecutionSession(); } template <typename Func> Error MaterializationResponsibility::withResourceKeyDo(Func &&F) const { - return JD->getExecutionSession().runSessionLocked([&]() -> Error { - auto I = JD->MRTrackers.find(this); - assert(I != JD->MRTrackers.end() && "No tracker for this MR"); - if (I->second->isDefunct()) - return make_error<ResourceTrackerDefunct>(I->second); - F(I->second->getKeyUnsafe()); + return JD.getExecutionSession().runSessionLocked([&]() -> Error { + if (RT->isDefunct()) + return make_error<ResourceTrackerDefunct>(RT); + F(RT->getKeyUnsafe()); return Error::success(); }); } @@ -1679,14 +1765,17 @@ Error MaterializationResponsibility::withResourceKeyDo(Func &&F) const { template <typename GeneratorT> GeneratorT &JITDylib::addGenerator(std::unique_ptr<GeneratorT> DefGenerator) { auto &G = *DefGenerator; - std::lock_guard<std::mutex> Lock(GeneratorsMutex); - DefGenerators.push_back(std::move(DefGenerator)); + ES.runSessionLocked([&] { + assert(State == Open && "Cannot add generator to closed JITDylib"); + DefGenerators.push_back(std::move(DefGenerator)); + }); return G; } template <typename Func> auto JITDylib::withLinkOrderDo(Func &&F) -> decltype(F(std::declval<const JITDylibSearchOrder &>())) { + 
assert(State == Open && "Cannot use link order of closed JITDylib"); return ES.runSessionLocked([&]() { return F(LinkOrder); }); } @@ -1715,6 +1804,8 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &&MU, }); return ES.runSessionLocked([&, this]() -> Error { + assert(State == Open && "JD is defunct"); + if (auto Err = defineImpl(*MU)) return Err; @@ -1756,6 +1847,8 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &MU, }); return ES.runSessionLocked([&, this]() -> Error { + assert(State == Open && "JD is defunct"); + if (auto Err = defineImpl(*MU)) return Err; @@ -1800,50 +1893,50 @@ private: // --------------------------------------------- inline MaterializationResponsibility::~MaterializationResponsibility() { - JD->getExecutionSession().OL_destroyMaterializationResponsibility(*this); + getExecutionSession().OL_destroyMaterializationResponsibility(*this); } inline SymbolNameSet MaterializationResponsibility::getRequestedSymbols() const { - return JD->getExecutionSession().OL_getRequestedSymbols(*this); + return getExecutionSession().OL_getRequestedSymbols(*this); } inline Error MaterializationResponsibility::notifyResolved( const SymbolMap &Symbols) { - return JD->getExecutionSession().OL_notifyResolved(*this, Symbols); + return getExecutionSession().OL_notifyResolved(*this, Symbols); } inline Error MaterializationResponsibility::notifyEmitted() { - return JD->getExecutionSession().OL_notifyEmitted(*this); + return getExecutionSession().OL_notifyEmitted(*this); } inline Error MaterializationResponsibility::defineMaterializing( SymbolFlagsMap SymbolFlags) { - return JD->getExecutionSession().OL_defineMaterializing( - *this, std::move(SymbolFlags)); + return getExecutionSession().OL_defineMaterializing(*this, + std::move(SymbolFlags)); } inline void MaterializationResponsibility::failMaterialization() { - JD->getExecutionSession().OL_notifyFailed(*this); + getExecutionSession().OL_notifyFailed(*this); } inline Error 
MaterializationResponsibility::replace( std::unique_ptr<MaterializationUnit> MU) { - return JD->getExecutionSession().OL_replace(*this, std::move(MU)); + return getExecutionSession().OL_replace(*this, std::move(MU)); } inline Expected<std::unique_ptr<MaterializationResponsibility>> MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) { - return JD->getExecutionSession().OL_delegate(*this, Symbols); + return getExecutionSession().OL_delegate(*this, Symbols); } inline void MaterializationResponsibility::addDependencies( const SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies) { - JD->getExecutionSession().OL_addDependencies(*this, Name, Dependencies); + getExecutionSession().OL_addDependencies(*this, Name, Dependencies); } inline void MaterializationResponsibility::addDependenciesForAll( const SymbolDependenceMap &Dependencies) { - JD->getExecutionSession().OL_addDependenciesForAll(*this, Dependencies); + getExecutionSession().OL_addDependenciesForAll(*this, Dependencies); } } // End namespace orc diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 2fec3e7e4230..d2f9bac16e5a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -120,6 +120,10 @@ enum class OMPScheduleType { Runtime = 37, Auto = 38, // auto + StaticBalancedChunked = 45, // static with chunk adjustment (e.g., simd) + GuidedSimd = 46, // guided with chunk adjustment + RuntimeSimd = 47, // runtime with chunk adjustment + ModifierMonotonic = (1 << 29), // Set if the monotonic schedule modifier was present ModifierNonmonotonic = diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index b4e099e4ec20..bcf52278ccbb 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1670,32 +1670,6 @@ public: return CreateAlignedLoad(Ty, Ptr, MaybeAlign(), isVolatile, Name); } - // Deprecated [opaque 
pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateLoad(Value *Ptr, - const char *Name), - "Use the version that explicitly specifies the " - "loaded type instead") { - return CreateLoad(Ptr->getType()->getPointerElementType(), Ptr, Name); - } - - // Deprecated [opaque pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateLoad(Value *Ptr, - const Twine &Name = ""), - "Use the version that explicitly specifies the " - "loaded type instead") { - return CreateLoad(Ptr->getType()->getPointerElementType(), Ptr, Name); - } - - // Deprecated [opaque pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateLoad(Value *Ptr, - bool isVolatile, - const Twine &Name = ""), - "Use the version that explicitly specifies the " - "loaded type instead") { - return CreateLoad(Ptr->getType()->getPointerElementType(), Ptr, isVolatile, - Name); - } - StoreInst *CreateStore(Value *Val, Value *Ptr, bool isVolatile = false) { return CreateAlignedStore(Val, Ptr, MaybeAlign(), isVolatile); } @@ -1719,35 +1693,6 @@ public: return Insert(new LoadInst(Ty, Ptr, Twine(), isVolatile, *Align), Name); } - // Deprecated [opaque pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateAlignedLoad(Value *Ptr, - MaybeAlign Align, - const char *Name), - "Use the version that explicitly specifies the " - "loaded type instead") { - return CreateAlignedLoad(Ptr->getType()->getPointerElementType(), Ptr, - Align, Name); - } - // Deprecated [opaque pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateAlignedLoad(Value *Ptr, - MaybeAlign Align, - const Twine &Name = ""), - "Use the version that explicitly specifies the " - "loaded type instead") { - return CreateAlignedLoad(Ptr->getType()->getPointerElementType(), Ptr, - Align, Name); - } - // Deprecated [opaque pointer types] - LLVM_ATTRIBUTE_DEPRECATED(LoadInst *CreateAlignedLoad(Value *Ptr, - MaybeAlign Align, - bool isVolatile, - const Twine &Name = ""), - "Use the version that explicitly specifies the " - "loaded type instead") { - 
return CreateAlignedLoad(Ptr->getType()->getPointerElementType(), Ptr, - Align, isVolatile, Name); - } - StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile = false) { if (!Align) { @@ -1788,14 +1733,6 @@ public: return Insert(new AtomicRMWInst(Op, Ptr, Val, *Align, Ordering, SSID)); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateGEP(Value *Ptr, ArrayRef<Value *> IdxList, - const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateGEP(Ptr->getType()->getScalarType()->getPointerElementType(), - Ptr, IdxList, Name); - } - Value *CreateGEP(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "") { if (auto *PC = dyn_cast<Constant>(Ptr)) { @@ -1810,15 +1747,6 @@ public: return Insert(GetElementPtrInst::Create(Ty, Ptr, IdxList), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateInBoundsGEP(Value *Ptr, ArrayRef<Value *> IdxList, - const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateInBoundsGEP( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, IdxList, - Name); - } - Value *CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "") { if (auto *PC = dyn_cast<Constant>(Ptr)) { @@ -1849,15 +1777,6 @@ public: return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idx), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateConstGEP1_32(Value *Ptr, unsigned Idx0, - const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstGEP1_32( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, Idx0, - Name); - } - Value *CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name = "") { Value *Idx = ConstantInt::get(Type::getInt32Ty(Context), Idx0); @@ -1914,15 +1833,6 @@ public: return Insert(GetElementPtrInst::Create(Ty, Ptr, Idx), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateConstGEP1_64(Value *Ptr, uint64_t Idx0, - const 
Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstGEP1_64( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, Idx0, - Name); - } - Value *CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name = "") { Value *Idx = ConstantInt::get(Type::getInt64Ty(Context), Idx0); @@ -1933,15 +1843,6 @@ public: return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idx), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0, - const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstInBoundsGEP1_64( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, Idx0, - Name); - } - Value *CreateConstGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") { Value *Idxs[] = { @@ -1955,15 +1856,6 @@ public: return Insert(GetElementPtrInst::Create(Ty, Ptr, Idxs), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateConstGEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, - const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstGEP2_64( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, Idx0, - Idx1, Name); - } - Value *CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") { Value *Idxs[] = { @@ -1977,28 +1869,11 @@ public: return Insert(GetElementPtrInst::CreateInBounds(Ty, Ptr, Idxs), Name); } - LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateConstInBoundsGEP2_64(Value *Ptr, uint64_t Idx0, - uint64_t Idx1, const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstInBoundsGEP2_64( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, Idx0, - Idx1, Name); - } - Value *CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name = "") { return CreateConstInBoundsGEP2_32(Ty, Ptr, 0, Idx, Name); } - 
LLVM_ATTRIBUTE_DEPRECATED( - Value *CreateStructGEP(Value *Ptr, unsigned Idx, const Twine &Name = ""), - "Use the version with explicit element type instead") { - return CreateConstInBoundsGEP2_32( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, 0, Idx, - Name); - } - /// Same as CreateGlobalString, but return a pointer with "i8*" type /// instead of a pointer to array of i8. /// diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 6d32a898b668..046e9b5e809e 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -975,15 +975,6 @@ public: NameStr, InsertAtEnd); } - LLVM_ATTRIBUTE_DEPRECATED(static GetElementPtrInst *CreateInBounds( - Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr = "", - Instruction *InsertBefore = nullptr), - "Use the version with explicit element type instead") { - return CreateInBounds( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, IdxList, - NameStr, InsertBefore); - } - /// Create an "inbounds" getelementptr. See the documentation for the /// "inbounds" flag in LangRef.html for details. 
static GetElementPtrInst * @@ -996,15 +987,6 @@ public: return GEP; } - LLVM_ATTRIBUTE_DEPRECATED(static GetElementPtrInst *CreateInBounds( - Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr, - BasicBlock *InsertAtEnd), - "Use the version with explicit element type instead") { - return CreateInBounds( - Ptr->getType()->getScalarType()->getPointerElementType(), Ptr, IdxList, - NameStr, InsertAtEnd); - } - static GetElementPtrInst *CreateInBounds(Type *PointeeType, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr, diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 8290342c0d51..b01fa10763b8 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -524,6 +524,20 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">, Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>; + // BCD intrinsics. + def int_ppc_bcdadd : GCCBuiltin<"__builtin_ppc_bcdadd">, Intrinsic< + [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_ppc_bcdadd_p : GCCBuiltin<"__builtin_ppc_bcdadd_p">, Intrinsic< + [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], + [IntrNoMem, ImmArg<ArgIndex<0>>]>; + def int_ppc_bcdsub : GCCBuiltin<"__builtin_ppc_bcdsub">, Intrinsic< + [llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_ppc_bcdsub_p : GCCBuiltin<"__builtin_ppc_bcdsub_p">, Intrinsic< + [llvm_i32_ty], [llvm_i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], + [IntrNoMem, ImmArg<ArgIndex<0>>]>; + // P10 Vector Extract with Mask def int_ppc_altivec_vextractbm : GCCBuiltin<"__builtin_altivec_vextractbm">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; @@ -1073,6 +1087,10 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". 
GCCBuiltin<"__builtin_altivec_crypto_vpermxor">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + def int_ppc_altivec_crypto_vpermxor_be : + GCCBuiltin<"__builtin_altivec_crypto_vpermxor_be">, + Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, + llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_ppc_altivec_crypto_vshasigmad : GCCBuiltin<"__builtin_altivec_crypto_vshasigmad">, diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index b83d83f0d0ab..7d232bba0864 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -250,8 +250,16 @@ public: bool operator!=(const FastMathFlags &OtherFlags) const { return Flags != OtherFlags.Flags; } + + /// Print fast-math flags to \p O. + void print(raw_ostream &O) const; }; +inline raw_ostream &operator<<(raw_ostream &O, FastMathFlags FMF) { + FMF.print(O); + return O; +} + /// Utility class for floating point operations which can have /// information about relaxed accuracy requirements attached to them. class FPMathOperator : public Operator { diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index b858733530e3..320deb80bb1f 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2285,6 +2285,31 @@ m_Not(const ValTy &V) { return m_c_Xor(V, m_AllOnes()); } +template <typename ValTy> struct NotForbidUndef_match { + ValTy Val; + NotForbidUndef_match(const ValTy &V) : Val(V) {} + + template <typename OpTy> bool match(OpTy *V) { + // We do not use m_c_Xor because that could match an arbitrary APInt that is + // not -1 as C and then fail to match the other operand if it is -1. + // This code should still work even when both operands are constants. 
+ Value *X; + const APInt *C; + if (m_Xor(m_Value(X), m_APIntForbidUndef(C)).match(V) && C->isAllOnes()) + return Val.match(X); + if (m_Xor(m_APIntForbidUndef(C), m_Value(X)).match(V) && C->isAllOnes()) + return Val.match(X); + return false; + } +}; + +/// Matches a bitwise 'not' as 'xor V, -1' or 'xor -1, V'. For vectors, the +/// constant value must be composed of only -1 scalar elements. +template <typename ValTy> +inline NotForbidUndef_match<ValTy> m_NotForbidUndef(const ValTy &V) { + return NotForbidUndef_match<ValTy>(V); +} + /// Matches an SMin with LHS and RHS in either order. template <typename LHS, typename RHS> inline MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty, true> diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index 47431adc6fac..c899c46d4055 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -368,6 +368,8 @@ public: Type *getPointerElementType() const { assert(getTypeID() == PointerTyID); + assert(NumContainedTys && + "Attempting to get element type of opaque pointer"); return ContainedTys[0]; } diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 361d6357b303..a3c6b4e70bf5 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -38,7 +38,7 @@ // is one VP intrinsic that maps directly to one SDNode that goes by the // same name. Since the operands are also the same, we open the property // scopes for both the VPIntrinsic and the SDNode at once. -// \p SDOPC The SelectionDAG Node id (eg VP_ADD). +// \p VPSD The SelectionDAG Node id (eg VP_ADD). // \p LEGALPOS The operand position of the SDNode that is used for legalizing // this SDNode. This can be `-1`, in which case the return type of // the SDNode is used. @@ -46,12 +46,12 @@ // \p MASKPOS The mask operand position. // \p EVLPOS The explicit vector length operand position. 
#ifndef BEGIN_REGISTER_VP_SDNODE -#define BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, TDNAME, MASKPOS, EVLPOS) +#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, EVLPOS) #endif // End the property scope of a new VP SDNode. #ifndef END_REGISTER_VP_SDNODE -#define END_REGISTER_VP_SDNODE(SDOPC) +#define END_REGISTER_VP_SDNODE(VPSD) #endif // Helper macros for the common "1:1 - Intrinsic : SDNode" case. @@ -60,22 +60,21 @@ // same name. Since the operands are also the same, we open the property // scopes for both the VPIntrinsic and the SDNode at once. // -// \p INTRIN The canonical name (eg `vp_add`, which at the same time is the +// \p VPID The canonical name (eg `vp_add`, which at the same time is the // name of the intrinsic and the TableGen def of the SDNode). // \p MASKPOS The mask operand position. // \p EVLPOS The explicit vector length operand position. -// \p SDOPC The SelectionDAG Node id (eg VP_ADD). +// \p VPSD The SelectionDAG Node id (eg VP_ADD). // \p LEGALPOS The operand position of the SDNode that is used for legalizing // this SDNode. This can be `-1`, in which case the return type of // the SDNode is used. -#define BEGIN_REGISTER_VP(INTRIN, MASKPOS, EVLPOS, SDOPC, LEGALPOS) \ -BEGIN_REGISTER_VP_INTRINSIC(INTRIN, MASKPOS, EVLPOS) \ -BEGIN_REGISTER_VP_SDNODE(SDOPC, LEGALPOS, INTRIN, MASKPOS, EVLPOS) - -#define END_REGISTER_VP(INTRIN, SDOPC) \ -END_REGISTER_VP_INTRINSIC(INTRIN) \ -END_REGISTER_VP_SDNODE(SDOPC) +#define BEGIN_REGISTER_VP(VPID, MASKPOS, EVLPOS, VPSD, LEGALPOS) \ + BEGIN_REGISTER_VP_INTRINSIC(VPID, MASKPOS, EVLPOS) \ + BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, VPID, MASKPOS, EVLPOS) +#define END_REGISTER_VP(VPID, VPSD) \ + END_REGISTER_VP_INTRINSIC(VPID) \ + END_REGISTER_VP_SDNODE(VPSD) // The following macros attach properties to the scope they are placed in. 
This // assigns the property to the VP Intrinsic and/or SDNode that belongs to the @@ -84,9 +83,9 @@ END_REGISTER_VP_SDNODE(SDOPC) // Property Macros { // The intrinsic and/or SDNode has the same function as this LLVM IR Opcode. -// \p OPC The standard IR opcode. -#ifndef HANDLE_VP_TO_OPC -#define HANDLE_VP_TO_OPC(OPC) +// \p OPC The opcode of the instruction with the same function. +#ifndef VP_PROPERTY_FUNCTIONAL_OPC +#define VP_PROPERTY_FUNCTIONAL_OPC(OPC) #endif // Whether the intrinsic may have a rounding mode or exception behavior operand @@ -96,34 +95,30 @@ END_REGISTER_VP_SDNODE(SDOPC) // \p HASEXCEPT '1' if the intrinsic can have an exception behavior operand // bundle, '0' otherwise. // \p INTRINID The constrained fp intrinsic this VP intrinsic corresponds to. -#ifndef HANDLE_VP_TO_CONSTRAINEDFP -#define HANDLE_VP_TO_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) +#ifndef VP_PROPERTY_CONSTRAINEDFP +#define VP_PROPERTY_CONSTRAINEDFP(HASROUND, HASEXCEPT, INTRINID) #endif // Map this VP intrinsic to its canonical functional intrinsic. -#ifndef HANDLE_VP_TO_INTRIN -#define HANDLE_VP_TO_INTRIN(ID) +// \p INTRIN The non-VP intrinsics with the same function. +#ifndef VP_PROPERTY_FUNCTIONAL_INTRINSIC +#define VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) #endif // This VP Intrinsic is a memory operation // The pointer arg is at POINTERPOS and the data arg is at DATAPOS. -#ifndef HANDLE_VP_IS_MEMOP -#define HANDLE_VP_IS_MEMOP(VPID, POINTERPOS, DATAPOS) +#ifndef VP_PROPERTY_MEMOP +#define VP_PROPERTY_MEMOP(POINTERPOS, DATAPOS) #endif // Map this VP reduction intrinsic to its reduction operand positions. -#ifndef HANDLE_VP_REDUCTION -#define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS) +#ifndef VP_PROPERTY_REDUCTION +#define VP_PROPERTY_REDUCTION(STARTPOS, VECTORPOS) #endif // A property to infer VP binary-op SDNode opcodes automatically. 
-#ifndef PROPERTY_VP_BINARYOP_SDNODE -#define PROPERTY_VP_BINARYOP_SDNODE(ID) -#endif - -// A property to infer VP reduction SDNode opcodes automatically. -#ifndef PROPERTY_VP_REDUCTION_SDNODE -#define PROPERTY_VP_REDUCTION_SDNODE(ID) +#ifndef VP_PROPERTY_BINARYOP +#define VP_PROPERTY_BINARYOP #endif /// } Property Macros @@ -132,15 +127,14 @@ END_REGISTER_VP_SDNODE(SDOPC) // Specialized helper macro for integer binary operators (%x, %y, %mask, %evl). #ifdef HELPER_REGISTER_BINARY_INT_VP -#error "The internal helper macro HELPER_REGISTER_BINARY_INT_VP is already defined!" +#error \ + "The internal helper macro HELPER_REGISTER_BINARY_INT_VP is already defined!" #endif -#define HELPER_REGISTER_BINARY_INT_VP(INTRIN, SDOPC, OPC) \ -BEGIN_REGISTER_VP(INTRIN, 2, 3, SDOPC, -1) \ -HANDLE_VP_TO_OPC(OPC) \ -PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ -END_REGISTER_VP(INTRIN, SDOPC) - - +#define HELPER_REGISTER_BINARY_INT_VP(VPID, VPSD, IROPC) \ + BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, -1) \ + VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ + VP_PROPERTY_BINARYOP \ + END_REGISTER_VP(VPID, VPSD) // llvm.vp.add(x,y,mask,vlen) HELPER_REGISTER_BINARY_INT_VP(vp_add, VP_ADD, Add) @@ -193,12 +187,12 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor) #error \ "The internal helper macro HELPER_REGISTER_BINARY_FP_VP is already defined!" 
#endif -#define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, SDOPC, OPC) \ - BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, SDOPC, -1) \ - HANDLE_VP_TO_OPC(OPC) \ - HANDLE_VP_TO_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \ - PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ - END_REGISTER_VP(vp_##OPSUFFIX, SDOPC) +#define HELPER_REGISTER_BINARY_FP_VP(OPSUFFIX, VPSD, IROPC) \ + BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, VPSD, -1) \ + VP_PROPERTY_FUNCTIONAL_OPC(IROPC) \ + VP_PROPERTY_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \ + VP_PROPERTY_BINARYOP \ + END_REGISTER_VP(vp_##OPSUFFIX, VPSD) // llvm.vp.fadd(x,y,mask,vlen) HELPER_REGISTER_BINARY_FP_VP(fadd, VP_FADD, FAdd) @@ -224,34 +218,34 @@ HELPER_REGISTER_BINARY_FP_VP(frem, VP_FREM, FRem) BEGIN_REGISTER_VP_INTRINSIC(vp_store, 2, 3) // chain = VP_STORE chain,val,base,offset,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_STORE, 0, vp_store, 4, 5) -HANDLE_VP_TO_OPC(Store) -HANDLE_VP_TO_INTRIN(masked_store) -HANDLE_VP_IS_MEMOP(vp_store, 1, 0) +VP_PROPERTY_FUNCTIONAL_OPC(Store) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_store) +VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_store, VP_STORE) // llvm.vp.scatter(ptr,val,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3) // chain = VP_SCATTER chain,val,base,indices,scale,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, -1, vp_scatter, 5, 6) -HANDLE_VP_TO_INTRIN(masked_scatter) -HANDLE_VP_IS_MEMOP(vp_scatter, 1, 0) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_scatter) +VP_PROPERTY_MEMOP(1, 0) END_REGISTER_VP(vp_scatter, VP_SCATTER) // llvm.vp.load(ptr,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(vp_load, 1, 2) // val,chain = VP_LOAD chain,base,offset,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_LOAD, -1, vp_load, 3, 4) -HANDLE_VP_TO_OPC(Load) -HANDLE_VP_TO_INTRIN(masked_load) -HANDLE_VP_IS_MEMOP(vp_load, 0, None) +VP_PROPERTY_FUNCTIONAL_OPC(Load) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) +VP_PROPERTY_MEMOP(0, None) END_REGISTER_VP(vp_load, VP_LOAD) // llvm.vp.gather(ptr,mask,vlen) 
BEGIN_REGISTER_VP_INTRINSIC(vp_gather, 1, 2) // val,chain = VP_GATHER chain,base,indices,scale,mask,evl BEGIN_REGISTER_VP_SDNODE(VP_GATHER, -1, vp_gather, 4, 5) -HANDLE_VP_TO_INTRIN(masked_gather) -HANDLE_VP_IS_MEMOP(vp_gather, 0, None) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_gather) +VP_PROPERTY_MEMOP(0, None) END_REGISTER_VP(vp_gather, VP_GATHER) ///// } Memory Operations @@ -260,14 +254,14 @@ END_REGISTER_VP(vp_gather, VP_GATHER) // Specialized helper macro for VP reductions (%start, %x, %mask, %evl). #ifdef HELPER_REGISTER_REDUCTION_VP -#error "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" +#error \ + "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!" #endif -#define HELPER_REGISTER_REDUCTION_VP(VPINTRIN, SDOPC, INTRIN) \ -BEGIN_REGISTER_VP(VPINTRIN, 2, 3, SDOPC, -1) \ -HANDLE_VP_TO_INTRIN(INTRIN) \ -HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ -PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ -END_REGISTER_VP(VPINTRIN, SDOPC) +#define HELPER_REGISTER_REDUCTION_VP(VPID, VPSD, INTRIN) \ + BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, -1) \ + VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ + VP_PROPERTY_REDUCTION(0, 1) \ + END_REGISTER_VP(VPID, VPSD) // llvm.vp.reduce.add(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD, @@ -320,19 +314,19 @@ HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN, // fast-math flags in the IR and as two distinct ISD opcodes in the // SelectionDAG. #ifdef HELPER_REGISTER_REDUCTION_SEQ_VP -#error "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!" +#error \ + "The internal helper macro HELPER_REGISTER_REDUCTION_SEQ_VP is already defined!" 
#endif -#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPINTRIN, SDOPC, SEQ_SDOPC, INTRIN) \ -BEGIN_REGISTER_VP_INTRINSIC(VPINTRIN, 2, 3) \ -BEGIN_REGISTER_VP_SDNODE(SDOPC, -1, VPINTRIN, 2, 3) \ -END_REGISTER_VP_SDNODE(SDOPC) \ -BEGIN_REGISTER_VP_SDNODE(SEQ_SDOPC, -1, VPINTRIN, 2, 3) \ -END_REGISTER_VP_SDNODE(SEQ_SDOPC) \ -HANDLE_VP_TO_INTRIN(INTRIN) \ -HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ -PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ -PROPERTY_VP_REDUCTION_SDNODE(SEQ_SDOPC) \ -END_REGISTER_VP_INTRINSIC(VPINTRIN) +#define HELPER_REGISTER_REDUCTION_SEQ_VP(VPID, VPSD, SEQ_VPSD, INTRIN) \ + BEGIN_REGISTER_VP_INTRINSIC(VPID, 2, 3) \ + BEGIN_REGISTER_VP_SDNODE(VPSD, -1, VPID, 2, 3) \ + VP_PROPERTY_REDUCTION(0, 1) \ + END_REGISTER_VP_SDNODE(VPSD) \ + BEGIN_REGISTER_VP_SDNODE(SEQ_VPSD, -1, VPID, 2, 3) \ + VP_PROPERTY_REDUCTION(0, 1) \ + END_REGISTER_VP_SDNODE(SEQ_VPSD) \ + VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \ + END_REGISTER_VP_INTRINSIC(VPID) // llvm.vp.reduce.fadd(start,x,mask,vlen) HELPER_REGISTER_REDUCTION_SEQ_VP(vp_reduce_fadd, VP_REDUCE_FADD, @@ -356,8 +350,7 @@ BEGIN_REGISTER_VP_INTRINSIC(vp_select, 0, 3) // END_REGISTER_CASES(vp_select, VP_SELECT) END_REGISTER_VP_INTRINSIC(vp_select) -BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5, - EXPERIMENTAL_VP_SPLICE, -1) +BEGIN_REGISTER_VP(experimental_vp_splice, 3, 5, EXPERIMENTAL_VP_SPLICE, -1) END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE) ///// } Shuffles @@ -368,10 +361,9 @@ END_REGISTER_VP(experimental_vp_splice, EXPERIMENTAL_VP_SPLICE) #undef END_REGISTER_VP #undef END_REGISTER_VP_INTRINSIC #undef END_REGISTER_VP_SDNODE -#undef HANDLE_VP_TO_OPC -#undef HANDLE_VP_TO_CONSTRAINEDFP -#undef HANDLE_VP_TO_INTRIN -#undef HANDLE_VP_IS_MEMOP -#undef HANDLE_VP_REDUCTION -#undef PROPERTY_VP_BINARYOP_SDNODE -#undef PROPERTY_VP_REDUCTION_SDNODE +#undef VP_PROPERTY_BINARYOP +#undef VP_PROPERTY_CONSTRAINEDFP +#undef VP_PROPERTY_FUNCTIONAL_INTRINSIC +#undef VP_PROPERTY_FUNCTIONAL_OPC +#undef VP_PROPERTY_MEMOP +#undef 
VP_PROPERTY_REDUCTION diff --git a/llvm/include/llvm/IR/Verifier.h b/llvm/include/llvm/IR/Verifier.h index f4381d2ae4a9..52a4c7b4301f 100644 --- a/llvm/include/llvm/IR/Verifier.h +++ b/llvm/include/llvm/IR/Verifier.h @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// // -// This file defines the function verifier interface, that can be used for some -// sanity checking of input to the system, and for checking that transformations -// haven't done something bad. +// This file defines the function verifier interface, that can be used for +// validation checking of input to the system, and for checking that +// transformations haven't done something bad. // // Note that this does not provide full 'java style' security and verifications, // instead it just tries to ensure that code is well formed. diff --git a/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h b/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h index 2b0f391570cd..8c0ad2699b8d 100644 --- a/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h +++ b/llvm/include/llvm/MCA/HardwareUnits/Scheduler.h @@ -264,9 +264,10 @@ public: // Update the ready queues. void dump() const; - // This routine performs a sanity check. This routine should only be called - // when we know that 'IR' is not in the scheduler's instruction queues. - void sanityCheck(const InstRef &IR) const { + // This routine performs a basic correctness check. This routine should only + // be called when we know that 'IR' is not in the scheduler's instruction + // queues. 
+ void instructionCheck(const InstRef &IR) const { assert(!is_contained(WaitSet, IR) && "Already in the wait set!"); assert(!is_contained(ReadySet, IR) && "Already in the ready set!"); assert(!is_contained(IssuedSet, IR) && "Already executing!"); diff --git a/llvm/include/llvm/ObjectYAML/MachOYAML.h b/llvm/include/llvm/ObjectYAML/MachOYAML.h index ee89f4eac61f..38a7de3d6131 100644 --- a/llvm/include/llvm/ObjectYAML/MachOYAML.h +++ b/llvm/include/llvm/ObjectYAML/MachOYAML.h @@ -121,6 +121,7 @@ struct LinkEditData { MachOYAML::ExportEntry ExportTrie; std::vector<NListEntry> NameList; std::vector<StringRef> StringTable; + std::vector<yaml::Hex32> IndirectSymbols; bool isEmpty() const; }; diff --git a/llvm/include/llvm/ProfileData/MemProfData.inc b/llvm/include/llvm/ProfileData/MemProfData.inc new file mode 100644 index 000000000000..d64227e4ba31 --- /dev/null +++ b/llvm/include/llvm/ProfileData/MemProfData.inc @@ -0,0 +1,61 @@ +#ifndef MEMPROF_DATA_INC +#define MEMPROF_DATA_INC +/*===-- MemProfData.inc - MemProf profiling runtime structures -*- C++ -*-=== *\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ +/* + * This is the main file that defines all the data structures, signatures, and + * constant literals that are shared across the profiling runtime library + * and host tools (reader/writer). + * + * This file has two identical copies. The primary copy lives in LLVM and + * the other one sits in compiler-rt/include/profile directory. To make changes + * in this file, first modify the primary copy and copy it over to compiler-rt. + * Testing of any change in this file can start only after the two copies are + * synced up. 
+ * +\*===----------------------------------------------------------------------===*/ + + +#ifdef _MSC_VER +#define PACKED(__decl__) __pragma(pack(push,1)) __decl__ __pragma(pack(pop)) +#else +#define PACKED(__decl__) __decl__ __attribute__((__packed__)) +#endif + +// A 64-bit magic number to uniquely identify the raw binary memprof profile file. +#define MEMPROF_RAW_MAGIC_64 \ + ((uint64_t)255 << 56 | (uint64_t)'m' << 48 | (uint64_t)'p' << 40 | (uint64_t)'r' << 32 | \ + (uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129) + +// The version number of the raw binary format. +#define MEMPROF_RAW_VERSION 1ULL + +namespace llvm { +namespace memprof { +// A struct describing the header used for the raw binary memprof profile format. +PACKED(struct Header { + uint64_t Magic; + uint64_t Version; + uint64_t TotalSize; + uint64_t SegmentOffset; + uint64_t MIBOffset; + uint64_t StackOffset; +}); + +// A struct describing the information necessary to describe a /proc/maps +// segment entry for a particular binary/library identified by its build id. +PACKED(struct SegmentEntry { + uint64_t Start; + uint64_t End; + uint64_t Offset; + uint8_t BuildId[32]; +}); +} // namespace memprof +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/ProfileData/RawMemProfReader.h b/llvm/include/llvm/ProfileData/RawMemProfReader.h new file mode 100644 index 000000000000..45544927a86f --- /dev/null +++ b/llvm/include/llvm/ProfileData/RawMemProfReader.h @@ -0,0 +1,43 @@ +#ifndef LLVM_PROFILEDATA_RAWMEMPROFREADER_H_ +#define LLVM_PROFILEDATA_RAWMEMPROFREADER_H_ +//===- MemProfReader.h - Instrumented memory profiling reader ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains support for reading MemProf profiling data. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +namespace llvm { +namespace memprof { + +class RawMemProfReader { +public: + RawMemProfReader(std::unique_ptr<MemoryBuffer> DataBuffer) + : DataBuffer(std::move(DataBuffer)) {} + // Prints aggregate counts for each raw profile parsed from the DataBuffer. + void printSummaries(raw_ostream &OS) const; + + // Return true if the \p DataBuffer starts with magic bytes indicating it is + // a raw binary memprof profile. + static bool hasFormat(const MemoryBuffer &DataBuffer); + + // Create a RawMemProfReader after sanity checking the contents of the file at + // \p Path. + static Expected<std::unique_ptr<RawMemProfReader>> create(const Twine &Path); + +private: + std::unique_ptr<MemoryBuffer> DataBuffer; +}; + +} // namespace memprof +} // namespace llvm + +#endif // LLVM_PROFILEDATA_RAWMEMPROFREADER_H_ diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def index b3cfb71601f1..48e82fa55a0f 100644 --- a/llvm/include/llvm/Support/AArch64TargetParser.def +++ b/llvm/include/llvm/Support/AArch64TargetParser.def @@ -181,7 +181,8 @@ AARCH64_CPU_NAME("cortex-a78c", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AARCH64_CPU_NAME("cortex-a710", ARMV9A, FK_NEON_FP_ARMV8, false, (AArch64::AEK_MTE | AArch64::AEK_PAUTH | AArch64::AEK_FLAGM | AArch64::AEK_SB | AArch64::AEK_I8MM | AArch64::AEK_FP16FML | - AArch64::AEK_SVE2BITPERM | AArch64::AEK_BF16)) + AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | + AArch64::AEK_BF16)) AARCH64_CPU_NAME("cortex-r82", ARMV8R, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_LSE)) AARCH64_CPU_NAME("cortex-x1", 
ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, diff --git a/llvm/include/llvm/Support/AArch64TargetParser.h b/llvm/include/llvm/Support/AArch64TargetParser.h index 131a58412db6..15bb428f19bc 100644 --- a/llvm/include/llvm/Support/AArch64TargetParser.h +++ b/llvm/include/llvm/Support/AArch64TargetParser.h @@ -137,15 +137,6 @@ void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values); bool isX18ReservedByDefault(const Triple &TT); -struct ParsedBranchProtection { - StringRef Scope; - StringRef Key; - bool BranchTargetEnforcement; -}; - -bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, - StringRef &Err); - } // namespace AArch64 } // namespace llvm diff --git a/llvm/include/llvm/Support/ARMAttributeParser.h b/llvm/include/llvm/Support/ARMAttributeParser.h index 5d12b7e08d58..b46a4d9f690f 100644 --- a/llvm/include/llvm/Support/ARMAttributeParser.h +++ b/llvm/include/llvm/Support/ARMAttributeParser.h @@ -67,6 +67,10 @@ class ARMAttributeParser : public ELFAttributeParser { Error DSP_extension(ARMBuildAttrs::AttrType tag); Error T2EE_use(ARMBuildAttrs::AttrType tag); Error Virtualization_use(ARMBuildAttrs::AttrType tag); + Error PAC_extension(ARMBuildAttrs::AttrType tag); + Error BTI_extension(ARMBuildAttrs::AttrType tag); + Error PACRET_use(ARMBuildAttrs::AttrType tag); + Error BTI_use(ARMBuildAttrs::AttrType tag); Error nodefaults(ARMBuildAttrs::AttrType tag); public: diff --git a/llvm/include/llvm/Support/ARMBuildAttributes.h b/llvm/include/llvm/Support/ARMBuildAttributes.h index 37c37522fd26..b4405e7d4908 100644 --- a/llvm/include/llvm/Support/ARMBuildAttributes.h +++ b/llvm/include/llvm/Support/ARMBuildAttributes.h @@ -70,9 +70,13 @@ enum AttrType : unsigned { DIV_use = 44, DSP_extension = 46, MVE_arch = 48, + PAC_extension = 50, + BTI_extension = 52, also_compatible_with = 65, conformance = 67, Virtualization_use = 68, + BTI_use = 74, + PACRET_use = 76, /// Legacy Tags Section = 2, // deprecated (ABI r2.09) @@ -237,7 +241,25 @@ enum { // 
Tag_Virtualization_use, (=68), uleb128 AllowTZ = 1, AllowVirtualization = 2, - AllowTZVirtualization = 3 + AllowTZVirtualization = 3, + + // Tag_PAC_extension, (=50), uleb128 + DisallowPAC = 0, + AllowPACInNOPSpace = 1, + AllowPAC = 2, + + // Tag_BTI_extension, (=52), uleb128 + DisallowBTI = 0, + AllowBTIInNOPSpace = 1, + AllowBTI = 2, + + // Tag_BTI_use, (=74), uleb128 + BTINotUsed = 0, + BTIUsed = 1, + + // Tag_PACRET_use, (=76), uleb128 + PACRETNotUsed = 0, + PACRETUsed = 1 }; } // namespace ARMBuildAttrs diff --git a/llvm/include/llvm/Support/ARMTargetParser.def b/llvm/include/llvm/Support/ARMTargetParser.def index fd08f3e6960c..7d29808f0501 100644 --- a/llvm/include/llvm/Support/ARMTargetParser.def +++ b/llvm/include/llvm/Support/ARMTargetParser.def @@ -201,6 +201,7 @@ ARM_ARCH_EXT_NAME("cdecp4", ARM::AEK_CDECP4, "+cdecp4", "-cdecp4") ARM_ARCH_EXT_NAME("cdecp5", ARM::AEK_CDECP5, "+cdecp5", "-cdecp5") ARM_ARCH_EXT_NAME("cdecp6", ARM::AEK_CDECP6, "+cdecp6", "-cdecp6") ARM_ARCH_EXT_NAME("cdecp7", ARM::AEK_CDECP7, "+cdecp7", "-cdecp7") +ARM_ARCH_EXT_NAME("pacbti", ARM::AEK_PACBTI, "+pacbti", "-pacbti") #undef ARM_ARCH_EXT_NAME #ifndef ARM_HW_DIV_NAME diff --git a/llvm/include/llvm/Support/ARMTargetParser.h b/llvm/include/llvm/Support/ARMTargetParser.h index b1ffcfb34552..b40704c24e87 100644 --- a/llvm/include/llvm/Support/ARMTargetParser.h +++ b/llvm/include/llvm/Support/ARMTargetParser.h @@ -59,7 +59,7 @@ enum ArchExtKind : uint64_t { AEK_CDECP5 = 1 << 27, AEK_CDECP6 = 1 << 28, AEK_CDECP7 = 1 << 29, - + AEK_PACBTI = 1 << 30, // Unsupported extensions. AEK_OS = 1ULL << 59, AEK_IWMMXT = 1ULL << 60, diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index 21fd50763b1f..f39400c26eab 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -528,9 +528,9 @@ protected: /// of CFG edges must not delete the CFG nodes before calling this function. 
/// /// The applyUpdates function can reorder the updates and remove redundant - /// ones internally. The batch updater is also able to detect sequences of - /// zero and exactly one update -- it's optimized to do less work in these - /// cases. + /// ones internally (as long as it is done in a deterministic fashion). The + /// batch updater is also able to detect sequences of zero and exactly one + /// update -- it's optimized to do less work in these cases. /// /// Note that for postdominators it automatically takes care of applying /// updates on reverse edges internally (so there's no need to swap the @@ -538,8 +538,8 @@ protected: /// The type of updates is the same for DomTreeBase<T> and PostDomTreeBase<T> /// with the same template parameter T. /// - /// \param Updates An unordered sequence of updates to perform. The current - /// CFG and the reverse of these updates provides the pre-view of the CFG. + /// \param Updates An ordered sequence of updates to perform. The current CFG + /// and the reverse of these updates provides the pre-view of the CFG. /// void applyUpdates(ArrayRef<UpdateType> Updates) { GraphDiff<NodePtr, IsPostDominator> PreViewCFG( @@ -547,9 +547,9 @@ protected: DomTreeBuilder::ApplyUpdates(*this, PreViewCFG, nullptr); } - /// \param Updates An unordered sequence of updates to perform. The current - /// CFG and the reverse of these updates provides the pre-view of the CFG. - /// \param PostViewUpdates An unordered sequence of update to perform in order + /// \param Updates An ordered sequence of updates to perform. The current CFG + /// and the reverse of these updates provides the pre-view of the CFG. + /// \param PostViewUpdates An ordered sequence of update to perform in order /// to obtain a post-view of the CFG. The DT will be updated assuming the /// obtained PostViewCFG is the desired end state. 
void applyUpdates(ArrayRef<UpdateType> Updates, diff --git a/llvm/include/llvm/Support/HTTPClient.h b/llvm/include/llvm/Support/HTTPClient.h new file mode 100644 index 000000000000..3172610c2d8b --- /dev/null +++ b/llvm/include/llvm/Support/HTTPClient.h @@ -0,0 +1,113 @@ +//===-- llvm/Support/HTTPClient.h - HTTP client library ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declarations of the HTTPClient, HTTPMethod, +/// HTTPResponseHandler, and BufferedHTTPResponseHandler classes, as well as +/// the HTTPResponseBuffer and HTTPRequest structs. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_HTTP_CLIENT_H +#define LLVM_SUPPORT_HTTP_CLIENT_H + +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" + +namespace llvm { + +enum class HTTPMethod { GET }; + +/// A stateless description of an outbound HTTP request. +struct HTTPRequest { + SmallString<128> Url; + HTTPMethod Method = HTTPMethod::GET; + bool FollowRedirects = true; + HTTPRequest(StringRef Url); +}; + +bool operator==(const HTTPRequest &A, const HTTPRequest &B); + +/// A handler for state updates occurring while an HTTPRequest is performed. +/// Can trigger the client to abort the request by returning an Error from any +/// of its methods. +class HTTPResponseHandler { +public: + /// Processes one line of HTTP response headers. + virtual Error handleHeaderLine(StringRef HeaderLine) = 0; + + /// Processes an additional chunk of bytes of the HTTP response body. + virtual Error handleBodyChunk(StringRef BodyChunk) = 0; + + /// Processes the HTTP response status code. 
+ virtual Error handleStatusCode(unsigned Code) = 0; + +protected: + ~HTTPResponseHandler(); +}; + +/// An HTTP response status code bundled with a buffer to store the body. +struct HTTPResponseBuffer { + unsigned Code = 0; + std::unique_ptr<WritableMemoryBuffer> Body; +}; + +/// A simple handler which writes returned data to an HTTPResponseBuffer. +/// Ignores all headers except the Content-Length, which it uses to +/// allocate an appropriately-sized Body buffer. +class BufferedHTTPResponseHandler final : public HTTPResponseHandler { + size_t Offset = 0; + +public: + /// Stores the data received from the HTTP server. + HTTPResponseBuffer ResponseBuffer; + + /// These callbacks store the body and status code in an HTTPResponseBuffer + /// allocated based on Content-Length. The Content-Length header must be + /// handled by handleHeaderLine before any calls to handleBodyChunk. + Error handleHeaderLine(StringRef HeaderLine) override; + Error handleBodyChunk(StringRef BodyChunk) override; + Error handleStatusCode(unsigned Code) override; +}; + +/// A reusable client that can perform HTTPRequests through a network socket. +class HTTPClient { +public: + HTTPClient(); + ~HTTPClient(); + + /// Returns true only if LLVM has been compiled with a working HTTPClient. + static bool isAvailable(); + + /// Must be called at the beginning of a program, while it is a single thread. + static void initialize(); + + /// Must be called at the end of a program, while it is a single thread. + static void cleanup(); + + /// Sets the timeout for the entire request, in milliseconds. A zero or + /// negative value means the request never times out. + void setTimeout(std::chrono::milliseconds Timeout); + + /// Performs the Request, passing response data to the Handler. Returns all + /// errors which occur during the request. Aborts if an error is returned by a + /// Handler method. 
+ Error perform(const HTTPRequest &Request, HTTPResponseHandler &Handler); + + /// Performs the Request with the default BufferedHTTPResponseHandler, and + /// returns its HTTPResponseBuffer or an Error. + Expected<HTTPResponseBuffer> perform(const HTTPRequest &Request); + + /// Performs an HTTPRequest with the default configuration to make a GET + /// request to the given Url. Returns an HTTPResponseBuffer or an Error. + Expected<HTTPResponseBuffer> get(StringRef Url); +}; + +} // end namespace llvm + +#endif // LLVM_SUPPORT_HTTP_CLIENT_H diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index 1d8a0d3c87cb..d73bb8ef1120 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -36,7 +36,7 @@ namespace llvm return true; } else { // Single-threaded debugging code. This would be racy in - // multithreaded mode, but provides not sanity checks in single + // multithreaded mode, but provides no basic checks in single // threaded mode. ++acquired; return true; @@ -49,7 +49,7 @@ namespace llvm return true; } else { // Single-threaded debugging code. This would be racy in - // multithreaded mode, but provides not sanity checks in single + // multithreaded mode, but provides no basic checks in single // threaded mode. assert(acquired && "Lock not acquired before release!"); --acquired; return true; diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 150bc7dbbce1..33a5d3efffee 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -114,7 +114,7 @@ public: } // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. + // mode, but provides no basic checks in single threaded mode. ++readers; return true; } @@ -126,7 +126,7 @@ public: } // Single-threaded debugging code. 
This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. + // mode, but provides no basic checks in single threaded mode. assert(readers > 0 && "Reader lock not acquired before release!"); --readers; return true; @@ -139,7 +139,7 @@ public: } // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. + // mode, but provides no basic checks in single threaded mode. assert(writers == 0 && "Writer lock already acquired!"); ++writers; return true; @@ -152,7 +152,7 @@ public: } // Single-threaded debugging code. This would be racy in multithreaded - // mode, but provides not sanity checks in single threaded mode. + // mode, but provides no basic checks in single threaded mode. assert(writers == 1 && "Writer lock not acquired before release!"); --writers; return true; diff --git a/llvm/include/llvm/Support/TargetParser.h b/llvm/include/llvm/Support/TargetParser.h index 366dd3cf55c6..b11467dcce28 100644 --- a/llvm/include/llvm/Support/TargetParser.h +++ b/llvm/include/llvm/Support/TargetParser.h @@ -177,6 +177,18 @@ StringRef resolveTuneCPUAlias(StringRef TuneCPU, bool IsRV64); } // namespace RISCV +namespace ARM { +struct ParsedBranchProtection { + StringRef Scope; + StringRef Key; + bool BranchTargetEnforcement; +}; + +bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, + StringRef &Err); + +} // namespace ARM + } // namespace llvm #endif diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h index 4c41b88d6043..8d30e8e92755 100644 --- a/llvm/include/llvm/Support/ThreadPool.h +++ b/llvm/include/llvm/Support/ThreadPool.h @@ -36,9 +36,6 @@ namespace llvm { /// for some work to become available. 
class ThreadPool { public: - using TaskTy = std::function<void()>; - using PackagedTaskTy = std::packaged_task<void()>; - /// Construct a pool using the hardware strategy \p S for mapping hardware /// execution resources (threads, cores, CPUs) /// Defaults to using the maximum execution resources in the system, but @@ -51,17 +48,17 @@ public: /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. template <typename Function, typename... Args> - inline std::shared_future<void> async(Function &&F, Args &&... ArgList) { + inline auto async(Function &&F, Args &&...ArgList) { auto Task = std::bind(std::forward<Function>(F), std::forward<Args>(ArgList)...); - return asyncImpl(std::move(Task)); + return async(std::move(Task)); } /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - template <typename Function> - inline std::shared_future<void> async(Function &&F) { - return asyncImpl(std::forward<Function>(F)); + template <typename Func> + auto async(Func &&F) -> std::shared_future<decltype(F())> { + return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F))); } /// Blocking wait for all the threads to complete and the queue to be empty. @@ -74,17 +71,70 @@ public: bool isWorkerThread() const; private: + /// Helpers to create a promise and a callable wrapper of \p Task that sets + /// the result of the promise. Returns the callable and a future to access the + /// result. 
+ template <typename ResTy> + static std::pair<std::function<void()>, std::future<ResTy>> + createTaskAndFuture(std::function<ResTy()> Task) { + std::shared_ptr<std::promise<ResTy>> Promise = + std::make_shared<std::promise<ResTy>>(); + auto F = Promise->get_future(); + return { + [Promise = std::move(Promise), Task]() { Promise->set_value(Task()); }, + std::move(F)}; + } + static std::pair<std::function<void()>, std::future<void>> + createTaskAndFuture(std::function<void()> Task) { + std::shared_ptr<std::promise<void>> Promise = + std::make_shared<std::promise<void>>(); + auto F = Promise->get_future(); + return {[Promise = std::move(Promise), Task]() { + Task(); + Promise->set_value(); + }, + std::move(F)}; + } + bool workCompletedUnlocked() { return !ActiveThreads && Tasks.empty(); } /// Asynchronous submission of a task to the pool. The returned future can be /// used to wait for the task to finish and is *non-blocking* on destruction. - std::shared_future<void> asyncImpl(TaskTy F); + template <typename ResTy> + std::shared_future<ResTy> asyncImpl(std::function<ResTy()> Task) { + +#if LLVM_ENABLE_THREADS + /// Wrap the Task in a std::function<void()> that sets the result of the + /// corresponding future. + auto R = createTaskAndFuture(Task); + + { + // Lock the queue and push the new task + std::unique_lock<std::mutex> LockGuard(QueueLock); + + // Don't allow enqueueing after disabling the pool + assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); + Tasks.push(std::move(R.first)); + } + QueueCondition.notify_one(); + return R.second.share(); + +#else // LLVM_ENABLE_THREADS Disabled + + // Get a Future with launch::deferred execution using std::async + auto Future = std::async(std::launch::deferred, std::move(Task)).share(); + // Wrap the future so that both ThreadPool::wait() can operate and the + // returned future can be sync'ed on. 
+ Tasks.push([Future]() { Future.get(); }); + return Future; +#endif + } /// Threads in flight std::vector<llvm::thread> Threads; /// Tasks waiting for execution in the pool. - std::queue<PackagedTaskTy> Tasks; + std::queue<std::function<void()>> Tasks; /// Locking and signaling for accessing the Tasks queue. std::mutex QueueLock; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index e2d3dbdda88a..1d189c6dea6d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -645,6 +645,13 @@ def extract_vec_elt_combines : GICombineGroup<[ extract_vec_elt_build_vec, extract_all_elts_from_build_vector]>; +def funnel_shift_from_or_shift : GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_OR):$root, + [{ return Helper.matchOrShiftToFunnelShift(*${root}, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }]) +>; + def funnel_shift_to_rotate : GICombineRule< (defs root:$root), (match (wip_match_opcode G_FSHL, G_FSHR):$root, @@ -683,7 +690,8 @@ def bitfield_extract_from_and : GICombineRule< [{ return Helper.matchBitfieldExtractFromAnd(*${root}, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; -def funnel_shift_combines : GICombineGroup<[funnel_shift_to_rotate]>; +def funnel_shift_combines : GICombineGroup<[funnel_shift_from_or_shift, + funnel_shift_to_rotate]>; def bitfield_extract_from_sext_inreg : GICombineRule< (defs root:$root, build_fn_matchinfo:$info), @@ -751,6 +759,84 @@ def redundant_neg_operands: GICombineRule< [{ return Helper.matchRedundantNegOperands(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>; +// Transform (fadd x, (fmul y, z)) -> (fma y, z, x) +// (fadd x, (fmul y, z)) -> (fmad y, z, x) +// Transform (fadd (fmul x, y), z) -> (fma x, y, z) +// (fadd (fmul x, y), z) -> (fmad x, y, z) +def 
combine_fadd_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFAddFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) +// -> (fmad (fpext x), (fpext y), z) +// Transform (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) +// -> (fmad (fpext y), (fpext z), x) +def combine_fadd_fpext_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFAddFpExtFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fadd (fma x, y, (fmul z, u)), v) -> (fma x, y, (fma z, u, v)) +// (fadd (fmad x, y, (fmul z, u)), v) -> (fmad x, y, (fmad z, u, v)) +// Transform (fadd v, (fma x, y, (fmul z, u))) -> (fma x, y, (fma z, u, v)) +// (fadd v, (fmad x, y, (fmul z, u))) -> (fmad x, y, (fmad z, u, v)) +def combine_fadd_fma_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFAddFMAFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fadd (fma x, y, (fpext (fmul u, v))), z) -> +// (fma x, y, (fma (fpext u), (fpext v), z)) +def combine_fadd_fpext_fma_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFAddFpExtFMulToFMadOrFMAAggressive( + *${root}, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fsub (fmul x, y), z) -> (fma x, y, -z) +// -> (fmad x, y, -z) +def combine_fsub_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode 
G_FSUB):$root, + [{ return Helper.matchCombineFSubFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) +// (fsub x, (fneg (fmul, y, z))) -> (fma y, z, x) +def combine_fsub_fneg_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FSUB):$root, + [{ return Helper.matchCombineFSubFNegFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fsub (fpext (fmul x, y)), z) -> +// (fma (fpext x), (fpext y), (fneg z)) +def combine_fsub_fpext_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FSUB):$root, + [{ return Helper.matchCombineFSubFpExtFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + +// Transform (fsub (fneg (fpext (fmul x, y))), z) -> +// (fneg (fma (fpext x), (fpext y), z)) +def combine_fsub_fpext_fneg_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_FSUB):$root, + [{ return Helper.matchCombineFSubFpExtFNegFMulToFMadOrFMA( + *${root}, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -783,6 +869,12 @@ def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, mul_by_neg_one]>; +def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma, + combine_fadd_fpext_fmul_to_fmad_or_fma, combine_fadd_fma_fmul_to_fmad_or_fma, + combine_fadd_fpext_fma_fmul_to_fmad_or_fma, combine_fsub_fmul_to_fmad_or_fma, + combine_fsub_fneg_fmul_to_fmad_or_fma, combine_fsub_fpext_fmul_to_fmad_or_fma, + combine_fsub_fpext_fneg_fmul_to_fmad_or_fma]>; + def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload, combine_indexed_load_store, undef_combines, identity_combines, phi_combines, @@ -799,7 +891,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, truncstore_merge, div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract, constant_fold, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, - and_or_disjoint_mask ]>; + and_or_disjoint_mask, fma_combines]>; // A combine group used to for prelegalizer combiners at -O0. 
The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h index 6e45f8f6fb05..429fcbd81b45 100644 --- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h +++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h @@ -24,22 +24,47 @@ using namespace sampleprof; namespace llvm { namespace sampleprof { +struct ProfiledCallGraphNode; + +struct ProfiledCallGraphEdge { + ProfiledCallGraphEdge(ProfiledCallGraphNode *Source, + ProfiledCallGraphNode *Target, uint64_t Weight) + : Source(Source), Target(Target), Weight(Weight) {} + ProfiledCallGraphNode *Source; + ProfiledCallGraphNode *Target; + uint64_t Weight; + + // The call destination is the only important data here, + // allow to transparently unwrap into it. + operator ProfiledCallGraphNode *() const { return Target; } +}; + struct ProfiledCallGraphNode { - ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {} - StringRef Name; - struct ProfiledCallGraphNodeComparer { - bool operator()(const ProfiledCallGraphNode *L, - const ProfiledCallGraphNode *R) const { - return L->Name < R->Name; + // Sort edges by callee names only since all edges to be compared are from + // same caller. Edge weights are not considered either because for the same + // callee only the edge with the largest weight is added to the edge set. 
+ struct ProfiledCallGraphEdgeComparer { + bool operator()(const ProfiledCallGraphEdge &L, + const ProfiledCallGraphEdge &R) const { + return L.Target->Name < R.Target->Name; } }; - std::set<ProfiledCallGraphNode *, ProfiledCallGraphNodeComparer> Callees; + + using iterator = std::set<ProfiledCallGraphEdge>::iterator; + using const_iterator = std::set<ProfiledCallGraphEdge>::const_iterator; + using edge = ProfiledCallGraphEdge; + using edges = std::set<ProfiledCallGraphEdge, ProfiledCallGraphEdgeComparer>; + + ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {} + + StringRef Name; + edges Edges; }; class ProfiledCallGraph { public: - using iterator = std::set<ProfiledCallGraphNode *>::iterator; + using iterator = std::set<ProfiledCallGraphEdge>::iterator; // Constructor for non-CS profile. ProfiledCallGraph(SampleProfileMap &ProfileMap) { @@ -63,8 +88,9 @@ public: while (!Queue.empty()) { ContextTrieNode *Caller = Queue.front(); Queue.pop(); - // Add calls for context. When AddNodeWithSamplesOnly is true, both caller - // and callee need to have context profile. + FunctionSamples *CallerSamples = Caller->getFunctionSamples(); + + // Add calls for context. // Note that callsite target samples are completely ignored since they can // conflict with the context edges, which are formed by context // compression during profile generation, for cyclic SCCs. This may @@ -74,31 +100,61 @@ public: ContextTrieNode *Callee = &Child.second; addProfiledFunction(ContextTracker.getFuncNameFor(Callee)); Queue.push(Callee); + + // Fetch edge weight from the profile. 
+ uint64_t Weight; + FunctionSamples *CalleeSamples = Callee->getFunctionSamples(); + if (!CalleeSamples || !CallerSamples) { + Weight = 0; + } else { + uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = Callee->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + Weight = std::max(CallsiteCount, CalleeEntryCount); + } + addProfiledCall(ContextTracker.getFuncNameFor(Caller), - ContextTracker.getFuncNameFor(Callee)); + ContextTracker.getFuncNameFor(Callee), Weight); } } } - iterator begin() { return Root.Callees.begin(); } - iterator end() { return Root.Callees.end(); } + iterator begin() { return Root.Edges.begin(); } + iterator end() { return Root.Edges.end(); } ProfiledCallGraphNode *getEntryNode() { return &Root; } void addProfiledFunction(StringRef Name) { if (!ProfiledFunctions.count(Name)) { // Link to synthetic root to make sure every node is reachable // from root. This does not affect SCC order. 
ProfiledFunctions[Name] = ProfiledCallGraphNode(Name); - Root.Callees.insert(&ProfiledFunctions[Name]); + Root.Edges.emplace(&Root, &ProfiledFunctions[Name], 0); } } - void addProfiledCall(StringRef CallerName, StringRef CalleeName) { +private: + void addProfiledCall(StringRef CallerName, StringRef CalleeName, + uint64_t Weight = 0) { assert(ProfiledFunctions.count(CallerName)); auto CalleeIt = ProfiledFunctions.find(CalleeName); - if (CalleeIt == ProfiledFunctions.end()) { + if (CalleeIt == ProfiledFunctions.end()) return; + ProfiledCallGraphEdge Edge(&ProfiledFunctions[CallerName], + &CalleeIt->second, Weight); + auto &Edges = ProfiledFunctions[CallerName].Edges; + auto EdgeIt = Edges.find(Edge); + if (EdgeIt == Edges.end()) { + Edges.insert(Edge); + } else if (EdgeIt->Weight < Edge.Weight) { + // Replace existing call edges with same target but smaller weight. + Edges.erase(EdgeIt); + Edges.insert(Edge); } - ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second); } void addProfiledCalls(const FunctionSamples &Samples) { @@ -107,20 +163,20 @@ public: for (const auto &Sample : Samples.getBodySamples()) { for (const auto &Target : Sample.second.getCallTargets()) { addProfiledFunction(Target.first()); - addProfiledCall(Samples.getFuncName(), Target.first()); + addProfiledCall(Samples.getFuncName(), Target.first(), Target.second); } } for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { for (const auto &InlinedSamples : CallsiteSamples.second) { addProfiledFunction(InlinedSamples.first); - addProfiledCall(Samples.getFuncName(), InlinedSamples.first); + addProfiledCall(Samples.getFuncName(), InlinedSamples.first, + InlinedSamples.second.getEntrySamples()); addProfiledCalls(InlinedSamples.second); } } } -private: ProfiledCallGraphNode Root; StringMap<ProfiledCallGraphNode> ProfiledFunctions; }; @@ -128,12 +184,14 @@ private: } // end namespace sampleprof template <> struct GraphTraits<ProfiledCallGraphNode *> { + using NodeType = 
ProfiledCallGraphNode; using NodeRef = ProfiledCallGraphNode *; - using ChildIteratorType = std::set<ProfiledCallGraphNode *>::iterator; + using EdgeType = NodeType::edge; + using ChildIteratorType = NodeType::const_iterator; static NodeRef getEntryNode(NodeRef PCGN) { return PCGN; } - static ChildIteratorType child_begin(NodeRef N) { return N->Callees.begin(); } - static ChildIteratorType child_end(NodeRef N) { return N->Callees.end(); } + static ChildIteratorType child_begin(NodeRef N) { return N->Edges.begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->Edges.end(); } }; template <> diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h index c13407a44091..6002f0270083 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h @@ -141,7 +141,7 @@ private: AsanDtorKind DestructorKind; }; -// Insert AddressSanitizer (address sanity checking) instrumentation +// Insert AddressSanitizer (address basic correctness checking) instrumentation FunctionPass *createAddressSanitizerFunctionPass( bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false, diff --git a/llvm/include/llvm/Transforms/Scalar/AnnotationRemarks.h b/llvm/include/llvm/Transforms/Scalar/AnnotationRemarks.h index d76b55babc74..45983ad9d571 100644 --- a/llvm/include/llvm/Transforms/Scalar/AnnotationRemarks.h +++ b/llvm/include/llvm/Transforms/Scalar/AnnotationRemarks.h @@ -22,6 +22,7 @@ class Function; struct AnnotationRemarksPass : public PassInfoMixin<AnnotationRemarksPass> { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 72cb606eb51a..3c529abce85a 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h 
+++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -55,7 +55,6 @@ class MDNode; class MemorySSAUpdater; class PHINode; class StoreInst; -class SwitchInst; class TargetLibraryInfo; class TargetTransformInfo; @@ -238,10 +237,6 @@ CallInst *createCallMatchingInvoke(InvokeInst *II); /// This function converts the specified invoek into a normall call. void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr); -/// This function removes the default destination from the specified switch. -void createUnreachableSwitchDefault(SwitchInst *Switch, - DomTreeUpdater *DTU = nullptr); - ///===---------------------------------------------------------------------===// /// Dbg Intrinsic utilities /// diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index 22b2295cc9d7..c233e3dc168e 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -169,6 +169,10 @@ public: /// Called to update debug info associated with the instruction. virtual void updateDebugInfo(Instruction *I) const {} + + /// Return false if a sub-class wants to keep one of the loads/stores + /// after the SSA construction. + virtual bool shouldDelete(Instruction *I) const { return true; } }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h new file mode 100644 index 000000000000..e1f681bbd367 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h @@ -0,0 +1,284 @@ +//===- Transforms/Utils/SampleProfileInference.h ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file provides the interface for the profile inference algorithm, profi. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H +#define LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +class BasicBlock; +class Function; +class MachineBasicBlock; +class MachineFunction; + +namespace afdo_detail { + +template <class BlockT> struct TypeMap {}; +template <> struct TypeMap<BasicBlock> { + using BasicBlockT = BasicBlock; + using FunctionT = Function; +}; +template <> struct TypeMap<MachineBasicBlock> { + using BasicBlockT = MachineBasicBlock; + using FunctionT = MachineFunction; +}; + +} // end namespace afdo_detail + +struct FlowJump; + +/// A wrapper of a binary basic block. +struct FlowBlock { + uint64_t Index; + uint64_t Weight{0}; + bool UnknownWeight{false}; + uint64_t Flow{0}; + bool HasSelfEdge{false}; + std::vector<FlowJump *> SuccJumps; + std::vector<FlowJump *> PredJumps; + + /// Check if it is the entry block in the function. + bool isEntry() const { return PredJumps.empty(); } + + /// Check if it is an exit block in the function. + bool isExit() const { return SuccJumps.empty(); } +}; + +/// A wrapper of a jump between two basic blocks. +struct FlowJump { + uint64_t Source; + uint64_t Target; + uint64_t Flow{0}; + bool IsUnlikely{false}; +}; + +/// A wrapper of binary function with basic blocks and jumps. +struct FlowFunction { + std::vector<FlowBlock> Blocks; + std::vector<FlowJump> Jumps; + /// The index of the entry block. 
+ uint64_t Entry; +}; + +void applyFlowInference(FlowFunction &Func); + +/// Sample profile inference pass. +template <typename BT> class SampleProfileInference { +public: + using BasicBlockT = typename afdo_detail::TypeMap<BT>::BasicBlockT; + using FunctionT = typename afdo_detail::TypeMap<BT>::FunctionT; + using Edge = std::pair<const BasicBlockT *, const BasicBlockT *>; + using BlockWeightMap = DenseMap<const BasicBlockT *, uint64_t>; + using EdgeWeightMap = DenseMap<Edge, uint64_t>; + using BlockEdgeMap = + DenseMap<const BasicBlockT *, SmallVector<const BasicBlockT *, 8>>; + + SampleProfileInference(FunctionT &F, BlockEdgeMap &Successors, + BlockWeightMap &SampleBlockWeights) + : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights) {} + + /// Apply the profile inference algorithm for a given function + void apply(BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights); + +private: + /// Try to infer branch probabilities mimicking implementation of + /// BranchProbabilityInfo. Unlikely taken branches are marked so that the + /// inference algorithm can avoid sending flow along corresponding edges. + void findUnlikelyJumps(const std::vector<const BasicBlockT *> &BasicBlocks, + BlockEdgeMap &Successors, FlowFunction &Func); + + /// Determine whether the block is an exit in the CFG. + bool isExit(const BasicBlockT *BB); + + /// Function. + const FunctionT &F; + + /// Successors for each basic block in the CFG. + BlockEdgeMap &Successors; + + /// Map basic blocks to their sampled weights. + BlockWeightMap &SampleBlockWeights; +}; + +template <typename BT> +void SampleProfileInference<BT>::apply(BlockWeightMap &BlockWeights, + EdgeWeightMap &EdgeWeights) { + // Find all forwards reachable blocks which the inference algorithm will be + // applied on. 
+ df_iterator_default_set<const BasicBlockT *> Reachable; + for (auto *BB : depth_first_ext(&F, Reachable)) + (void)BB /* Mark all reachable blocks */; + + // Find all backwards reachable blocks which the inference algorithm will be + // applied on. + df_iterator_default_set<const BasicBlockT *> InverseReachable; + for (const auto &BB : F) { + // An exit block is a block without any successors. + if (isExit(&BB)) { + for (auto *RBB : inverse_depth_first_ext(&BB, InverseReachable)) + (void)RBB; + } + } + + // Keep a stable order for reachable blocks + DenseMap<const BasicBlockT *, uint64_t> BlockIndex; + std::vector<const BasicBlockT *> BasicBlocks; + BlockIndex.reserve(Reachable.size()); + BasicBlocks.reserve(Reachable.size()); + for (const auto &BB : F) { + if (Reachable.count(&BB) && InverseReachable.count(&BB)) { + BlockIndex[&BB] = BasicBlocks.size(); + BasicBlocks.push_back(&BB); + } + } + + BlockWeights.clear(); + EdgeWeights.clear(); + bool HasSamples = false; + for (const auto *BB : BasicBlocks) { + auto It = SampleBlockWeights.find(BB); + if (It != SampleBlockWeights.end() && It->second > 0) { + HasSamples = true; + BlockWeights[BB] = It->second; + } + } + // Quit early for functions with a single block or ones w/o samples + if (BasicBlocks.size() <= 1 || !HasSamples) { + return; + } + + // Create necessary objects + FlowFunction Func; + Func.Blocks.reserve(BasicBlocks.size()); + // Create FlowBlocks + for (const auto *BB : BasicBlocks) { + FlowBlock Block; + if (SampleBlockWeights.find(BB) != SampleBlockWeights.end()) { + Block.UnknownWeight = false; + Block.Weight = SampleBlockWeights[BB]; + } else { + Block.UnknownWeight = true; + Block.Weight = 0; + } + Block.Index = Func.Blocks.size(); + Func.Blocks.push_back(Block); + } + // Create FlowEdges + for (const auto *BB : BasicBlocks) { + for (auto *Succ : Successors[BB]) { + if (!BlockIndex.count(Succ)) + continue; + FlowJump Jump; + Jump.Source = BlockIndex[BB]; + Jump.Target = BlockIndex[Succ]; + 
Func.Jumps.push_back(Jump); + if (BB == Succ) { + Func.Blocks[BlockIndex[BB]].HasSelfEdge = true; + } + } + } + for (auto &Jump : Func.Jumps) { + Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump); + Func.Blocks[Jump.Target].PredJumps.push_back(&Jump); + } + + // Try to infer probabilities of jumps based on the content of basic block + findUnlikelyJumps(BasicBlocks, Successors, Func); + + // Find the entry block + for (size_t I = 0; I < Func.Blocks.size(); I++) { + if (Func.Blocks[I].isEntry()) { + Func.Entry = I; + break; + } + } + + // Create and apply the inference network model. + applyFlowInference(Func); + + // Extract the resulting weights from the control flow + // All weights are increased by one to avoid propagation errors introduced by + // zero weights. + for (const auto *BB : BasicBlocks) { + BlockWeights[BB] = Func.Blocks[BlockIndex[BB]].Flow; + } + for (auto &Jump : Func.Jumps) { + Edge E = std::make_pair(BasicBlocks[Jump.Source], BasicBlocks[Jump.Target]); + EdgeWeights[E] = Jump.Flow; + } + +#ifndef NDEBUG + // Unreachable blocks and edges should not have a weight. 
+ for (auto &I : BlockWeights) { + assert(Reachable.contains(I.first)); + assert(InverseReachable.contains(I.first)); + } + for (auto &I : EdgeWeights) { + assert(Reachable.contains(I.first.first) && + Reachable.contains(I.first.second)); + assert(InverseReachable.contains(I.first.first) && + InverseReachable.contains(I.first.second)); + } +#endif +} + +template <typename BT> +inline void SampleProfileInference<BT>::findUnlikelyJumps( + const std::vector<const BasicBlockT *> &BasicBlocks, + BlockEdgeMap &Successors, FlowFunction &Func) {} + +template <> +inline void SampleProfileInference<BasicBlock>::findUnlikelyJumps( + const std::vector<const BasicBlockT *> &BasicBlocks, + BlockEdgeMap &Successors, FlowFunction &Func) { + for (auto &Jump : Func.Jumps) { + const auto *BB = BasicBlocks[Jump.Source]; + const auto *Succ = BasicBlocks[Jump.Target]; + const Instruction *TI = BB->getTerminator(); + // Check if a block ends with InvokeInst and mark non-taken branch unlikely. + // In that case block Succ should be a landing pad + if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) { + if (isa<InvokeInst>(TI)) { + Jump.IsUnlikely = true; + } + } + const Instruction *SuccTI = Succ->getTerminator(); + // Check if the target block contains UnreachableInst and mark it unlikely + if (SuccTI->getNumSuccessors() == 0) { + if (isa<UnreachableInst>(SuccTI)) { + Jump.IsUnlikely = true; + } + } + } +} + +template <typename BT> +inline bool SampleProfileInference<BT>::isExit(const BasicBlockT *BB) { + return BB->succ_empty(); +} + +template <> +inline bool SampleProfileInference<BasicBlock>::isExit(const BasicBlock *BB) { + return succ_empty(BB); +} + +} // end namespace llvm +#endif // LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h index 6a2f0acf46f3..175bdde7fd05 100644 --- 
a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -38,6 +38,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/SampleProfileInference.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" namespace llvm { @@ -74,6 +75,8 @@ template <> struct IRTraits<BasicBlock> { } // end namespace afdo_detail +extern cl::opt<bool> SampleProfileUseProfi; + template <typename BT> class SampleProfileLoaderBaseImpl { public: SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName) @@ -142,6 +145,9 @@ protected: ArrayRef<BasicBlockT *> Descendants, PostDominatorTreeT *DomTree); void propagateWeights(FunctionT &F); + void applyProfi(FunctionT &F, BlockEdgeMap &Successors, + BlockWeightMap &SampleBlockWeights, + BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(FunctionT &F); bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount); @@ -150,6 +156,11 @@ protected: bool computeAndPropagateWeights(FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs); + void initWeightPropagation(FunctionT &F, + const DenseSet<GlobalValue::GUID> &InlinedGUIDs); + void + finalizeWeightPropagation(FunctionT &F, + const DenseSet<GlobalValue::GUID> &InlinedGUIDs); void emitCoverageRemarks(FunctionT &F); /// Map basic blocks to their computed weights. @@ -741,50 +752,65 @@ void SampleProfileLoaderBaseImpl<BT>::buildEdges(FunctionT &F) { /// known). template <typename BT> void SampleProfileLoaderBaseImpl<BT>::propagateWeights(FunctionT &F) { - bool Changed = true; - unsigned I = 0; - - // If BB weight is larger than its corresponding loop's header BB weight, - // use the BB weight to replace the loop header BB weight. 
- for (auto &BI : F) { - BasicBlockT *BB = &BI; - LoopT *L = LI->getLoopFor(BB); - if (!L) { - continue; + // Flow-based profile inference is only usable with BasicBlock instantiation + // of SampleProfileLoaderBaseImpl. + if (SampleProfileUseProfi) { + // Prepare block sample counts for inference. + BlockWeightMap SampleBlockWeights; + for (const auto &BI : F) { + ErrorOr<uint64_t> Weight = getBlockWeight(&BI); + if (Weight) + SampleBlockWeights[&BI] = Weight.get(); } - BasicBlockT *Header = L->getHeader(); - if (Header && BlockWeights[BB] > BlockWeights[Header]) { - BlockWeights[Header] = BlockWeights[BB]; + // Fill in BlockWeights and EdgeWeights using an inference algorithm. + applyProfi(F, Successors, SampleBlockWeights, BlockWeights, EdgeWeights); + } else { + bool Changed = true; + unsigned I = 0; + + // If BB weight is larger than its corresponding loop's header BB weight, + // use the BB weight to replace the loop header BB weight. + for (auto &BI : F) { + BasicBlockT *BB = &BI; + LoopT *L = LI->getLoopFor(BB); + if (!L) { + continue; + } + BasicBlockT *Header = L->getHeader(); + if (Header && BlockWeights[BB] > BlockWeights[Header]) { + BlockWeights[Header] = BlockWeights[BB]; + } } - } - // Before propagation starts, build, for each block, a list of - // unique predecessors and successors. This is necessary to handle - // identical edges in multiway branches. Since we visit all blocks and all - // edges of the CFG, it is cleaner to build these lists once at the start - // of the pass. - buildEdges(F); + // Propagate until we converge or we go past the iteration limit. + while (Changed && I++ < SampleProfileMaxPropagateIterations) { + Changed = propagateThroughEdges(F, false); + } - // Propagate until we converge or we go past the iteration limit. - while (Changed && I++ < SampleProfileMaxPropagateIterations) { - Changed = propagateThroughEdges(F, false); - } + // The first propagation propagates BB counts from annotated BBs to unknown + // BBs. 
The 2nd propagation pass resets edge weights, and uses all BB + weights to propagate edge weights. + VisitedEdges.clear(); + Changed = true; + while (Changed && I++ < SampleProfileMaxPropagateIterations) { + Changed = propagateThroughEdges(F, false); + } - // The first propagation propagates BB counts from annotated BBs to unknown - // BBs. The 2nd propagation pass resets edges weights, and use all BB weights - // to propagate edge weights. - VisitedEdges.clear(); - Changed = true; - while (Changed && I++ < SampleProfileMaxPropagateIterations) { - Changed = propagateThroughEdges(F, false); + // The 3rd propagation pass allows adjusting annotated BB weights that are + // obviously wrong. + Changed = true; + while (Changed && I++ < SampleProfileMaxPropagateIterations) { + Changed = propagateThroughEdges(F, true); + } } +} - // The 3rd propagation pass allows adjust annotated BB weights that are - // obviously wrong. - Changed = true; - while (Changed && I++ < SampleProfileMaxPropagateIterations) { - Changed = propagateThroughEdges(F, true); - } +template <typename BT> +void SampleProfileLoaderBaseImpl<BT>::applyProfi( + FunctionT &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights, + BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights) { + auto Infer = SampleProfileInference<BT>(F, Successors, SampleBlockWeights); + Infer.apply(BlockWeights, EdgeWeights); } /// Generate branch weight metadata for all branches in \p F. @@ -842,26 +868,64 @@ bool SampleProfileLoaderBaseImpl<BT>::computeAndPropagateWeights( Changed |= computeBlockWeights(F); if (Changed) { - // Add an entry count to the function using the samples gathered at the - // function entry. - // Sets the GUIDs that are inlined in the profiled binary. This is used - // for ThinLink to make correct liveness analysis, and also make the IR - // match the profiled binary before annotation. 
- getFunction(F).setEntryCount( - ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real), - &InlinedGUIDs); + // Initialize propagation. + initWeightPropagation(F, InlinedGUIDs); + // Propagate weights to all edges. + propagateWeights(F); + + // Post-process propagated weights. + finalizeWeightPropagation(F, InlinedGUIDs); + } + + return Changed; +} + +template <typename BT> +void SampleProfileLoaderBaseImpl<BT>::initWeightPropagation( + FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) { + // Add an entry count to the function using the samples gathered at the + // function entry. + // Sets the GUIDs that are inlined in the profiled binary. This is used + // for ThinLink to make correct liveness analysis, and also make the IR + // match the profiled binary before annotation. + getFunction(F).setEntryCount( + ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real), + &InlinedGUIDs); + + if (!SampleProfileUseProfi) { // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); // Find equivalence classes. findEquivalenceClasses(F); - - // Propagate weights to all edges. - propagateWeights(F); } - return Changed; + // Before propagation starts, build, for each block, a list of + // unique predecessors and successors. This is necessary to handle + // identical edges in multiway branches. Since we visit all blocks and all + // edges of the CFG, it is cleaner to build these lists once at the start + // of the pass. + buildEdges(F); +} + +template <typename BT> +void SampleProfileLoaderBaseImpl<BT>::finalizeWeightPropagation( + FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) { + // If we utilize a flow-based count inference, then we trust the computed + // counts and set the entry count as computed by the algorithm. This is + // primarily done to sync the counts produced by profi and BFI inference, + // which uses the entry count for mass propagation. 
+ // If profi produces a zero-value for the entry count, we fall back to + // Samples->getHeadSamples() + 1 to avoid functions with zero count. + if (SampleProfileUseProfi) { + const BasicBlockT *EntryBB = getEntryBB(&F); + if (BlockWeights[EntryBB] > 0) { + getFunction(F).setEntryCount( + ProfileCount(BlockWeights[EntryBB], Function::PCT_Real), + &InlinedGUIDs); + } + } } template <typename BT> |