Diffstat (limited to 'llvm/lib')
41 files changed, 2129 insertions, 1754 deletions
| diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index e852d663c6b4..e439c94a7325 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -2059,12 +2059,13 @@ char BasicAAWrapperPass::ID = 0;  void BasicAAWrapperPass::anchor() {}  INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa", -                      "Basic Alias Analysis (stateless AA impl)", false, true) +                      "Basic Alias Analysis (stateless AA impl)", true, true)  INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PhiValuesWrapperPass)  INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa", -                    "Basic Alias Analysis (stateless AA impl)", false, true) +                    "Basic Alias Analysis (stateless AA impl)", true, true)  FunctionPass *llvm::createBasicAAWrapperPass() {    return new BasicAAWrapperPass(); diff --git a/llvm/lib/Target/Hexagon/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index 0cb35dc98819..437a6b030096 100644 --- a/llvm/lib/Target/Hexagon/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -8,8 +8,6 @@  //  // Target-independent, SSA-based data flow graph for register data flow (RDF).  // -#include "RDFGraph.h" -#include "RDFRegisters.h"  #include "llvm/ADT/BitVector.h"  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SetVector.h" @@ -20,6 +18,8 @@  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/TargetInstrInfo.h"  #include "llvm/CodeGen/TargetLowering.h"  #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -753,8 +753,10 @@ RegisterSet DataFlowGraph::getLandingPadLiveIns() const {    const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();    if (RegisterId R = TLI.getExceptionPointerRegister(PF))      LR.insert(RegisterRef(R)); -  if (RegisterId R = TLI.getExceptionSelectorRegister(PF)) -    LR.insert(RegisterRef(R)); +  if (!isFuncletEHPersonality(classifyEHPersonality(PF))) { +    if (RegisterId R = TLI.getExceptionSelectorRegister(PF)) +      LR.insert(RegisterRef(R)); +  }    return LR;  } diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index e2c007c9d01a..0bcd27f8ea45 100644 --- a/llvm/lib/Target/Hexagon/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -22,9 +22,6 @@  // and Embedded Architectures and Compilers", 8 (4),  // <10.1145/2086696.2086706>. 
<hal-00647369>  // -#include "RDFLiveness.h" -#include "RDFGraph.h" -#include "RDFRegisters.h"  #include "llvm/ADT/BitVector.h"  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SetVector.h" @@ -33,6 +30,9 @@  #include "llvm/CodeGen/MachineDominators.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/TargetRegisterInfo.h"  #include "llvm/MC/LaneBitmask.h"  #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index b5675784e34b..bd8661816e71 100644 --- a/llvm/lib/Target/Hexagon/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -6,11 +6,11 @@  //  //===----------------------------------------------------------------------===// -#include "RDFRegisters.h"  #include "llvm/ADT/BitVector.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/TargetRegisterInfo.h"  #include "llvm/MC/LaneBitmask.h"  #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 297b11de17a9..fa2f0777897b 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -147,8 +147,17 @@ void llvm::computeLTOCacheKey(    // Include the hash for the current module    auto ModHash = Index.getModuleHash(ModuleID);    Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash))); + +  std::vector<uint64_t> ExportsGUID; +  ExportsGUID.reserve(ExportList.size());    for (const auto &VI : ExportList) {      auto GUID = VI.getGUID(); +    ExportsGUID.push_back(GUID); +  } + +  // Sort the export list elements GUIDs. +  llvm::sort(ExportsGUID); +  for (uint64_t GUID : ExportsGUID) {      // The export list can impact the internalization, be conservative here      Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));    } @@ -156,12 +165,23 @@ void llvm::computeLTOCacheKey(    // Include the hash for every module we import functions from. The set of    // imported symbols for each module may affect code generation and is    // sensitive to link order, so include that as well. 
-  for (auto &Entry : ImportList) { -    auto ModHash = Index.getModuleHash(Entry.first()); +  using ImportMapIteratorTy = FunctionImporter::ImportMapTy::const_iterator; +  std::vector<ImportMapIteratorTy> ImportModulesVector; +  ImportModulesVector.reserve(ImportList.size()); + +  for (ImportMapIteratorTy It = ImportList.begin(); It != ImportList.end(); +       ++It) { +    ImportModulesVector.push_back(It); +  } +  llvm::sort(ImportModulesVector, +             [](const ImportMapIteratorTy &Lhs, const ImportMapIteratorTy &Rhs) +                 -> bool { return Lhs->getKey() < Rhs->getKey(); }); +  for (const ImportMapIteratorTy &EntryIt : ImportModulesVector) { +    auto ModHash = Index.getModuleHash(EntryIt->first());      Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash))); -    AddUint64(Entry.second.size()); -    for (auto &Fn : Entry.second) +    AddUint64(EntryIt->second.size()); +    for (auto &Fn : EntryIt->second)        AddUint64(Fn);    } diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index 6f5f58554d09..d407edfbd966 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -304,7 +304,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,    LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "                      << val << '\n'); -  SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64); +  SDValue NVal = CurDAG->getConstant(val, DL, LD->getValueType(0));    // After replacement, the current node is dead, we need to    // go backward one step to make iterator still work diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a9fb04f20d1c..6daeb3b4b63b 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -600,6 +600,38 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,                                bool CheckPointer, bool SeenPointer) {    if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {      TypeId = DIToIdMap[Ty]; + +    // To handle the case like the following: +    //    struct t; +    //    typedef struct t _t; +    //    struct s1 { _t *c; }; +    //    int test1(struct s1 *arg) { ... } +    // +    //    struct t { int a; int b; }; +    //    struct s2 { _t c; } +    //    int test2(struct s2 *arg) { ... } +    // +    // During traversing test1() argument, "_t" is recorded +    // in DIToIdMap and a forward declaration fixup is created +    // for "struct t" to avoid pointee type traversal. +    // +    // During traversing test2() argument, even if we see "_t" is +    // already defined, we should keep moving to eventually +    // bring in types for "struct t". Otherwise, the "struct s2" +    // definition won't be correct. 
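(Aside on the llvm/lib/LTO/LTO.cpp hunk above: computeLTOCacheKey now copies the export GUIDs and the import-map iterators into vectors and sorts them before hashing, so the cache key no longer depends on hash-map iteration order or link order. The standalone sketch below shows the same sort-then-hash pattern with ordinary STL containers; the Hasher type and its FNV-1a mixing are illustrative stand-ins, not the hasher LTO actually uses.)

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for the incremental hasher (not llvm::SHA1).
struct Hasher {
  uint64_t State = 1469598103934665603ULL;        // FNV-1a offset basis
  void addBytes(const void *Data, size_t Size) {
    const unsigned char *P = static_cast<const unsigned char *>(Data);
    for (size_t I = 0; I != Size; ++I) {
      State ^= P[I];
      State *= 1099511628211ULL;                  // FNV-1a prime
    }
  }
};

// Hash a map's contents independently of its iteration order: collect
// iterators, sort them by key, then feed key/value pairs to the hasher.
void hashMapDeterministically(
    const std::unordered_map<std::string, uint64_t> &ModuleHashes, Hasher &H) {
  using It = std::unordered_map<std::string, uint64_t>::const_iterator;
  std::vector<It> Entries;
  Entries.reserve(ModuleHashes.size());
  for (It I = ModuleHashes.begin(), E = ModuleHashes.end(); I != E; ++I)
    Entries.push_back(I);
  std::sort(Entries.begin(), Entries.end(),
            [](It L, It R) { return L->first < R->first; });
  for (It I : Entries) {
    H.addBytes(I->first.data(), I->first.size());
    H.addBytes(&I->second, sizeof(I->second));
  }
}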
+    if (Ty && (!CheckPointer || !SeenPointer)) { +      if (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) { +        unsigned Tag = DTy->getTag(); +        if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || +            Tag == dwarf::DW_TAG_volatile_type || +            Tag == dwarf::DW_TAG_restrict_type) { +          uint32_t TmpTypeId; +          visitTypeEntry(DTy->getBaseType(), TmpTypeId, CheckPointer, +                         SeenPointer); +        } +      } +    } +      return;    } diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 886034d9601a..f1fe51f5e54f 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -12,9 +12,6 @@  #include "HexagonInstrInfo.h"  #include "HexagonSubtarget.h"  #include "MCTargetDesc/HexagonBaseInfo.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h"  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/DenseSet.h"  #include "llvm/ADT/StringRef.h" @@ -27,6 +24,9 @@  #include "llvm/CodeGen/MachineInstrBuilder.h"  #include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/TargetSubtargetInfo.h"  #include "llvm/InitializePasses.h"  #include "llvm/MC/MCInstrDesc.h" diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 517ad1c6ee7b..f26e23befde2 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -11,9 +11,6 @@  #include "MCTargetDesc/HexagonBaseInfo.h"  #include "RDFCopy.h"  #include "RDFDeadCode.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h"  #include "llvm/ADT/DenseMap.h"  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SetVector.h" @@ -24,6 +21,9 @@  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/InitializePasses.h"  #include "llvm/Pass.h"  #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index a9d39fd4b2dc..34d58f0a7a23 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -11,13 +11,13 @@  //===----------------------------------------------------------------------===//  #include "RDFCopy.h" -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h"  #include "llvm/CodeGen/MachineDominators.h"  #include "llvm/CodeGen/MachineInstr.h"  #include "llvm/CodeGen/MachineOperand.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/TargetOpcodes.h"  #include "llvm/CodeGen/TargetRegisterInfo.h"  #include "llvm/MC/MCRegisterInfo.h" diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h index 1450ab884849..99b18a75d8c2 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.h +++ b/llvm/lib/Target/Hexagon/RDFCopy.h @@ -9,9 +9,9 @@  #ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H  #define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H -#include "RDFGraph.h" -#include "RDFLiveness.h" -#include "RDFRegisters.h" +#include 
"llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/CodeGen/RDFRegisters.h"  #include "llvm/CodeGen/MachineFunction.h"  #include <map>  #include <vector> diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp index af86c7b1956b..5a98debd3c00 100644 --- a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp +++ b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -9,13 +9,13 @@  // RDF-based generic dead code elimination.  #include "RDFDeadCode.h" -#include "RDFGraph.h" -#include "RDFLiveness.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/CodeGen/MachineBasicBlock.h"  #include "llvm/CodeGen/MachineFunction.h"  #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h"  #include "llvm/Support/Debug.h"  #include <queue> diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.h b/llvm/lib/Target/Hexagon/RDFDeadCode.h index 7f91977e1d6c..859c8161d355 100644 --- a/llvm/lib/Target/Hexagon/RDFDeadCode.h +++ b/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -23,8 +23,8 @@  #ifndef RDF_DEADCODE_H  #define RDF_DEADCODE_H -#include "RDFGraph.h" -#include "RDFLiveness.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h"  #include "llvm/ADT/SetVector.h"  namespace llvm { diff --git a/llvm/lib/Target/Hexagon/RDFGraph.h b/llvm/lib/Target/Hexagon/RDFGraph.h deleted file mode 100644 index 585f43e116f9..000000000000 --- a/llvm/lib/Target/Hexagon/RDFGraph.h +++ /dev/null @@ -1,968 +0,0 @@ -//===- RDFGraph.h -----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Target-independent, SSA-based data flow graph for register data flow (RDF) -// for a non-SSA program representation (e.g. post-RA machine code). -// -// -// *** Introduction -// -// The RDF graph is a collection of nodes, each of which denotes some element -// of the program. There are two main types of such elements: code and refe- -// rences. Conceptually, "code" is something that represents the structure -// of the program, e.g. basic block or a statement, while "reference" is an -// instance of accessing a register, e.g. a definition or a use. Nodes are -// connected with each other based on the structure of the program (such as -// blocks, instructions, etc.), and based on the data flow (e.g. reaching -// definitions, reached uses, etc.). The single-reaching-definition principle -// of SSA is generally observed, although, due to the non-SSA representation -// of the program, there are some differences between the graph and a "pure" -// SSA representation. -// -// -// *** Implementation remarks -// -// Since the graph can contain a large number of nodes, memory consumption -// was one of the major design considerations. As a result, there is a single -// base class NodeBase which defines all members used by all possible derived -// classes. The members are arranged in a union, and a derived class cannot -// add any data members of its own. Each derived class only defines the -// functional interface, i.e. member functions. NodeBase must be a POD, -// which implies that all of its members must also be PODs. 
-// Since nodes need to be connected with other nodes, pointers have been -// replaced with 32-bit identifiers: each node has an id of type NodeId. -// There are mapping functions in the graph that translate between actual -// memory addresses and the corresponding identifiers. -// A node id of 0 is equivalent to nullptr. -// -// -// *** Structure of the graph -// -// A code node is always a collection of other nodes. For example, a code -// node corresponding to a basic block will contain code nodes corresponding -// to instructions. In turn, a code node corresponding to an instruction will -// contain a list of reference nodes that correspond to the definitions and -// uses of registers in that instruction. The members are arranged into a -// circular list, which is yet another consequence of the effort to save -// memory: for each member node it should be possible to obtain its owner, -// and it should be possible to access all other members. There are other -// ways to accomplish that, but the circular list seemed the most natural. -// -// +- CodeNode -+ -// |            | <---------------------------------------------------+ -// +-+--------+-+                                                     | -//   |FirstM  |LastM                                                  | -//   |        +-------------------------------------+                 | -//   |                                              |                 | -//   V                                              V                 | -//  +----------+ Next +----------+ Next       Next +----------+ Next  | -//  |          |----->|          |-----> ... ----->|          |----->-+ -//  +- Member -+      +- Member -+                 +- Member -+ -// -// The order of members is such that related reference nodes (see below) -// should be contiguous on the member list. -// -// A reference node is a node that encapsulates an access to a register, -// in other words, data flowing into or out of a register. There are two -// major kinds of reference nodes: defs and uses. A def node will contain -// the id of the first reached use, and the id of the first reached def. -// Each def and use will contain the id of the reaching def, and also the -// id of the next reached def (for def nodes) or use (for use nodes). -// The "next node sharing the same reaching def" is denoted as "sibling". -// In summary: -// - Def node contains: reaching def, sibling, first reached def, and first -// reached use. -// - Use node contains: reaching def and sibling. -// -// +-- DefNode --+ -// | R2 = ...    | <---+--------------------+ -// ++---------+--+     |                    | -//  |Reached  |Reached |                    | -//  |Def      |Use     |                    | -//  |         |        |Reaching            |Reaching -//  |         V        |Def                 |Def -//  |      +-- UseNode --+ Sib  +-- UseNode --+ Sib       Sib -//  |      | ... = R2    |----->| ... = R2    |----> ... ----> 0 -//  |      +-------------+      +-------------+ -//  V -// +-- DefNode --+ Sib -// | R2 = ...    |----> ... -// ++---------+--+ -//  |         | -//  |         | -// ...       ... -// -// To get a full picture, the circular lists connecting blocks within a -// function, instructions within a block, etc. should be superimposed with -// the def-def, def-use links shown above. 
-// To illustrate this, consider a small example in a pseudo-assembly: -// foo: -//   add r2, r0, r1   ; r2 = r0+r1 -//   addi r0, r2, 1   ; r0 = r2+1 -//   ret r0           ; return value in r0 -// -// The graph (in a format used by the debugging functions) would look like: -// -//   DFG dump:[ -//   f1: Function foo -//   b2: === %bb.0 === preds(0), succs(0): -//   p3: phi [d4<r0>(,d12,u9):] -//   p5: phi [d6<r1>(,,u10):] -//   s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):] -//   s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):] -//   s14: ret [u15<r0>(d12):] -//   ] -// -// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the -// kind of the node (i.e. f - function, b - basic block, p - phi, s - state- -// ment, d - def, u - use). -// The format of a def node is: -//   dN<R>(rd,d,u):sib, -// where -//   N   - numeric node id, -//   R   - register being defined -//   rd  - reaching def, -//   d   - reached def, -//   u   - reached use, -//   sib - sibling. -// The format of a use node is: -//   uN<R>[!](rd):sib, -// where -//   N   - numeric node id, -//   R   - register being used, -//   rd  - reaching def, -//   sib - sibling. -// Possible annotations (usually preceding the node id): -//   +   - preserving def, -//   ~   - clobbering def, -//   "   - shadow ref (follows the node id), -//   !   - fixed register (appears after register name). -// -// The circular lists are not explicit in the dump. -// -// -// *** Node attributes -// -// NodeBase has a member "Attrs", which is the primary way of determining -// the node's characteristics. The fields in this member decide whether -// the node is a code node or a reference node (i.e. node's "type"), then -// within each type, the "kind" determines what specifically this node -// represents. The remaining bits, "flags", contain additional information -// that is even more detailed than the "kind". -// CodeNode's kinds are: -// - Phi:   Phi node, members are reference nodes. -// - Stmt:  Statement, members are reference nodes. -// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt). -// - Func:  The whole function. The members are basic block nodes. -// RefNode's kinds are: -// - Use. -// - Def. -// -// Meaning of flags: -// - Preserving: applies only to defs. A preserving def is one that can -//   preserve some of the original bits among those that are included in -//   the register associated with that def. For example, if R0 is a 32-bit -//   register, but a def can only change the lower 16 bits, then it will -//   be marked as preserving. -// - Shadow: a reference that has duplicates holding additional reaching -//   defs (see more below). -// - Clobbering: applied only to defs, indicates that the value generated -//   by this def is unspecified. A typical example would be volatile registers -//   after function calls. -// - Fixed: the register in this def/use cannot be replaced with any other -//   register. A typical case would be a parameter register to a call, or -//   the register with the return value from a function. -// - Undef: the register in this reference the register is assumed to have -//   no pre-existing value, even if it appears to be reached by some def. -//   This is typically used to prevent keeping registers artificially live -//   in cases when they are defined via predicated instructions. For example: -//     r0 = add-if-true cond, r10, r11                (1) -//     r0 = add-if-false cond, r12, r13, implicit r0  (2) -//     ... 
= r0                                       (3) -//   Before (1), r0 is not intended to be live, and the use of r0 in (3) is -//   not meant to be reached by any def preceding (1). However, since the -//   defs in (1) and (2) are both preserving, these properties alone would -//   imply that the use in (3) may indeed be reached by some prior def. -//   Adding Undef flag to the def in (1) prevents that. The Undef flag -//   may be applied to both defs and uses. -// - Dead: applies only to defs. The value coming out of a "dead" def is -//   assumed to be unused, even if the def appears to be reaching other defs -//   or uses. The motivation for this flag comes from dead defs on function -//   calls: there is no way to determine if such a def is dead without -//   analyzing the target's ABI. Hence the graph should contain this info, -//   as it is unavailable otherwise. On the other hand, a def without any -//   uses on a typical instruction is not the intended target for this flag. -// -// *** Shadow references -// -// It may happen that a super-register can have two (or more) non-overlapping -// sub-registers. When both of these sub-registers are defined and followed -// by a use of the super-register, the use of the super-register will not -// have a unique reaching def: both defs of the sub-registers need to be -// accounted for. In such cases, a duplicate use of the super-register is -// added and it points to the extra reaching def. Both uses are marked with -// a flag "shadow". Example: -// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap: -//   set r0, 1        ; r0 = 1 -//   set r1, 1        ; r1 = 1 -//   addi t1, t0, 1   ; t1 = t0+1 -// -// The DFG: -//   s1: set [d2<r0>(,,u9):] -//   s3: set [d4<r1>(,,u10):] -//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):] -// -// The statement s5 has two use nodes for t0: u7" and u9". The quotation -// mark " indicates that the node is a shadow. -// - -#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H -#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H - -#include "RDFRegisters.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/MC/LaneBitmask.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/MathExtras.h" -#include <cassert> -#include <cstdint> -#include <cstring> -#include <map> -#include <set> -#include <unordered_map> -#include <utility> -#include <vector> - -// RDF uses uint32_t to refer to registers. This is to ensure that the type -// size remains specific. In other places, registers are often stored using -// unsigned. 
-static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal"); - -namespace llvm { - -class MachineBasicBlock; -class MachineDominanceFrontier; -class MachineDominatorTree; -class MachineFunction; -class MachineInstr; -class MachineOperand; -class raw_ostream; -class TargetInstrInfo; -class TargetRegisterInfo; - -namespace rdf { - -  using NodeId = uint32_t; - -  struct DataFlowGraph; - -  struct NodeAttrs { -    enum : uint16_t { -      None          = 0x0000,   // Nothing - -      // Types: 2 bits -      TypeMask      = 0x0003, -      Code          = 0x0001,   // 01, Container -      Ref           = 0x0002,   // 10, Reference - -      // Kind: 3 bits -      KindMask      = 0x0007 << 2, -      Def           = 0x0001 << 2,  // 001 -      Use           = 0x0002 << 2,  // 010 -      Phi           = 0x0003 << 2,  // 011 -      Stmt          = 0x0004 << 2,  // 100 -      Block         = 0x0005 << 2,  // 101 -      Func          = 0x0006 << 2,  // 110 - -      // Flags: 7 bits for now -      FlagMask      = 0x007F << 5, -      Shadow        = 0x0001 << 5,  // 0000001, Has extra reaching defs. -      Clobbering    = 0x0002 << 5,  // 0000010, Produces unspecified values. -      PhiRef        = 0x0004 << 5,  // 0000100, Member of PhiNode. -      Preserving    = 0x0008 << 5,  // 0001000, Def can keep original bits. -      Fixed         = 0x0010 << 5,  // 0010000, Fixed register. -      Undef         = 0x0020 << 5,  // 0100000, Has no pre-existing value. -      Dead          = 0x0040 << 5,  // 1000000, Does not define a value. -    }; - -    static uint16_t type(uint16_t T)  { return T & TypeMask; } -    static uint16_t kind(uint16_t T)  { return T & KindMask; } -    static uint16_t flags(uint16_t T) { return T & FlagMask; } - -    static uint16_t set_type(uint16_t A, uint16_t T) { -      return (A & ~TypeMask) | T; -    } - -    static uint16_t set_kind(uint16_t A, uint16_t K) { -      return (A & ~KindMask) | K; -    } - -    static uint16_t set_flags(uint16_t A, uint16_t F) { -      return (A & ~FlagMask) | F; -    } - -    // Test if A contains B. -    static bool contains(uint16_t A, uint16_t B) { -      if (type(A) != Code) -        return false; -      uint16_t KB = kind(B); -      switch (kind(A)) { -        case Func: -          return KB == Block; -        case Block: -          return KB == Phi || KB == Stmt; -        case Phi: -        case Stmt: -          return type(B) == Ref; -      } -      return false; -    } -  }; - -  struct BuildOptions { -    enum : unsigned { -      None          = 0x00, -      KeepDeadPhis  = 0x01,   // Do not remove dead phis during build. -    }; -  }; - -  template <typename T> struct NodeAddr { -    NodeAddr() = default; -    NodeAddr(T A, NodeId I) : Addr(A), Id(I) {} - -    // Type cast (casting constructor). The reason for having this class -    // instead of std::pair. -    template <typename S> NodeAddr(const NodeAddr<S> &NA) -      : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {} - -    bool operator== (const NodeAddr<T> &NA) const { -      assert((Addr == NA.Addr) == (Id == NA.Id)); -      return Addr == NA.Addr; -    } -    bool operator!= (const NodeAddr<T> &NA) const { -      return !operator==(NA); -    } - -    T Addr = nullptr; -    NodeId Id = 0; -  }; - -  struct NodeBase; - -  // Fast memory allocation and translation between node id and node address. -  // This is really the same idea as the one underlying the "bump pointer -  // allocator", the difference being in the translation. 
A node id is -  // composed of two components: the index of the block in which it was -  // allocated, and the index within the block. With the default settings, -  // where the number of nodes per block is 4096, the node id (minus 1) is: -  // -  // bit position:                11             0 -  // +----------------------------+--------------+ -  // | Index of the block         |Index in block| -  // +----------------------------+--------------+ -  // -  // The actual node id is the above plus 1, to avoid creating a node id of 0. -  // -  // This method significantly improved the build time, compared to using maps -  // (std::unordered_map or DenseMap) to translate between pointers and ids. -  struct NodeAllocator { -    // Amount of storage for a single node. -    enum { NodeMemSize = 32 }; - -    NodeAllocator(uint32_t NPB = 4096) -        : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)), -          IndexMask((1 << BitsPerIndex)-1) { -      assert(isPowerOf2_32(NPB)); -    } - -    NodeBase *ptr(NodeId N) const { -      uint32_t N1 = N-1; -      uint32_t BlockN = N1 >> BitsPerIndex; -      uint32_t Offset = (N1 & IndexMask) * NodeMemSize; -      return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset); -    } - -    NodeId id(const NodeBase *P) const; -    NodeAddr<NodeBase*> New(); -    void clear(); - -  private: -    void startNewBlock(); -    bool needNewBlock(); - -    uint32_t makeId(uint32_t Block, uint32_t Index) const { -      // Add 1 to the id, to avoid the id of 0, which is treated as "null". -      return ((Block << BitsPerIndex) | Index) + 1; -    } - -    const uint32_t NodesPerBlock; -    const uint32_t BitsPerIndex; -    const uint32_t IndexMask; -    char *ActiveEnd = nullptr; -    std::vector<char*> Blocks; -    using AllocatorTy = BumpPtrAllocatorImpl<MallocAllocator, 65536>; -    AllocatorTy MemPool; -  }; - -  using RegisterSet = std::set<RegisterRef>; - -  struct TargetOperandInfo { -    TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} -    virtual ~TargetOperandInfo() = default; - -    virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const; -    virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const; -    virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const; - -    const TargetInstrInfo &TII; -  }; - -  // Packed register reference. Only used for storage. -  struct PackedRegisterRef { -    RegisterId Reg; -    uint32_t MaskId; -  }; - -  struct LaneMaskIndex : private IndexedSet<LaneBitmask> { -    LaneMaskIndex() = default; - -    LaneBitmask getLaneMaskForIndex(uint32_t K) const { -      return K == 0 ? LaneBitmask::getAll() : get(K); -    } - -    uint32_t getIndexForLaneMask(LaneBitmask LM) { -      assert(LM.any()); -      return LM.all() ? 0 : insert(LM); -    } - -    uint32_t getIndexForLaneMask(LaneBitmask LM) const { -      assert(LM.any()); -      return LM.all() ? 0 : find(LM); -    } -  }; - -  struct NodeBase { -  public: -    // Make sure this is a POD. -    NodeBase() = default; - -    uint16_t getType()  const { return NodeAttrs::type(Attrs); } -    uint16_t getKind()  const { return NodeAttrs::kind(Attrs); } -    uint16_t getFlags() const { return NodeAttrs::flags(Attrs); } -    NodeId   getNext()  const { return Next; } - -    uint16_t getAttrs() const { return Attrs; } -    void setAttrs(uint16_t A) { Attrs = A; } -    void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); } - -    // Insert node NA after "this" in the circular chain. 
-    void append(NodeAddr<NodeBase*> NA); - -    // Initialize all members to 0. -    void init() { memset(this, 0, sizeof *this); } - -    void setNext(NodeId N) { Next = N; } - -  protected: -    uint16_t Attrs; -    uint16_t Reserved; -    NodeId Next;                // Id of the next node in the circular chain. -    // Definitions of nested types. Using anonymous nested structs would make -    // this class definition clearer, but unnamed structs are not a part of -    // the standard. -    struct Def_struct  { -      NodeId DD, DU;          // Ids of the first reached def and use. -    }; -    struct PhiU_struct  { -      NodeId PredB;           // Id of the predecessor block for a phi use. -    }; -    struct Code_struct { -      void *CP;               // Pointer to the actual code. -      NodeId FirstM, LastM;   // Id of the first member and last. -    }; -    struct Ref_struct { -      NodeId RD, Sib;         // Ids of the reaching def and the sibling. -      union { -        Def_struct Def; -        PhiU_struct PhiU; -      }; -      union { -        MachineOperand *Op;   // Non-phi refs point to a machine operand. -        PackedRegisterRef PR; // Phi refs store register info directly. -      }; -    }; - -    // The actual payload. -    union { -      Ref_struct Ref; -      Code_struct Code; -    }; -  }; -  // The allocator allocates chunks of 32 bytes for each node. The fact that -  // each node takes 32 bytes in memory is used for fast translation between -  // the node id and the node address. -  static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, -        "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); - -  using NodeList = SmallVector<NodeAddr<NodeBase *>, 4>; -  using NodeSet = std::set<NodeId>; - -  struct RefNode : public NodeBase { -    RefNode() = default; - -    RegisterRef getRegRef(const DataFlowGraph &G) const; - -    MachineOperand &getOp() { -      assert(!(getFlags() & NodeAttrs::PhiRef)); -      return *Ref.Op; -    } - -    void setRegRef(RegisterRef RR, DataFlowGraph &G); -    void setRegRef(MachineOperand *Op, DataFlowGraph &G); - -    NodeId getReachingDef() const { -      return Ref.RD; -    } -    void setReachingDef(NodeId RD) { -      Ref.RD = RD; -    } - -    NodeId getSibling() const { -      return Ref.Sib; -    } -    void setSibling(NodeId Sib) { -      Ref.Sib = Sib; -    } - -    bool isUse() const { -      assert(getType() == NodeAttrs::Ref); -      return getKind() == NodeAttrs::Use; -    } - -    bool isDef() const { -      assert(getType() == NodeAttrs::Ref); -      return getKind() == NodeAttrs::Def; -    } - -    template <typename Predicate> -    NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly, -        const DataFlowGraph &G); -    NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); -  }; - -  struct DefNode : public RefNode { -    NodeId getReachedDef() const { -      return Ref.Def.DD; -    } -    void setReachedDef(NodeId D) { -      Ref.Def.DD = D; -    } -    NodeId getReachedUse() const { -      return Ref.Def.DU; -    } -    void setReachedUse(NodeId U) { -      Ref.Def.DU = U; -    } - -    void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); -  }; - -  struct UseNode : public RefNode { -    void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); -  }; - -  struct PhiUseNode : public UseNode { -    NodeId getPredecessor() const { -      assert(getFlags() & NodeAttrs::PhiRef); -      return Ref.PhiU.PredB; -    } -    void setPredecessor(NodeId B) { -      assert(getFlags() & 
NodeAttrs::PhiRef); -      Ref.PhiU.PredB = B; -    } -  }; - -  struct CodeNode : public NodeBase { -    template <typename T> T getCode() const { -      return static_cast<T>(Code.CP); -    } -    void setCode(void *C) { -      Code.CP = C; -    } - -    NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const; -    NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const; -    void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); -    void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, -        const DataFlowGraph &G); -    void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); - -    NodeList members(const DataFlowGraph &G) const; -    template <typename Predicate> -    NodeList members_if(Predicate P, const DataFlowGraph &G) const; -  }; - -  struct InstrNode : public CodeNode { -    NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); -  }; - -  struct PhiNode : public InstrNode { -    MachineInstr *getCode() const { -      return nullptr; -    } -  }; - -  struct StmtNode : public InstrNode { -    MachineInstr *getCode() const { -      return CodeNode::getCode<MachineInstr*>(); -    } -  }; - -  struct BlockNode : public CodeNode { -    MachineBasicBlock *getCode() const { -      return CodeNode::getCode<MachineBasicBlock*>(); -    } - -    void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G); -  }; - -  struct FuncNode : public CodeNode { -    MachineFunction *getCode() const { -      return CodeNode::getCode<MachineFunction*>(); -    } - -    NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB, -        const DataFlowGraph &G) const; -    NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G); -  }; - -  struct DataFlowGraph { -    DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, -        const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, -        const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi); - -    NodeBase *ptr(NodeId N) const; -    template <typename T> T ptr(NodeId N) const { -      return static_cast<T>(ptr(N)); -    } - -    NodeId id(const NodeBase *P) const; - -    template <typename T> NodeAddr<T> addr(NodeId N) const { -      return { ptr<T>(N), N }; -    } - -    NodeAddr<FuncNode*> getFunc() const { return Func; } -    MachineFunction &getMF() const { return MF; } -    const TargetInstrInfo &getTII() const { return TII; } -    const TargetRegisterInfo &getTRI() const { return TRI; } -    const PhysicalRegisterInfo &getPRI() const { return PRI; } -    const MachineDominatorTree &getDT() const { return MDT; } -    const MachineDominanceFrontier &getDF() const { return MDF; } -    const RegisterAggr &getLiveIns() const { return LiveIns; } - -    struct DefStack { -      DefStack() = default; - -      bool empty() const { return Stack.empty() || top() == bottom(); } - -    private: -      using value_type = NodeAddr<DefNode *>; -      struct Iterator { -        using value_type = DefStack::value_type; - -        Iterator &up() { Pos = DS.nextUp(Pos); return *this; } -        Iterator &down() { Pos = DS.nextDown(Pos); return *this; } - -        value_type operator*() const { -          assert(Pos >= 1); -          return DS.Stack[Pos-1]; -        } -        const value_type *operator->() const { -          assert(Pos >= 1); -          return &DS.Stack[Pos-1]; -        } -        bool operator==(const Iterator &It) const { return Pos == It.Pos; } -        bool operator!=(const Iterator &It) const { return Pos != It.Pos; } - -      private: -        friend 
struct DefStack; - -        Iterator(const DefStack &S, bool Top); - -        // Pos-1 is the index in the StorageType object that corresponds to -        // the top of the DefStack. -        const DefStack &DS; -        unsigned Pos; -      }; - -    public: -      using iterator = Iterator; - -      iterator top() const { return Iterator(*this, true); } -      iterator bottom() const { return Iterator(*this, false); } -      unsigned size() const; - -      void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); } -      void pop(); -      void start_block(NodeId N); -      void clear_block(NodeId N); - -    private: -      friend struct Iterator; - -      using StorageType = std::vector<value_type>; - -      bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const { -        return (P.Addr == nullptr) && (N == 0 || P.Id == N); -      } - -      unsigned nextUp(unsigned P) const; -      unsigned nextDown(unsigned P) const; - -      StorageType Stack; -    }; - -    // Make this std::unordered_map for speed of accessing elements. -    // Map: Register (physical or virtual) -> DefStack -    using DefStackMap = std::unordered_map<RegisterId, DefStack>; - -    void build(unsigned Options = BuildOptions::None); -    void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); -    void markBlock(NodeId B, DefStackMap &DefM); -    void releaseBlock(NodeId B, DefStackMap &DefM); - -    PackedRegisterRef pack(RegisterRef RR) { -      return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) }; -    } -    PackedRegisterRef pack(RegisterRef RR) const { -      return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) }; -    } -    RegisterRef unpack(PackedRegisterRef PR) const { -      return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId)); -    } - -    RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const; -    RegisterRef makeRegRef(const MachineOperand &Op) const; -    RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const; - -    NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA) const; -    NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA, bool Create); -    NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA) const; -    NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA, bool Create); -    NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA) const; - -    NodeList getRelatedRefs(NodeAddr<InstrNode*> IA, -        NodeAddr<RefNode*> RA) const; - -    NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const { -      return BlockNodes.at(BB); -    } - -    void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) { -      unlinkUseDF(UA); -      if (RemoveFromOwner) -        removeFromOwner(UA); -    } - -    void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) { -      unlinkDefDF(DA); -      if (RemoveFromOwner) -        removeFromOwner(DA); -    } - -    // Some useful filters. 
-    template <uint16_t Kind> -    static bool IsRef(const NodeAddr<NodeBase*> BA) { -      return BA.Addr->getType() == NodeAttrs::Ref && -             BA.Addr->getKind() == Kind; -    } - -    template <uint16_t Kind> -    static bool IsCode(const NodeAddr<NodeBase*> BA) { -      return BA.Addr->getType() == NodeAttrs::Code && -             BA.Addr->getKind() == Kind; -    } - -    static bool IsDef(const NodeAddr<NodeBase*> BA) { -      return BA.Addr->getType() == NodeAttrs::Ref && -             BA.Addr->getKind() == NodeAttrs::Def; -    } - -    static bool IsUse(const NodeAddr<NodeBase*> BA) { -      return BA.Addr->getType() == NodeAttrs::Ref && -             BA.Addr->getKind() == NodeAttrs::Use; -    } - -    static bool IsPhi(const NodeAddr<NodeBase*> BA) { -      return BA.Addr->getType() == NodeAttrs::Code && -             BA.Addr->getKind() == NodeAttrs::Phi; -    } - -    static bool IsPreservingDef(const NodeAddr<DefNode*> DA) { -      uint16_t Flags = DA.Addr->getFlags(); -      return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef); -    } - -  private: -    void reset(); - -    RegisterSet getLandingPadLiveIns() const; - -    NodeAddr<NodeBase*> newNode(uint16_t Attrs); -    NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B); -    NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner, -        MachineOperand &Op, uint16_t Flags = NodeAttrs::None); -    NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner, -        RegisterRef RR, NodeAddr<BlockNode*> PredB, -        uint16_t Flags = NodeAttrs::PhiRef); -    NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, -        MachineOperand &Op, uint16_t Flags = NodeAttrs::None); -    NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, -        RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef); -    NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner); -    NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner, -        MachineInstr *MI); -    NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner, -        MachineBasicBlock *BB); -    NodeAddr<FuncNode*> newFunc(MachineFunction *MF); - -    template <typename Predicate> -    std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> -    locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, -        Predicate P) const; - -    using BlockRefsMap = std::map<NodeId, RegisterSet>; - -    void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In); -    void recordDefsForDF(BlockRefsMap &PhiM, NodeAddr<BlockNode*> BA); -    void buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs, -        NodeAddr<BlockNode*> BA); -    void removeUnusedPhis(); - -    void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM); -    void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); -    template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA, -        NodeAddr<T> TA, DefStack &DS); -    template <typename Predicate> void linkStmtRefs(DefStackMap &DefM, -        NodeAddr<StmtNode*> SA, Predicate P); -    void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA); - -    void unlinkUseDF(NodeAddr<UseNode*> UA); -    void unlinkDefDF(NodeAddr<DefNode*> DA); - -    void removeFromOwner(NodeAddr<RefNode*> RA) { -      NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this); -      IA.Addr->removeMember(RA, *this); -    } - -    MachineFunction &MF; -    const TargetInstrInfo &TII; -    const TargetRegisterInfo &TRI; -    const PhysicalRegisterInfo PRI; -    const MachineDominatorTree &MDT; -    const MachineDominanceFrontier &MDF; -    const TargetOperandInfo &TOI; - 
-    RegisterAggr LiveIns; -    NodeAddr<FuncNode*> Func; -    NodeAllocator Memory; -    // Local map:  MachineBasicBlock -> NodeAddr<BlockNode*> -    std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes; -    // Lane mask map. -    LaneMaskIndex LMI; -  };  // struct DataFlowGraph - -  template <typename Predicate> -  NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P, -        bool NextOnly, const DataFlowGraph &G) { -    // Get the "Next" reference in the circular list that references RR and -    // satisfies predicate "Pred". -    auto NA = G.addr<NodeBase*>(getNext()); - -    while (NA.Addr != this) { -      if (NA.Addr->getType() == NodeAttrs::Ref) { -        NodeAddr<RefNode*> RA = NA; -        if (RA.Addr->getRegRef(G) == RR && P(NA)) -          return NA; -        if (NextOnly) -          break; -        NA = G.addr<NodeBase*>(NA.Addr->getNext()); -      } else { -        // We've hit the beginning of the chain. -        assert(NA.Addr->getType() == NodeAttrs::Code); -        NodeAddr<CodeNode*> CA = NA; -        NA = CA.Addr->getFirstMember(G); -      } -    } -    // Return the equivalent of "nullptr" if such a node was not found. -    return NodeAddr<RefNode*>(); -  } - -  template <typename Predicate> -  NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const { -    NodeList MM; -    auto M = getFirstMember(G); -    if (M.Id == 0) -      return MM; - -    while (M.Addr != this) { -      if (P(M)) -        MM.push_back(M); -      M = G.addr<NodeBase*>(M.Addr->getNext()); -    } -    return MM; -  } - -  template <typename T> -  struct Print { -    Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} - -    const T &Obj; -    const DataFlowGraph &G; -  }; - -  template <typename T> -  struct PrintNode : Print<NodeAddr<T>> { -    PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) -      : Print<NodeAddr<T>>(x, g) {} -  }; - -  raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<DefNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<UseNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<NodeAddr<PhiUseNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<RefNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<PhiNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<NodeAddr<StmtNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<NodeAddr<InstrNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<NodeAddr<BlockNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<NodeAddr<FuncNode *>> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P); -  raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P); -  raw_ostream &operator<<(raw_ostream &OS, -                          const Print<DataFlowGraph::DefStack> &P); - -} // end namespace rdf - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.h b/llvm/lib/Target/Hexagon/RDFLiveness.h deleted file 
mode 100644 index ea4890271726..000000000000 --- a/llvm/lib/Target/Hexagon/RDFLiveness.h +++ /dev/null @@ -1,151 +0,0 @@ -//===- RDFLiveness.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Recalculate the liveness information given a data flow graph. -// This includes block live-ins and kill flags. - -#ifndef LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H -#define LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H - -#include "RDFGraph.h" -#include "RDFRegisters.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/MC/LaneBitmask.h" -#include <map> -#include <set> -#include <utility> - -namespace llvm { - -class MachineBasicBlock; -class MachineDominanceFrontier; -class MachineDominatorTree; -class MachineRegisterInfo; -class TargetRegisterInfo; - -namespace rdf { - -  struct Liveness { -  public: -    // This is really a std::map, except that it provides a non-trivial -    // default constructor to the element accessed via []. -    struct LiveMapType { -      LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {} - -      RegisterAggr &operator[] (MachineBasicBlock *B) { -        return Map.emplace(B, Empty).first->second; -      } - -    private: -      RegisterAggr Empty; -      std::map<MachineBasicBlock*,RegisterAggr> Map; -    }; - -    using NodeRef = std::pair<NodeId, LaneBitmask>; -    using NodeRefSet = std::set<NodeRef>; -    // RegisterId in RefMap must be normalized. -    using RefMap = std::map<RegisterId, NodeRefSet>; - -    Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) -        : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()), -          MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {} - -    NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA, -        bool TopShadows, bool FullChain, const RegisterAggr &DefRRs); - -    NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) { -      return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false, -                                false, NoRegs); -    } - -    NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) { -      return getAllReachingDefs(RefRR, RefA, false, false, NoRegs); -    } - -    NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA, -        const RegisterAggr &DefRRs); - -    NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) { -      return getAllReachedUses(RefRR, DefA, NoRegs); -    } - -    std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR, -        NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs); - -    NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR, -        NodeAddr<InstrNode*> IA); - -    LiveMapType &getLiveMap() { return LiveMap; } -    const LiveMapType &getLiveMap() const { return LiveMap; } - -    const RefMap &getRealUses(NodeId P) const { -      auto F = RealUseMap.find(P); -      return F == RealUseMap.end() ? 
Empty : F->second; -    } - -    void computePhiInfo(); -    void computeLiveIns(); -    void resetLiveIns(); -    void resetKills(); -    void resetKills(MachineBasicBlock *B); - -    void trace(bool T) { Trace = T; } - -  private: -    const DataFlowGraph &DFG; -    const TargetRegisterInfo &TRI; -    const PhysicalRegisterInfo &PRI; -    const MachineDominatorTree &MDT; -    const MachineDominanceFrontier &MDF; -    LiveMapType LiveMap; -    const RefMap Empty; -    const RegisterAggr NoRegs; -    bool Trace = false; - -    // Cache of mapping from node ids (for RefNodes) to the containing -    // basic blocks. Not computing it each time for each node reduces -    // the liveness calculation time by a large fraction. -    using NodeBlockMap = DenseMap<NodeId, MachineBasicBlock *>; -    NodeBlockMap NBMap; - -    // Phi information: -    // -    // RealUseMap -    // map: NodeId -> (map: RegisterId -> NodeRefSet) -    //      phi id -> (map: register -> set of reached non-phi uses) -    std::map<NodeId, RefMap> RealUseMap; - -    // Inverse iterated dominance frontier. -    std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF; - -    // Live on entry. -    std::map<MachineBasicBlock*,RefMap> PhiLON; - -    // Phi uses are considered to be located at the end of the block that -    // they are associated with. The reaching def of a phi use dominates the -    // block that the use corresponds to, but not the block that contains -    // the phi itself. To include these uses in the liveness propagation (up -    // the dominator tree), create a map: block -> set of uses live on exit. -    std::map<MachineBasicBlock*,RefMap> PhiLOX; - -    MachineBasicBlock *getBlockWithRef(NodeId RN) const; -    void traverse(MachineBasicBlock *B, RefMap &LiveIn); -    void emptify(RefMap &M); - -    std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR, -        NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs, -        unsigned Nest, unsigned MaxNest); -  }; - -  raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P); - -} // end namespace rdf - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.h b/llvm/lib/Target/Hexagon/RDFRegisters.h deleted file mode 100644 index 4afaf80e4659..000000000000 --- a/llvm/lib/Target/Hexagon/RDFRegisters.h +++ /dev/null @@ -1,240 +0,0 @@ -//===- RDFRegisters.h -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H -#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H - -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/MC/LaneBitmask.h" -#include <cassert> -#include <cstdint> -#include <map> -#include <set> -#include <vector> - -namespace llvm { - -class MachineFunction; -class raw_ostream; - -namespace rdf { - -  using RegisterId = uint32_t; - -  // Template class for a map translating uint32_t into arbitrary types. -  // The map will act like an indexed set: upon insertion of a new object, -  // it will automatically assign a new index to it. Index of 0 is treated -  // as invalid and is never allocated. 
-  template <typename T, unsigned N = 32> -  struct IndexedSet { -    IndexedSet() { Map.reserve(N); } - -    T get(uint32_t Idx) const { -      // Index Idx corresponds to Map[Idx-1]. -      assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size()); -      return Map[Idx-1]; -    } - -    uint32_t insert(T Val) { -      // Linear search. -      auto F = llvm::find(Map, Val); -      if (F != Map.end()) -        return F - Map.begin() + 1; -      Map.push_back(Val); -      return Map.size();  // Return actual_index + 1. -    } - -    uint32_t find(T Val) const { -      auto F = llvm::find(Map, Val); -      assert(F != Map.end()); -      return F - Map.begin() + 1; -    } - -    uint32_t size() const { return Map.size(); } - -    using const_iterator = typename std::vector<T>::const_iterator; - -    const_iterator begin() const { return Map.begin(); } -    const_iterator end() const { return Map.end(); } - -  private: -    std::vector<T> Map; -  }; - -  struct RegisterRef { -    RegisterId Reg = 0; -    LaneBitmask Mask = LaneBitmask::getNone(); - -    RegisterRef() = default; -    explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll()) -      : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {} - -    operator bool() const { -      return Reg != 0 && Mask.any(); -    } - -    bool operator== (const RegisterRef &RR) const { -      return Reg == RR.Reg && Mask == RR.Mask; -    } - -    bool operator!= (const RegisterRef &RR) const { -      return !operator==(RR); -    } - -    bool operator< (const RegisterRef &RR) const { -      return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask); -    } -  }; - - -  struct PhysicalRegisterInfo { -    PhysicalRegisterInfo(const TargetRegisterInfo &tri, -                         const MachineFunction &mf); - -    static bool isRegMaskId(RegisterId R) { -      return Register::isStackSlot(R); -    } - -    RegisterId getRegMaskId(const uint32_t *RM) const { -      return Register::index2StackSlot(RegMasks.find(RM)); -    } - -    const uint32_t *getRegMaskBits(RegisterId R) const { -      return RegMasks.get(Register::stackSlot2Index(R)); -    } - -    RegisterRef normalize(RegisterRef RR) const; - -    bool alias(RegisterRef RA, RegisterRef RB) const { -      if (!isRegMaskId(RA.Reg)) -        return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB); -      return !isRegMaskId(RB.Reg) ? 
aliasRM(RB, RA) : aliasMM(RA, RB); -    } - -    std::set<RegisterId> getAliasSet(RegisterId Reg) const; - -    RegisterRef getRefForUnit(uint32_t U) const { -      return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask); -    } - -    const BitVector &getMaskUnits(RegisterId MaskId) const { -      return MaskInfos[Register::stackSlot2Index(MaskId)].Units; -    } - -    RegisterRef mapTo(RegisterRef RR, unsigned R) const; -    const TargetRegisterInfo &getTRI() const { return TRI; } - -  private: -    struct RegInfo { -      const TargetRegisterClass *RegClass = nullptr; -    }; -    struct UnitInfo { -      RegisterId Reg = 0; -      LaneBitmask Mask; -    }; -    struct MaskInfo { -      BitVector Units; -    }; - -    const TargetRegisterInfo &TRI; -    IndexedSet<const uint32_t*> RegMasks; -    std::vector<RegInfo> RegInfos; -    std::vector<UnitInfo> UnitInfos; -    std::vector<MaskInfo> MaskInfos; - -    bool aliasRR(RegisterRef RA, RegisterRef RB) const; -    bool aliasRM(RegisterRef RR, RegisterRef RM) const; -    bool aliasMM(RegisterRef RM, RegisterRef RN) const; -  }; - -  struct RegisterAggr { -    RegisterAggr(const PhysicalRegisterInfo &pri) -        : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {} -    RegisterAggr(const RegisterAggr &RG) = default; - -    bool empty() const { return Units.none(); } -    bool hasAliasOf(RegisterRef RR) const; -    bool hasCoverOf(RegisterRef RR) const; - -    static bool isCoverOf(RegisterRef RA, RegisterRef RB, -                          const PhysicalRegisterInfo &PRI) { -      return RegisterAggr(PRI).insert(RA).hasCoverOf(RB); -    } - -    RegisterAggr &insert(RegisterRef RR); -    RegisterAggr &insert(const RegisterAggr &RG); -    RegisterAggr &intersect(RegisterRef RR); -    RegisterAggr &intersect(const RegisterAggr &RG); -    RegisterAggr &clear(RegisterRef RR); -    RegisterAggr &clear(const RegisterAggr &RG); - -    RegisterRef intersectWith(RegisterRef RR) const; -    RegisterRef clearIn(RegisterRef RR) const; -    RegisterRef makeRegRef() const; - -    void print(raw_ostream &OS) const; - -    struct rr_iterator { -      using MapType = std::map<RegisterId, LaneBitmask>; - -    private: -      MapType Masks; -      MapType::iterator Pos; -      unsigned Index; -      const RegisterAggr *Owner; - -    public: -      rr_iterator(const RegisterAggr &RG, bool End); - -      RegisterRef operator*() const { -        return RegisterRef(Pos->first, Pos->second); -      } - -      rr_iterator &operator++() { -        ++Pos; -        ++Index; -        return *this; -      } - -      bool operator==(const rr_iterator &I) const { -        assert(Owner == I.Owner); -        (void)Owner; -        return Index == I.Index; -      } - -      bool operator!=(const rr_iterator &I) const { -        return !(*this == I); -      } -    }; - -    rr_iterator rr_begin() const { -      return rr_iterator(*this, false); -    } -    rr_iterator rr_end() const { -      return rr_iterator(*this, true); -    } - -  private: -    BitVector Units; -    const PhysicalRegisterInfo &PRI; -  }; - -  // Optionally print the lane mask, if it is not ~0. 
-  struct PrintLaneMaskOpt { -    PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {} -    LaneBitmask Mask; -  }; -  raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P); - -} // end namespace rdf - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index 9b3d13989ee2..d7e3519d5539 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -373,6 +373,7 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],      VMSUMSHS,      VMSUMUBM,      VMSUMUHM, +    VMSUMUDM,      VMSUMUHS,      VMULESB,      VMULESH, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 00f59bba52e8..ca1649fae258 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -167,6 +167,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);    } +  if (Subtarget.isISA3_0()) { +    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal); +    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal); +    setTruncStoreAction(MVT::f64, MVT::f16, Legal); +    setTruncStoreAction(MVT::f32, MVT::f16, Legal); +  } else { +    // No extending loads from f16 or HW conversions back and forth. +    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); +    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); +    setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); +    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); +    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); +    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); +    setTruncStoreAction(MVT::f64, MVT::f16, Expand); +    setTruncStoreAction(MVT::f32, MVT::f16, Expand); +  } +    setTruncStoreAction(MVT::f64, MVT::f32, Expand);    // PowerPC has pre-inc load and store's. @@ -677,6 +694,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,          setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);        }      } +    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);      if (!Subtarget.hasP8Vector()) {        setOperationAction(ISD::SMAX, MVT::v2i64, Expand);        setOperationAction(ISD::SMIN, MVT::v2i64, Expand); @@ -10361,6 +10379,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {    assert(Op.getOpcode() == ISD::FP_EXTEND &&           "Should only be called for ISD::FP_EXTEND"); +  // FIXME: handle extends from half precision float vectors on P9.    // We only want to custom lower an extend from v2f32 to v2f64.    if (Op.getValueType() != MVT::v2f64 ||        Op.getOperand(0).getValueType() != MVT::v2f32) @@ -10574,6 +10593,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,    case ISD::BITCAST:      // Don't handle bitcast here.      
return; +  case ISD::FP_EXTEND: +    SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG); +    if (Lowered) +      Results.push_back(Lowered); +    return;    }  } @@ -15255,7 +15279,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,    if (!VT.isSimple())      return false; -  if (VT.isFloatingPoint() && !Subtarget.allowsUnalignedFPAccess()) +  if (VT.isFloatingPoint() && !VT.isVector() && +      !Subtarget.allowsUnalignedFPAccess())      return false;    if (VT.getSimpleVT().isVector()) { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index e0c381827b87..2e1485373d19 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -637,7 +637,7 @@ namespace llvm {      /// then the VPERM for the shuffle. All in all a very slow sequence.      TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)        const override { -      if (VT.getScalarSizeInBits() % 8 == 0) +      if (VT.getVectorNumElements() != 1 && VT.getScalarSizeInBits() % 8 == 0)          return TypeWidenVector;        return TargetLoweringBase::getPreferredVectorAction(VT);      } diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index f94816a35f79..6e8635f2413c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -1342,6 +1342,10 @@ def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;  def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;  let Predicates = [HasP9Altivec] in { +// Vector Multiply-Sum +def VMSUMUDM : VA1a_Int_Ty3<35, "vmsumudm", int_ppc_altivec_vmsumudm, +                            v1i128, v2i64, v1i128>; +  // i8 element comparisons.  def VCMPNEB   : VCMP   <  7, "vcmpneb $vD, $vA, $vB"  , v16i8>;  def VCMPNEB_rec  : VCMPo  <  7, "vcmpneb. $vD, $vA, $vB" , v16i8>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 30906a32b00c..d7925befcd37 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2631,6 +2631,10 @@ bool PPCInstrInfo::isADDIInstrEligibleForFolding(MachineInstr &ADDIMI,    if (Opc != PPC::ADDI && Opc != PPC::ADDI8)      return false; +  // The operand may not necessarily be an immediate - it could be a relocation. +  if (!ADDIMI.getOperand(2).isImm()) +    return false; +    Imm = ADDIMI.getOperand(2).getImm();    return true; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index be6b30ffa08b..95e5ff6b130d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3343,6 +3343,23 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {    def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),              (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>; +  // Load/convert and convert/store patterns for f16. 
+  def : Pat<(f64 (extloadf16 xoaddr:$src)), +            (f64 (XSCVHPDP (LXSIHZX xoaddr:$src)))>; +  def : Pat<(truncstoref16 f64:$src, xoaddr:$dst), +            (STXSIHX (XSCVDPHP $src), xoaddr:$dst)>; +  def : Pat<(f32 (extloadf16 xoaddr:$src)), +            (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX xoaddr:$src)), VSSRC))>; +  def : Pat<(truncstoref16 f32:$src, xoaddr:$dst), +            (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), xoaddr:$dst)>; +  def : Pat<(f64 (f16_to_fp i32:$A)), +            (f64 (XSCVHPDP (MTVSRWZ $A)))>; +  def : Pat<(f32 (f16_to_fp i32:$A)), +            (f32 (COPY_TO_REGCLASS (XSCVHPDP (MTVSRWZ $A)), VSSRC))>; +  def : Pat<(i32 (fp_to_f16 f32:$A)), +            (i32 (MFVSRWZ (XSCVDPHP (COPY_TO_REGCLASS $A, VSFRC))))>; +  def : Pat<(i32 (fp_to_f16 f64:$A)), (i32 (MFVSRWZ (XSCVDPHP $A)))>; +    let Predicates = [IsBigEndian, HasP9Vector] in {    // Scalar stores of i8    def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst), diff --git a/llvm/lib/Target/X86/ImmutableGraph.h b/llvm/lib/Target/X86/ImmutableGraph.h new file mode 100644 index 000000000000..5833017037a5 --- /dev/null +++ b/llvm/lib/Target/X86/ImmutableGraph.h @@ -0,0 +1,446 @@ +//==========-- ImmutableGraph.h - A fast DAG implementation ---------=========// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: ImmutableGraph is a fast DAG implementation that cannot be +/// modified, except by creating a new ImmutableGraph. ImmutableGraph is +/// implemented as two arrays: one containing nodes, and one containing edges. +/// The advantages to this implementation are two-fold: +/// 1. Iteration and traversal operations benefit from cache locality. +/// 2. Operations on sets of nodes/edges are efficient, and representations of +///    those sets in memory are compact. For instance, a set of edges is +///    implemented as a bit vector, wherein each bit corresponds to one edge in +///    the edge array. This implies a lower bound of 64x spatial improvement +///    over, e.g., an llvm::DenseSet or llvm::SmallSet. It also means that +///    insert/erase/contains operations complete in negligible constant time: +///    insert and erase require one load and one store, and contains requires +///    just one load. 
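The two flat arrays plus bit-vector sets suggest the following rough usage sketch of the API this header introduces; DemoGraph, the node/edge values, and the include path are illustrative assumptions rather than part of the patch. A graph is assembled once through ImmutableGraphBuilder (defined later in this header) and afterwards only read.

    #include "ImmutableGraph.h" // assuming compilation inside llvm/lib/Target/X86
    #include <memory>

    // A concrete graph type with int-valued nodes and edges. The builder needs a
    // public constructor forwarding to the protected ImmutableGraph constructor.
    struct DemoGraph : llvm::ImmutableGraph<int, int> {
      DemoGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
                size_type NumNodes, size_type NumEdges)
          : ImmutableGraph(std::move(Nodes), std::move(Edges), NumNodes, NumEdges) {}
    };

    std::unique_ptr<DemoGraph> buildAndQuery() {
      llvm::ImmutableGraphBuilder<DemoGraph> Builder;
      auto A = Builder.addVertex(1);
      auto B = Builder.addVertex(2);
      Builder.addEdge(/*Value=*/7, A, B);           // adjacency lists, still mutable
      std::unique_ptr<DemoGraph> G = Builder.get(); // frozen into the two flat arrays

      DemoGraph::NodeSet Seen(*G);                  // one bit per node in the graph
      for (const DemoGraph::Node &N : G->nodes())
        Seen.insert(N);                             // sets a single bit
      (void)Seen.contains(G->nodes()[0]);           // tests a single bit
      return G;
    }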
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H +#define LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <iterator> +#include <utility> +#include <vector> + +namespace llvm { + +template <typename NodeValueT, typename EdgeValueT> class ImmutableGraph { +  using Traits = GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *>; +  template <typename> friend class ImmutableGraphBuilder; + +public: +  using node_value_type = NodeValueT; +  using edge_value_type = EdgeValueT; +  using size_type = int; +  class Node; +  class Edge { +    friend class ImmutableGraph; +    template <typename> friend class ImmutableGraphBuilder; + +    const Node *Dest; +    edge_value_type Value; + +  public: +    const Node *getDest() const { return Dest; }; +    const edge_value_type &getValue() const { return Value; } +  }; +  class Node { +    friend class ImmutableGraph; +    template <typename> friend class ImmutableGraphBuilder; + +    const Edge *Edges; +    node_value_type Value; + +  public: +    const node_value_type &getValue() const { return Value; } + +    const Edge *edges_begin() const { return Edges; } +    // Nodes are allocated sequentially. Edges for a node are stored together. +    // The end of this Node's edges is the beginning of the next node's edges. +    // An extra node was allocated to hold the end pointer for the last real +    // node. +    const Edge *edges_end() const { return (this + 1)->Edges; } +    ArrayRef<Edge> edges() const { +      return makeArrayRef(edges_begin(), edges_end()); +    } +  }; + +protected: +  ImmutableGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges, +                 size_type NodesSize, size_type EdgesSize) +      : Nodes(std::move(Nodes)), Edges(std::move(Edges)), NodesSize(NodesSize), +        EdgesSize(EdgesSize) {} +  ImmutableGraph(const ImmutableGraph &) = delete; +  ImmutableGraph(ImmutableGraph &&) = delete; +  ImmutableGraph &operator=(const ImmutableGraph &) = delete; +  ImmutableGraph &operator=(ImmutableGraph &&) = delete; + +public: +  ArrayRef<Node> nodes() const { return makeArrayRef(Nodes.get(), NodesSize); } +  const Node *nodes_begin() const { return nodes().begin(); } +  const Node *nodes_end() const { return nodes().end(); } + +  ArrayRef<Edge> edges() const { return makeArrayRef(Edges.get(), EdgesSize); } +  const Edge *edges_begin() const { return edges().begin(); } +  const Edge *edges_end() const { return edges().end(); } + +  size_type nodes_size() const { return NodesSize; } +  size_type edges_size() const { return EdgesSize; } + +  // Node N must belong to this ImmutableGraph. +  size_type getNodeIndex(const Node &N) const { +    return std::distance(nodes_begin(), &N); +  } +  // Edge E must belong to this ImmutableGraph. +  size_type getEdgeIndex(const Edge &E) const { +    return std::distance(edges_begin(), &E); +  } + +  // FIXME: Could NodeSet and EdgeSet be templated to share code? 
+  class NodeSet { +    const ImmutableGraph &G; +    BitVector V; + +  public: +    NodeSet(const ImmutableGraph &G, bool ContainsAll = false) +        : G{G}, V{static_cast<unsigned>(G.nodes_size()), ContainsAll} {} +    bool insert(const Node &N) { +      size_type Idx = G.getNodeIndex(N); +      bool AlreadyExists = V.test(Idx); +      V.set(Idx); +      return !AlreadyExists; +    } +    void erase(const Node &N) { +      size_type Idx = G.getNodeIndex(N); +      V.reset(Idx); +    } +    bool contains(const Node &N) const { +      size_type Idx = G.getNodeIndex(N); +      return V.test(Idx); +    } +    void clear() { V.reset(); } +    size_type empty() const { return V.none(); } +    /// Return the number of elements in the set +    size_type count() const { return V.count(); } +    /// Return the size of the set's domain +    size_type size() const { return V.size(); } +    /// Set union +    NodeSet &operator|=(const NodeSet &RHS) { +      assert(&this->G == &RHS.G); +      V |= RHS.V; +      return *this; +    } +    /// Set intersection +    NodeSet &operator&=(const NodeSet &RHS) { +      assert(&this->G == &RHS.G); +      V &= RHS.V; +      return *this; +    } +    /// Set disjoint union +    NodeSet &operator^=(const NodeSet &RHS) { +      assert(&this->G == &RHS.G); +      V ^= RHS.V; +      return *this; +    } + +    using index_iterator = typename BitVector::const_set_bits_iterator; +    index_iterator index_begin() const { return V.set_bits_begin(); } +    index_iterator index_end() const { return V.set_bits_end(); } +    void set(size_type Idx) { V.set(Idx); } +    void reset(size_type Idx) { V.reset(Idx); } + +    class iterator { +      const NodeSet &Set; +      size_type Current; + +      void advance() { +        assert(Current != -1); +        Current = Set.V.find_next(Current); +      } + +    public: +      iterator(const NodeSet &Set, size_type Begin) +          : Set{Set}, Current{Begin} {} +      iterator operator++(int) { +        iterator Tmp = *this; +        advance(); +        return Tmp; +      } +      iterator &operator++() { +        advance(); +        return *this; +      } +      Node *operator*() const { +        assert(Current != -1); +        return Set.G.nodes_begin() + Current; +      } +      bool operator==(const iterator &other) const { +        assert(&this->Set == &other.Set); +        return this->Current == other.Current; +      } +      bool operator!=(const iterator &other) const { return !(*this == other); } +    }; + +    iterator begin() const { return iterator{*this, V.find_first()}; } +    iterator end() const { return iterator{*this, -1}; } +  }; + +  class EdgeSet { +    const ImmutableGraph &G; +    BitVector V; + +  public: +    EdgeSet(const ImmutableGraph &G, bool ContainsAll = false) +        : G{G}, V{static_cast<unsigned>(G.edges_size()), ContainsAll} {} +    bool insert(const Edge &E) { +      size_type Idx = G.getEdgeIndex(E); +      bool AlreadyExists = V.test(Idx); +      V.set(Idx); +      return !AlreadyExists; +    } +    void erase(const Edge &E) { +      size_type Idx = G.getEdgeIndex(E); +      V.reset(Idx); +    } +    bool contains(const Edge &E) const { +      size_type Idx = G.getEdgeIndex(E); +      return V.test(Idx); +    } +    void clear() { V.reset(); } +    bool empty() const { return V.none(); } +    /// Return the number of elements in the set +    size_type count() const { return V.count(); } +    /// Return the size of the set's domain +    size_type size() const { return V.size(); } +    /// 
Set union +    EdgeSet &operator|=(const EdgeSet &RHS) { +      assert(&this->G == &RHS.G); +      V |= RHS.V; +      return *this; +    } +    /// Set intersection +    EdgeSet &operator&=(const EdgeSet &RHS) { +      assert(&this->G == &RHS.G); +      V &= RHS.V; +      return *this; +    } +    /// Set disjoint union +    EdgeSet &operator^=(const EdgeSet &RHS) { +      assert(&this->G == &RHS.G); +      V ^= RHS.V; +      return *this; +    } + +    using index_iterator = typename BitVector::const_set_bits_iterator; +    index_iterator index_begin() const { return V.set_bits_begin(); } +    index_iterator index_end() const { return V.set_bits_end(); } +    void set(size_type Idx) { V.set(Idx); } +    void reset(size_type Idx) { V.reset(Idx); } + +    class iterator { +      const EdgeSet &Set; +      size_type Current; + +      void advance() { +        assert(Current != -1); +        Current = Set.V.find_next(Current); +      } + +    public: +      iterator(const EdgeSet &Set, size_type Begin) +          : Set{Set}, Current{Begin} {} +      iterator operator++(int) { +        iterator Tmp = *this; +        advance(); +        return Tmp; +      } +      iterator &operator++() { +        advance(); +        return *this; +      } +      Edge *operator*() const { +        assert(Current != -1); +        return Set.G.edges_begin() + Current; +      } +      bool operator==(const iterator &other) const { +        assert(&this->Set == &other.Set); +        return this->Current == other.Current; +      } +      bool operator!=(const iterator &other) const { return !(*this == other); } +    }; + +    iterator begin() const { return iterator{*this, V.find_first()}; } +    iterator end() const { return iterator{*this, -1}; } +  }; + +private: +  std::unique_ptr<Node[]> Nodes; +  std::unique_ptr<Edge[]> Edges; +  size_type NodesSize; +  size_type EdgesSize; +}; + +template <typename GraphT> class ImmutableGraphBuilder { +  using node_value_type = typename GraphT::node_value_type; +  using edge_value_type = typename GraphT::edge_value_type; +  static_assert( +      std::is_base_of<ImmutableGraph<node_value_type, edge_value_type>, +                      GraphT>::value, +      "Template argument to ImmutableGraphBuilder must derive from " +      "ImmutableGraph<>"); +  using size_type = typename GraphT::size_type; +  using NodeSet = typename GraphT::NodeSet; +  using Node = typename GraphT::Node; +  using EdgeSet = typename GraphT::EdgeSet; +  using Edge = typename GraphT::Edge; +  using BuilderEdge = std::pair<edge_value_type, size_type>; +  using EdgeList = std::vector<BuilderEdge>; +  using BuilderVertex = std::pair<node_value_type, EdgeList>; +  using VertexVec = std::vector<BuilderVertex>; + +public: +  using BuilderNodeRef = size_type; + +  BuilderNodeRef addVertex(const node_value_type &V) { +    auto I = AdjList.emplace(AdjList.end(), V, EdgeList{}); +    return std::distance(AdjList.begin(), I); +  } + +  void addEdge(const edge_value_type &E, BuilderNodeRef From, +               BuilderNodeRef To) { +    AdjList[From].second.emplace_back(E, To); +  } + +  bool empty() const { return AdjList.empty(); } + +  template <typename... ArgT> std::unique_ptr<GraphT> get(ArgT &&... 
Args) { +    size_type VertexSize = AdjList.size(), EdgeSize = 0; +    for (const auto &V : AdjList) { +      EdgeSize += V.second.size(); +    } +    auto VertexArray = +        std::make_unique<Node[]>(VertexSize + 1 /* terminator node */); +    auto EdgeArray = std::make_unique<Edge[]>(EdgeSize); +    size_type VI = 0, EI = 0; +    for (; VI < VertexSize; ++VI) { +      VertexArray[VI].Value = std::move(AdjList[VI].first); +      VertexArray[VI].Edges = &EdgeArray[EI]; +      auto NumEdges = static_cast<size_type>(AdjList[VI].second.size()); +      for (size_type VEI = 0; VEI < NumEdges; ++VEI, ++EI) { +        auto &E = AdjList[VI].second[VEI]; +        EdgeArray[EI].Value = std::move(E.first); +        EdgeArray[EI].Dest = &VertexArray[E.second]; +      } +    } +    assert(VI == VertexSize && EI == EdgeSize && "ImmutableGraph malformed"); +    VertexArray[VI].Edges = &EdgeArray[EdgeSize]; // terminator node +    return std::make_unique<GraphT>(std::move(VertexArray), +                                    std::move(EdgeArray), VertexSize, EdgeSize, +                                    std::forward<ArgT>(Args)...); +  } + +  template <typename... ArgT> +  static std::unique_ptr<GraphT> trim(const GraphT &G, const NodeSet &TrimNodes, +                                      const EdgeSet &TrimEdges, +                                      ArgT &&... Args) { +    size_type NewVertexSize = G.nodes_size() - TrimNodes.count(); +    size_type NewEdgeSize = G.edges_size() - TrimEdges.count(); +    auto NewVertexArray = +        std::make_unique<Node[]>(NewVertexSize + 1 /* terminator node */); +    auto NewEdgeArray = std::make_unique<Edge[]>(NewEdgeSize); + +    // Walk the nodes and determine the new index for each node. +    size_type NewNodeIndex = 0; +    std::vector<size_type> RemappedNodeIndex(G.nodes_size()); +    for (const Node &N : G.nodes()) { +      if (TrimNodes.contains(N)) +        continue; +      RemappedNodeIndex[G.getNodeIndex(N)] = NewNodeIndex++; +    } +    assert(NewNodeIndex == NewVertexSize && +           "Should have assigned NewVertexSize indices"); + +    size_type VertexI = 0, EdgeI = 0; +    for (const Node &N : G.nodes()) { +      if (TrimNodes.contains(N)) +        continue; +      NewVertexArray[VertexI].Value = N.getValue(); +      NewVertexArray[VertexI].Edges = &NewEdgeArray[EdgeI]; +      for (const Edge &E : N.edges()) { +        if (TrimEdges.contains(E)) +          continue; +        NewEdgeArray[EdgeI].Value = E.getValue(); +        size_type DestIdx = G.getNodeIndex(*E.getDest()); +        size_type NewIdx = RemappedNodeIndex[DestIdx]; +        assert(NewIdx < NewVertexSize); +        NewEdgeArray[EdgeI].Dest = &NewVertexArray[NewIdx]; +        ++EdgeI; +      } +      ++VertexI; +    } +    assert(VertexI == NewVertexSize && EdgeI == NewEdgeSize && +           "Gadget graph malformed"); +    NewVertexArray[VertexI].Edges = &NewEdgeArray[NewEdgeSize]; // terminator +    return std::make_unique<GraphT>(std::move(NewVertexArray), +                                    std::move(NewEdgeArray), NewVertexSize, +                                    NewEdgeSize, std::forward<ArgT>(Args)...); +  } + +private: +  VertexVec AdjList; +}; + +template <typename NodeValueT, typename EdgeValueT> +struct GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *> { +  using GraphT = ImmutableGraph<NodeValueT, EdgeValueT>; +  using NodeRef = typename GraphT::Node const *; +  using EdgeRef = typename GraphT::Edge const &; + +  static NodeRef edge_dest(EdgeRef E) { return 
E.getDest(); } +  using ChildIteratorType = +      mapped_iterator<typename GraphT::Edge const *, decltype(&edge_dest)>; + +  static NodeRef getEntryNode(GraphT *G) { return G->nodes_begin(); } +  static ChildIteratorType child_begin(NodeRef N) { +    return {N->edges_begin(), &edge_dest}; +  } +  static ChildIteratorType child_end(NodeRef N) { +    return {N->edges_end(), &edge_dest}; +  } + +  static NodeRef getNode(typename GraphT::Node const &N) { return NodeRef{&N}; } +  using nodes_iterator = +      mapped_iterator<typename GraphT::Node const *, decltype(&getNode)>; +  static nodes_iterator nodes_begin(GraphT *G) { +    return {G->nodes_begin(), &getNode}; +  } +  static nodes_iterator nodes_end(GraphT *G) { +    return {G->nodes_end(), &getNode}; +  } + +  using ChildEdgeIteratorType = typename GraphT::Edge const *; + +  static ChildEdgeIteratorType child_edge_begin(NodeRef N) { +    return N->edges_begin(); +  } +  static ChildEdgeIteratorType child_edge_end(NodeRef N) { +    return N->edges_end(); +  } +  static typename GraphT::size_type size(GraphT *G) { return G->nodes_size(); } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 0481a40d462a..a0ab5c3a5b3c 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -120,7 +120,7 @@ FunctionPass *createX86DomainReassignmentPass();  FunctionPass *createX86EvexToVexInsts();  /// This pass creates the thunks for the retpoline feature. -FunctionPass *createX86RetpolineThunksPass(); +FunctionPass *createX86IndirectThunksPass();  /// This pass ensures instructions featuring a memory operand  /// have distinctive <LineNumber, Discriminator> (with respect to eachother) @@ -133,6 +133,9 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,                                                    X86Subtarget &,                                                    X86RegisterBankInfo &); +FunctionPass *createX86LoadValueInjectionLoadHardeningPass(); +FunctionPass *createX86LoadValueInjectionLoadHardeningUnoptimizedPass(); +FunctionPass *createX86LoadValueInjectionRetHardeningPass();  FunctionPass *createX86SpeculativeLoadHardeningPass();  void initializeEvexToVexInstPassPass(PassRegistry &); @@ -148,6 +151,9 @@ void initializeX86DomainReassignmentPass(PassRegistry &);  void initializeX86ExecutionDomainFixPass(PassRegistry &);  void initializeX86ExpandPseudoPass(PassRegistry &);  void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); +void initializeX86LoadValueInjectionLoadHardeningUnoptimizedPassPass(PassRegistry &); +void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &); +void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);  void initializeX86OptimizeLEAPassPass(PassRegistry &);  void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a2b11d55f650..bb8952f54e3a 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -426,6 +426,22 @@ def FeatureRetpolineExternalThunk            "ourselves. 
Only has effect when combined with some other retpoline "            "feature", [FeatureRetpolineIndirectCalls]>; +// Mitigate LVI attacks against indirect calls/branches and call returns +def FeatureLVIControlFlowIntegrity +    : SubtargetFeature< +          "lvi-cfi", "UseLVIControlFlowIntegrity", "true", +          "Prevent indirect calls/branches from using a memory operand, and " +          "precede all indirect calls/branches from a register with an " +          "LFENCE instruction to serialize control flow. Also decompose RET " +          "instructions into a POP+LFENCE+JMP sequence.">; + +// Mitigate LVI attacks against data loads +def FeatureLVILoadHardening +    : SubtargetFeature< +          "lvi-load-hardening", "UseLVILoadHardening", "true", +          "Insert LFENCE instructions to prevent data speculatively injected " +          "into loads from being used maliciously.">; +  // Direct Move instructions.  def FeatureMOVDIRI  : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",                                         "Support movdiri instruction">; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 1dbf40683564..a1d256ea872d 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3202,8 +3202,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {        (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))      return false; -  // Functions using retpoline for indirect calls need to use SDISel. -  if (Subtarget->useRetpolineIndirectCalls()) +  // Functions using thunks for indirect calls need to use SDISel. +  if (Subtarget->useIndirectThunkCalls())      return false;    // Handle only C, fastcc, and webkit_js calling conventions for now. diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 799c1f5d1285..1da20371caf5 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -765,10 +765,10 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,                                            bool InProlog) const {    bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; -  // FIXME: Add retpoline support and remove this. -  if (Is64Bit && IsLargeCodeModel && STI.useRetpolineIndirectCalls()) +  // FIXME: Add indirect thunk support and remove this. +  if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())      report_fatal_error("Emitting stack probe calls on 64-bit with the large " -                       "code model and retpoline not yet implemented."); +                       "code model and indirect thunks not yet implemented.");    unsigned CallOp;    if (Is64Bit) @@ -2493,9 +2493,9 @@ void X86FrameLowering::adjustForSegmentedStacks(      // is laid out within 2^31 bytes of each function body, but this seems      // to be sufficient for JIT.      // FIXME: Add retpoline support and remove the error here.. 
-    if (STI.useRetpolineIndirectCalls()) +    if (STI.useIndirectThunkCalls())        report_fatal_error("Emitting morestack calls on 64-bit with the large " -                         "code model and retpoline not yet implemented."); +                         "code model and thunks not yet implemented.");      BuildMI(allocMBB, DL, TII.get(X86::CALL64m))          .addReg(X86::RIP)          .addImm(0) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index bf33f399db28..88af0ebcfd0e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -987,7 +987,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {      if (OptLevel != CodeGenOpt::None &&          // Only do this when the target can fold the load into the call or          // jmp. -        !Subtarget->useRetpolineIndirectCalls() && +        !Subtarget->useIndirectThunkCalls() &&          ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||           (N->getOpcode() == X86ISD::TC_RETURN &&            (Subtarget->is64Bit() || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1523d56cc4e7..c8720d9ae3a6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30221,8 +30221,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,  }  bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { -  // If the subtarget is using retpolines, we need to not generate jump tables. -  if (Subtarget.useRetpolineIndirectBranches()) +  // If the subtarget is using thunks, we need to not generate jump tables. +  if (Subtarget.useIndirectThunkBranches())      return false;    // Otherwise, fallback on the generic logic. @@ -31345,22 +31345,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,    return BB;  } -static unsigned getOpcodeForRetpoline(unsigned RPOpc) { +static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {    switch (RPOpc) { -  case X86::RETPOLINE_CALL32: +  case X86::INDIRECT_THUNK_CALL32:      return X86::CALLpcrel32; -  case X86::RETPOLINE_CALL64: +  case X86::INDIRECT_THUNK_CALL64:      return X86::CALL64pcrel32; -  case X86::RETPOLINE_TCRETURN32: +  case X86::INDIRECT_THUNK_TCRETURN32:      return X86::TCRETURNdi; -  case X86::RETPOLINE_TCRETURN64: +  case X86::INDIRECT_THUNK_TCRETURN64:      return X86::TCRETURNdi64;    } -  llvm_unreachable("not retpoline opcode"); +  llvm_unreachable("not indirect thunk opcode");  } -static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, -                                      unsigned Reg) { +static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, +                                          unsigned Reg) {    if (Subtarget.useRetpolineExternalThunk()) {      // When using an external thunk for retpolines, we pick names that match the      // names GCC happens to use as well. This helps simplify the implementation @@ -31392,39 +31392,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,        assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");        return "__x86_indirect_thunk_r11";      } +    llvm_unreachable("unexpected reg for external indirect thunk"); +  } + +  if (Subtarget.useRetpolineIndirectCalls() || +      Subtarget.useRetpolineIndirectBranches()) { +    // When targeting an internal COMDAT thunk use an LLVM-specific name. 
+    switch (Reg) { +    case X86::EAX: +      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); +      return "__llvm_retpoline_eax"; +    case X86::ECX: +      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); +      return "__llvm_retpoline_ecx"; +    case X86::EDX: +      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); +      return "__llvm_retpoline_edx"; +    case X86::EDI: +      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); +      return "__llvm_retpoline_edi"; +    case X86::R11: +      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); +      return "__llvm_retpoline_r11"; +    }      llvm_unreachable("unexpected reg for retpoline");    } -  // When targeting an internal COMDAT thunk use an LLVM-specific name. -  switch (Reg) { -  case X86::EAX: -    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); -    return "__llvm_retpoline_eax"; -  case X86::ECX: -    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); -    return "__llvm_retpoline_ecx"; -  case X86::EDX: -    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); -    return "__llvm_retpoline_edx"; -  case X86::EDI: -    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); -    return "__llvm_retpoline_edi"; -  case X86::R11: +  if (Subtarget.useLVIControlFlowIntegrity()) {      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); -    return "__llvm_retpoline_r11"; +    return "__llvm_lvi_thunk_r11";    } -  llvm_unreachable("unexpected reg for retpoline"); +  llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");  }  MachineBasicBlock * -X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, -                                        MachineBasicBlock *BB) const { +X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, +                                            MachineBasicBlock *BB) const {    // Copy the virtual register into the R11 physical register and    // call the retpoline thunk.    DebugLoc DL = MI.getDebugLoc();    const X86InstrInfo *TII = Subtarget.getInstrInfo();    Register CalleeVReg = MI.getOperand(0).getReg(); -  unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); +  unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());    // Find an available scratch register to hold the callee. 
On 64-bit, we can    // just use R11, but we scan for uses anyway to ensure we don't generate @@ -31458,7 +31467,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,      report_fatal_error("calling convention incompatible with retpoline, no "                         "available registers"); -  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); +  const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)        .addReg(CalleeVReg); @@ -32234,11 +32243,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,    case X86::TLS_base_addr32:    case X86::TLS_base_addr64:      return EmitLoweredTLSAddr(MI, BB); -  case X86::RETPOLINE_CALL32: -  case X86::RETPOLINE_CALL64: -  case X86::RETPOLINE_TCRETURN32: -  case X86::RETPOLINE_TCRETURN64: -    return EmitLoweredRetpoline(MI, BB); +  case X86::INDIRECT_THUNK_CALL32: +  case X86::INDIRECT_THUNK_CALL64: +  case X86::INDIRECT_THUNK_TCRETURN32: +  case X86::INDIRECT_THUNK_TCRETURN64: +    return EmitLoweredIndirectThunk(MI, BB);    case X86::CATCHRET:      return EmitLoweredCatchRet(MI, BB);    case X86::CATCHPAD: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3a17099da38f..830cdfc79c0a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1482,8 +1482,8 @@ namespace llvm {      MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,                                            MachineBasicBlock *BB) const; -    MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, -                                            MachineBasicBlock *BB) const; +    MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI, +                                                MachineBasicBlock *BB) const;      MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,                                          MachineBasicBlock *MBB) const; diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp new file mode 100644 index 000000000000..36b9c3ccc959 --- /dev/null +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -0,0 +1,364 @@ +//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86  --=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Pass that injects an MI thunk that is used to lower indirect calls in a way +/// that prevents speculation on some x86 processors and can be used to mitigate +/// security vulnerabilities due to targeted speculative execution and side +/// channels such as CVE-2017-5715. +/// +/// Currently supported thunks include: +/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls +/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization +///   before making an indirect call/jump +/// +/// Note that the reason that this is implemented as a MachineFunctionPass and +/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline +/// serialize all transformations, which can consume lots of memory. +/// +/// TODO(chandlerc): All of this code could use better comments and +/// documentation. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-retpoline-thunks" + +static const char RetpolineNamePrefix[] = "__llvm_retpoline_"; +static const char R11RetpolineName[] = "__llvm_retpoline_r11"; +static const char EAXRetpolineName[] = "__llvm_retpoline_eax"; +static const char ECXRetpolineName[] = "__llvm_retpoline_ecx"; +static const char EDXRetpolineName[] = "__llvm_retpoline_edx"; +static const char EDIRetpolineName[] = "__llvm_retpoline_edi"; + +static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_"; +static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11"; + +namespace { +template <typename Derived> class ThunkInserter { +  Derived &getDerived() { return *static_cast<Derived *>(this); } + +protected: +  bool InsertedThunks; +  void doInitialization(Module &M) {} +  void createThunkFunction(MachineModuleInfo &MMI, StringRef Name); + +public: +  void init(Module &M) { +    InsertedThunks = false; +    getDerived().doInitialization(M); +  } +  // return `true` if `MMI` or `MF` was modified +  bool run(MachineModuleInfo &MMI, MachineFunction &MF); +}; + +struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> { +  const char *getThunkPrefix() { return RetpolineNamePrefix; } +  bool mayUseThunk(const MachineFunction &MF) { +    const auto &STI = MF.getSubtarget<X86Subtarget>(); +    return (STI.useRetpolineIndirectCalls() || +            STI.useRetpolineIndirectBranches()) && +           !STI.useRetpolineExternalThunk(); +  } +  void insertThunks(MachineModuleInfo &MMI); +  void populateThunk(MachineFunction &MF); +}; + +struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> { +  const char *getThunkPrefix() { return LVIThunkNamePrefix; } +  bool mayUseThunk(const MachineFunction &MF) { +    return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity(); +  } +  void insertThunks(MachineModuleInfo &MMI) { +    createThunkFunction(MMI, R11LVIThunkName); +  } +  void populateThunk(MachineFunction &MF) { +    // Grab the entry MBB and erase any other blocks. O0 codegen appears to +    // generate two bbs for the entry block. +    MachineBasicBlock *Entry = &MF.front(); +    Entry->clear(); +    while (MF.size() > 1) +      MF.erase(std::next(MF.begin())); + +    // This code mitigates LVI by replacing each indirect call/jump with a +    // direct call/jump to a thunk that looks like: +    // ``` +    // lfence +    // jmpq *%r11 +    // ``` +    // This ensures that if the value in register %r11 was loaded from memory, +    // then the value in %r11 is (architecturally) correct prior to the jump. 
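    // For illustration (a sketch of the caller side, not code emitted by this
    // thunk): with lvi-cfi on a 64-bit target, EmitLoweredIndirectThunk() shown
    // earlier in this diff copies the callee into R11 and calls this thunk, so
    // an indirect call such as
    //   callq *%rax
    // becomes roughly
    //   movq  %rax, %r11
    //   callq __llvm_lvi_thunk_r11
    // which guarantees the LFENCE below executes before the indirect transfer.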
+    const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); +    BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE)); +    BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11); +    MF.front().addLiveIn(X86::R11); +    return; +  } +}; + +class X86IndirectThunks : public MachineFunctionPass { +public: +  static char ID; + +  X86IndirectThunks() : MachineFunctionPass(ID) {} + +  StringRef getPassName() const override { return "X86 Indirect Thunks"; } + +  bool doInitialization(Module &M) override; +  bool runOnMachineFunction(MachineFunction &MF) override; + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    MachineFunctionPass::getAnalysisUsage(AU); +    AU.addRequired<MachineModuleInfoWrapperPass>(); +    AU.addPreserved<MachineModuleInfoWrapperPass>(); +  } + +private: +  std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs; + +  // FIXME: When LLVM moves to C++17, these can become folds +  template <typename... ThunkInserterT> +  static void initTIs(Module &M, +                      std::tuple<ThunkInserterT...> &ThunkInserters) { +    (void)std::initializer_list<int>{ +        (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...}; +  } +  template <typename... ThunkInserterT> +  static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF, +                     std::tuple<ThunkInserterT...> &ThunkInserters) { +    bool Modified = false; +    (void)std::initializer_list<int>{ +        Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...}; +    return Modified; +  } +}; + +} // end anonymous namespace + +void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) { +  if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64) +    createThunkFunction(MMI, R11RetpolineName); +  else +    for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName, +                           EDIRetpolineName}) +      createThunkFunction(MMI, Name); +} + +void RetpolineThunkInserter::populateThunk(MachineFunction &MF) { +  bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64; +  Register ThunkReg; +  if (Is64Bit) { +    assert(MF.getName() == "__llvm_retpoline_r11" && +           "Should only have an r11 thunk on 64-bit targets"); + +    // __llvm_retpoline_r11: +    //   callq .Lr11_call_target +    // .Lr11_capture_spec: +    //   pause +    //   lfence +    //   jmp .Lr11_capture_spec +    // .align 16 +    // .Lr11_call_target: +    //   movq %r11, (%rsp) +    //   retq +    ThunkReg = X86::R11; +  } else { +    // For 32-bit targets we need to emit a collection of thunks for various +    // possible scratch registers as well as a fallback that uses EDI, which is +    // normally callee saved. +    //   __llvm_retpoline_eax: +    //         calll .Leax_call_target +    //   .Leax_capture_spec: +    //         pause +    //         jmp .Leax_capture_spec +    //   .align 16 +    //   .Leax_call_target: +    //         movl %eax, (%esp)  # Clobber return addr +    //         retl +    // +    //   __llvm_retpoline_ecx: +    //   ... # Same setup +    //         movl %ecx, (%esp) +    //         retl +    // +    //   __llvm_retpoline_edx: +    //   ... # Same setup +    //         movl %edx, (%esp) +    //         retl +    // +    //   __llvm_retpoline_edi: +    //   ... 
# Same setup +    //         movl %edi, (%esp) +    //         retl +    if (MF.getName() == EAXRetpolineName) +      ThunkReg = X86::EAX; +    else if (MF.getName() == ECXRetpolineName) +      ThunkReg = X86::ECX; +    else if (MF.getName() == EDXRetpolineName) +      ThunkReg = X86::EDX; +    else if (MF.getName() == EDIRetpolineName) +      ThunkReg = X86::EDI; +    else +      llvm_unreachable("Invalid thunk name on x86-32!"); +  } + +  const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); +  // Grab the entry MBB and erase any other blocks. O0 codegen appears to +  // generate two bbs for the entry block. +  MachineBasicBlock *Entry = &MF.front(); +  Entry->clear(); +  while (MF.size() > 1) +    MF.erase(std::next(MF.begin())); + +  MachineBasicBlock *CaptureSpec = +      MF.CreateMachineBasicBlock(Entry->getBasicBlock()); +  MachineBasicBlock *CallTarget = +      MF.CreateMachineBasicBlock(Entry->getBasicBlock()); +  MCSymbol *TargetSym = MF.getContext().createTempSymbol(); +  MF.push_back(CaptureSpec); +  MF.push_back(CallTarget); + +  const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; +  const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; + +  Entry->addLiveIn(ThunkReg); +  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym); + +  // The MIR verifier thinks that the CALL in the entry block will fall through +  // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is +  // the successor, but the MIR verifier doesn't know how to cope with that. +  Entry->addSuccessor(CaptureSpec); + +  // In the capture loop for speculation, we want to stop the processor from +  // speculating as fast as possible. On Intel processors, the PAUSE instruction +  // will block speculation without consuming any execution resources. On AMD +  // processors, the PAUSE instruction is (essentially) a nop, so we also use an +  // LFENCE instruction which they have advised will stop speculation as well +  // with minimal resource utilization. We still end the capture with a jump to +  // form an infinite loop to fully guarantee that no matter what implementation +  // of the x86 ISA, speculating this code path never escapes. +  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); +  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); +  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); +  CaptureSpec->setHasAddressTaken(); +  CaptureSpec->addSuccessor(CaptureSpec); + +  CallTarget->addLiveIn(ThunkReg); +  CallTarget->setHasAddressTaken(); +  CallTarget->setAlignment(Align(16)); + +  // Insert return address clobber +  const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; +  const Register SPReg = Is64Bit ? 
X86::RSP : X86::ESP; +  addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false, +               0) +      .addReg(ThunkReg); + +  CallTarget->back().setPreInstrSymbol(MF, TargetSym); +  BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); +} + +template <typename Derived> +void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI, +                                                 StringRef Name) { +  assert(Name.startswith(getDerived().getThunkPrefix()) && +         "Created a thunk with an unexpected prefix!"); + +  Module &M = const_cast<Module &>(*MMI.getModule()); +  LLVMContext &Ctx = M.getContext(); +  auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); +  Function *F = +      Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); +  F->setVisibility(GlobalValue::HiddenVisibility); +  F->setComdat(M.getOrInsertComdat(Name)); + +  // Add Attributes so that we don't create a frame, unwind information, or +  // inline. +  AttrBuilder B; +  B.addAttribute(llvm::Attribute::NoUnwind); +  B.addAttribute(llvm::Attribute::Naked); +  F->addAttributes(llvm::AttributeList::FunctionIndex, B); + +  // Populate our function a bit so that we can verify. +  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); +  IRBuilder<> Builder(Entry); + +  Builder.CreateRetVoid(); + +  // MachineFunctions/MachineBasicBlocks aren't created automatically for the +  // IR-level constructs we already made. Create them and insert them into the +  // module. +  MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); +  MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry); + +  // Insert EntryMBB into MF. It's not in the module until we do this. +  MF.insert(MF.end(), EntryMBB); +  // Set MF properties. We never use vregs... +  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); +} + +template <typename Derived> +bool ThunkInserter<Derived>::run(MachineModuleInfo &MMI, MachineFunction &MF) { +  // If MF is not a thunk, check to see if we need to insert a thunk. +  if (!MF.getName().startswith(getDerived().getThunkPrefix())) { +    // If we've already inserted a thunk, nothing else to do. +    if (InsertedThunks) +      return false; + +    // Only add a thunk if one of the functions has the corresponding feature +    // enabled in its subtarget, and doesn't enable external thunks. +    // FIXME: Conditionalize on indirect calls so we don't emit a thunk when +    // nothing will end up calling it. +    // FIXME: It's a little silly to look at every function just to enumerate +    // the subtargets, but eventually we'll want to look at them for indirect +    // calls, so maybe this is OK. +    if (!getDerived().mayUseThunk(MF)) +      return false; + +    getDerived().insertThunks(MMI); +    InsertedThunks = true; +    return true; +  } + +  // If this *is* a thunk function, we need to populate it with the correct MI. 
+  getDerived().populateThunk(MF); +  return true; +} + +FunctionPass *llvm::createX86IndirectThunksPass() { +  return new X86IndirectThunks(); +} + +char X86IndirectThunks::ID = 0; + +bool X86IndirectThunks::doInitialization(Module &M) { +  initTIs(M, TIs); +  return false; +} + +bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) { +  LLVM_DEBUG(dbgs() << getPassName() << '\n'); +  auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); +  return runTIs(MMI, MF, TIs); +} diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 78d8dd3c0d03..1fdac104cb73 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1213,14 +1213,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),            (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[Not64BitMode, NotUseRetpolineIndirectCalls]>; +          Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;  // FIXME: This is disabled for 32-bit PIC mode because the global base  // register which is part of the address mode may be assigned a  // callee-saved register.  def : Pat<(X86tcret (load addr:$dst), imm:$off),            (TCRETURNmi addr:$dst, imm:$off)>, -          Requires<[Not64BitMode, IsNotPIC, NotUseRetpolineIndirectCalls]>; +          Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;  def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),            (TCRETURNdi tglobaladdr:$dst, imm:$off)>, @@ -1232,21 +1232,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),            (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>; +          Requires<[In64BitMode, NotUseIndirectThunkCalls]>;  // Don't fold loads into X86tcret requiring more than 6 regs.  // There wouldn't be enough scratch registers for base+index.  
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),            (TCRETURNmi64 addr:$dst, imm:$off)>, -          Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>; +          Requires<[In64BitMode, NotUseIndirectThunkCalls]>;  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), -          (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[In64BitMode, UseRetpolineIndirectCalls]>; +          (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, +          Requires<[In64BitMode, UseIndirectThunkCalls]>;  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), -          (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[Not64BitMode, UseRetpolineIndirectCalls]>; +          (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, +          Requires<[Not64BitMode, UseIndirectThunkCalls]>;  def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),            (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 32faeb1a86f2..1842dc19ec2e 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -237,13 +237,13 @@ let isCall = 1 in                          Sched<[WriteJumpLd]>;      def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),                          "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32, -                        Requires<[Not64BitMode,NotUseRetpolineIndirectCalls]>, +                        Requires<[Not64BitMode,NotUseIndirectThunkCalls]>,                          Sched<[WriteJump]>;      def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),                          "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,                          OpSize32,                          Requires<[Not64BitMode,FavorMemIndirectCall, -                                  NotUseRetpolineIndirectCalls]>, +                                  NotUseIndirectThunkCalls]>,                          Sched<[WriteJumpLd]>;      // Non-tracking calls for IBT, use with caution. @@ -334,11 +334,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {                        Requires<[In64BitMode]>;    def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),                          "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, -                      Requires<[In64BitMode,NotUseRetpolineIndirectCalls]>; +                      Requires<[In64BitMode,NotUseIndirectThunkCalls]>;    def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),                          "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,                        Requires<[In64BitMode,FavorMemIndirectCall, -                                NotUseRetpolineIndirectCalls]>; +                                NotUseIndirectThunkCalls]>;    // Non-tracking calls for IBT, use with caution.    
let isCodeGenOnly = 1 in { @@ -393,19 +393,19 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,      Uses = [RSP, SSP],      usesCustomInserter = 1,      SchedRW = [WriteJump] in { -  def RETPOLINE_CALL32 : +  def INDIRECT_THUNK_CALL32 :      PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, -            Requires<[Not64BitMode,UseRetpolineIndirectCalls]>; +            Requires<[Not64BitMode,UseIndirectThunkCalls]>; -  def RETPOLINE_CALL64 : +  def INDIRECT_THUNK_CALL64 :      PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, -            Requires<[In64BitMode,UseRetpolineIndirectCalls]>; +            Requires<[In64BitMode,UseIndirectThunkCalls]>; -  // Retpoline variant of indirect tail calls. +  // Indirect thunk variant of indirect tail calls.    let isTerminator = 1, isReturn = 1, isBarrier = 1 in { -    def RETPOLINE_TCRETURN64 : +    def INDIRECT_THUNK_TCRETURN64 :        PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; -    def RETPOLINE_TCRETURN32 : +    def INDIRECT_THUNK_TCRETURN32 :        PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;    }  } diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index ca5425e8b89f..93f40c8ec996 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -996,8 +996,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;  def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;  def HasERMSB : Predicate<"Subtarget->hasERMSB()">;  def HasMFence    : Predicate<"Subtarget->hasMFence()">; -def UseRetpolineIndirectCalls : Predicate<"Subtarget->useRetpolineIndirectCalls()">; -def NotUseRetpolineIndirectCalls : Predicate<"!Subtarget->useRetpolineIndirectCalls()">; +def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">; +def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;  //===----------------------------------------------------------------------===//  // X86 Instruction Format Definitions. diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp new file mode 100644 index 000000000000..35fc439998f9 --- /dev/null +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -0,0 +1,900 @@ +//==-- X86LoadValueInjectionLoadHardening.cpp - LVI load hardening for x86 --=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: This pass finds Load Value Injection (LVI) gadgets consisting +/// of a load from memory (i.e., SOURCE), and any operation that may transmit +/// the value loaded from memory over a covert channel, or use the value loaded +/// from memory to determine a branch/call target (i.e., SINK). After finding +/// all such gadgets in a given function, the pass minimally inserts LFENCE +/// instructions in such a manner that the following property is satisfied: for +/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at +/// least one LFENCE instruction. 
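In graph terms, a gadget is mitigated exactly when its SINK can no longer be reached from its SOURCE along CFG edges that avoid every fence, which is how the pass later decides whether an existing LFENCE already covers a gadget. A minimal standalone sketch of that check, using a plain adjacency list rather than the pass's own ImmutableGraph types (all names here are illustrative, not the pass's API):

    #include <queue>
    #include <vector>

    // A gadget (Source, Sink) is mitigated iff every CFG path from Source to
    // Sink crosses a fence, i.e. Sink becomes unreachable once the traversal
    // refuses to walk through fence nodes.
    bool isMitigated(const std::vector<std::vector<int>> &CFGSucc,
                     const std::vector<bool> &IsFence, int Source, int Sink) {
      std::vector<bool> Visited(CFGSucc.size(), false);
      std::queue<int> Worklist;
      Worklist.push(Source);
      Visited[Source] = true;
      while (!Worklist.empty()) {
        int N = Worklist.front();
        Worklist.pop();
        for (int Succ : CFGSucc[N]) {
          if (Succ == Sink)
            return false;           // reached the sink without crossing a fence
          if (!Visited[Succ] && !IsFence[Succ]) {
            Visited[Succ] = true;   // keep walking only through non-fence nodes
            Worklist.push(Succ);
          }
        }
      }
      return true;                  // every path to the sink is cut by a fence
    }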
The algorithm that implements this minimal +/// insertion is influenced by an academic paper that minimally inserts memory +/// fences for high-performance concurrent programs: +///         http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf +/// The algorithm implemented in this pass is as follows: +/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the +/// following components: +///    - SOURCE instructions (also includes function arguments) +///    - SINK instructions +///    - Basic block entry points +///    - Basic block terminators +///    - LFENCE instructions +/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e., +/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been +/// mitigated, go to step 6. +/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion. +/// 4. Insert one LFENCE along each CFG edge that was cut in step 3. +/// 5. Go to step 2. +/// 6. If any LFENCEs were inserted, return `true` from runOnMachineFunction() +/// to tell LLVM that the function was modified. +/// +//===----------------------------------------------------------------------===// + +#include "ImmutableGraph.h" +#include "X86.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define PASS_KEY "x86-lvi-load" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation"); +STATISTIC(NumFunctionsConsidered, "Number of functions analyzed"); +STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations " +                                 "were deployed"); +STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis"); + +static cl::opt<std::string> OptimizePluginPath( +    PASS_KEY "-opt-plugin", +    cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden); + +static cl::opt<bool> NoConditionalBranches( +    PASS_KEY "-no-cbranch", +    cl::desc("Don't treat conditional branches as disclosure gadgets. 
This " +             "may improve performance, at the cost of security."), +    cl::init(false), cl::Hidden); + +static cl::opt<bool> EmitDot( +    PASS_KEY "-dot", +    cl::desc( +        "For each function, emit a dot graph depicting potential LVI gadgets"), +    cl::init(false), cl::Hidden); + +static cl::opt<bool> EmitDotOnly( +    PASS_KEY "-dot-only", +    cl::desc("For each function, emit a dot graph depicting potential LVI " +             "gadgets, and do not insert any fences"), +    cl::init(false), cl::Hidden); + +static cl::opt<bool> EmitDotVerify( +    PASS_KEY "-dot-verify", +    cl::desc("For each function, emit a dot graph to stdout depicting " +             "potential LVI gadgets, used for testing purposes only"), +    cl::init(false), cl::Hidden); + +static llvm::sys::DynamicLibrary OptimizeDL; +typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size, +                            unsigned int *edges, int *edge_values, +                            int *cut_edges /* out */, unsigned int edges_size); +static OptimizeCutT OptimizeCut = nullptr; + +namespace { + +struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> { +  static constexpr int GadgetEdgeSentinel = -1; +  static constexpr MachineInstr *const ArgNodeSentinel = nullptr; + +  using GraphT = ImmutableGraph<MachineInstr *, int>; +  using Node = typename GraphT::Node; +  using Edge = typename GraphT::Edge; +  using size_type = typename GraphT::size_type; +  MachineGadgetGraph(std::unique_ptr<Node[]> Nodes, +                     std::unique_ptr<Edge[]> Edges, size_type NodesSize, +                     size_type EdgesSize, int NumFences = 0, int NumGadgets = 0) +      : GraphT(std::move(Nodes), std::move(Edges), NodesSize, EdgesSize), +        NumFences(NumFences), NumGadgets(NumGadgets) {} +  static inline bool isCFGEdge(const Edge &E) { +    return E.getValue() != GadgetEdgeSentinel; +  } +  static inline bool isGadgetEdge(const Edge &E) { +    return E.getValue() == GadgetEdgeSentinel; +  } +  int NumFences; +  int NumGadgets; +}; + +class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass { +public: +  X86LoadValueInjectionLoadHardeningPass() : MachineFunctionPass(ID) {} + +  StringRef getPassName() const override { +    return "X86 Load Value Injection (LVI) Load Hardening"; +  } +  void getAnalysisUsage(AnalysisUsage &AU) const override; +  bool runOnMachineFunction(MachineFunction &MF) override; + +  static char ID; + +private: +  using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>; +  using EdgeSet = MachineGadgetGraph::EdgeSet; +  using NodeSet = MachineGadgetGraph::NodeSet; +  using Gadget = std::pair<MachineInstr *, MachineInstr *>; + +  const X86Subtarget *STI; +  const TargetInstrInfo *TII; +  const TargetRegisterInfo *TRI; + +  std::unique_ptr<MachineGadgetGraph> +  getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI, +                 const MachineDominatorTree &MDT, +                 const MachineDominanceFrontier &MDF) const; +  int hardenLoadsWithPlugin(MachineFunction &MF, +                            std::unique_ptr<MachineGadgetGraph> Graph) const; +  int hardenLoadsWithGreedyHeuristic( +      MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const; +  int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G, +                                 EdgeSet &ElimEdges /* in, out */, +                                 NodeSet &ElimNodes /* in, out */) const; +  std::unique_ptr<MachineGadgetGraph> +  
trimMitigatedEdges(std::unique_ptr<MachineGadgetGraph> Graph) const; +  void findAndCutEdges(MachineGadgetGraph &G, +                       EdgeSet &CutEdges /* out */) const; +  int insertFences(MachineFunction &MF, MachineGadgetGraph &G, +                   EdgeSet &CutEdges /* in, out */) const; +  bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const; +  bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const; +  inline bool isFence(const MachineInstr *MI) const { +    return MI && (MI->getOpcode() == X86::LFENCE || +                  (STI->useLVIControlFlowIntegrity() && MI->isCall())); +  } +}; + +} // end anonymous namespace + +namespace llvm { + +template <> +struct GraphTraits<MachineGadgetGraph *> +    : GraphTraits<ImmutableGraph<MachineInstr *, int> *> {}; + +template <> +struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { +  using GraphType = MachineGadgetGraph; +  using Traits = llvm::GraphTraits<GraphType *>; +  using NodeRef = typename Traits::NodeRef; +  using EdgeRef = typename Traits::EdgeRef; +  using ChildIteratorType = typename Traits::ChildIteratorType; +  using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; + +  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + +  std::string getNodeLabel(NodeRef Node, GraphType *) { +    if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel) +      return "ARGS"; + +    std::string Str; +    raw_string_ostream OS(Str); +    OS << *Node->getValue(); +    return OS.str(); +  } + +  static std::string getNodeAttributes(NodeRef Node, GraphType *) { +    MachineInstr *MI = Node->getValue(); +    if (MI == MachineGadgetGraph::ArgNodeSentinel) +      return "color = blue"; +    if (MI->getOpcode() == X86::LFENCE) +      return "color = green"; +    return ""; +  } + +  static std::string getEdgeAttributes(NodeRef, ChildIteratorType E, +                                       GraphType *) { +    int EdgeVal = (*E.getCurrent()).getValue(); +    return EdgeVal >= 0 ? "label = " + std::to_string(EdgeVal) +                        : "color = red, style = \"dashed\""; +  } +}; + +} // end namespace llvm + +constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel; +constexpr int MachineGadgetGraph::GadgetEdgeSentinel; + +char X86LoadValueInjectionLoadHardeningPass::ID = 0; + +void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( +    AnalysisUsage &AU) const { +  MachineFunctionPass::getAnalysisUsage(AU); +  AU.addRequired<MachineLoopInfo>(); +  AU.addRequired<MachineDominatorTree>(); +  AU.addRequired<MachineDominanceFrontier>(); +  AU.setPreservesCFG(); +} + +static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF, +                             MachineGadgetGraph *G) { +  WriteGraph(OS, G, /*ShortNames*/ false, +             "Speculative gadgets for \"" + MF.getName() + "\" function"); +} + +bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( +    MachineFunction &MF) { +  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() +                    << " *****\n"); +  STI = &MF.getSubtarget<X86Subtarget>(); +  if (!STI->useLVILoadHardening()) +    return false; + +  // FIXME: support 32-bit +  if (!STI->is64Bit()) +    report_fatal_error("LVI load hardening is only supported on 64-bit", false); + +  // Don't skip functions with the "optnone" attr but participate in opt-bisect. 
+  const Function &F = MF.getFunction(); +  if (!F.hasOptNone() && skipFunction(F)) +    return false; + +  ++NumFunctionsConsidered; +  TII = STI->getInstrInfo(); +  TRI = STI->getRegisterInfo(); +  LLVM_DEBUG(dbgs() << "Building gadget graph...\n"); +  const auto &MLI = getAnalysis<MachineLoopInfo>(); +  const auto &MDT = getAnalysis<MachineDominatorTree>(); +  const auto &MDF = getAnalysis<MachineDominanceFrontier>(); +  std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF); +  LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n"); +  if (Graph == nullptr) +    return false; // didn't find any gadgets + +  if (EmitDotVerify) { +    WriteGadgetGraph(outs(), MF, Graph.get()); +    return false; +  } + +  if (EmitDot || EmitDotOnly) { +    LLVM_DEBUG(dbgs() << "Emitting gadget graph...\n"); +    std::error_code FileError; +    std::string FileName = "lvi."; +    FileName += MF.getName(); +    FileName += ".dot"; +    raw_fd_ostream FileOut(FileName, FileError); +    if (FileError) +      errs() << FileError.message(); +    WriteGadgetGraph(FileOut, MF, Graph.get()); +    FileOut.close(); +    LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n"); +    if (EmitDotOnly) +      return false; +  } + +  int FencesInserted; +  if (!OptimizePluginPath.empty()) { +    if (!OptimizeDL.isValid()) { +      std::string ErrorMsg; +      OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary( +          OptimizePluginPath.c_str(), &ErrorMsg); +      if (!ErrorMsg.empty()) +        report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"'); +      OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut"); +      if (!OptimizeCut) +        report_fatal_error("Invalid optimization plugin"); +    } +    FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph)); +  } else { // Use the default greedy heuristic +    FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph)); +  } + +  if (FencesInserted > 0) +    ++NumFunctionsMitigated; +  NumFences += FencesInserted; +  return (FencesInserted > 0); +} + +std::unique_ptr<MachineGadgetGraph> +X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( +    MachineFunction &MF, const MachineLoopInfo &MLI, +    const MachineDominatorTree &MDT, +    const MachineDominanceFrontier &MDF) const { +  using namespace rdf; + +  // Build the Register Dataflow Graph using the RDF framework +  TargetOperandInfo TOI{*TII}; +  DataFlowGraph DFG{MF, *TII, *TRI, MDT, MDF, TOI}; +  DFG.build(); +  Liveness L{MF.getRegInfo(), DFG}; +  L.computePhiInfo(); + +  GraphBuilder Builder; +  using GraphIter = typename GraphBuilder::BuilderNodeRef; +  DenseMap<MachineInstr *, GraphIter> NodeMap; +  int FenceCount = 0, GadgetCount = 0; +  auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { +    auto Ref = NodeMap.find(MI); +    if (Ref == NodeMap.end()) { +      auto I = Builder.addVertex(MI); +      NodeMap[MI] = I; +      return std::pair<GraphIter, bool>{I, true}; +    } +    return std::pair<GraphIter, bool>{Ref->getSecond(), false}; +  }; + +  // The `Transmitters` map memoizes transmitters found for each def. If a def +  // has not yet been analyzed, then it will not appear in the map. If a def +  // has been analyzed and was determined not to have any transmitters, then +  // its list of transmitters will be empty. 
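The same memoization idiom, stripped of RDF details: absence from the map means "not yet analyzed", while a present-but-empty entry means "analyzed, nothing found", which is what lets the recursive walk terminate cheaply on revisits. A simplified sketch with plain STL containers (hypothetical NodeId and relations, duplicate removal omitted):

    #include <unordered_map>
    #include <vector>

    using NodeId = unsigned;

    // Memoized transitive search for "transmitters" reachable from Def.
    const std::vector<NodeId> &
    findTransmitters(NodeId Def,
                     const std::unordered_map<NodeId, std::vector<NodeId>> &ChildDefs,
                     const std::unordered_map<NodeId, bool> &IsTransmitter,
                     std::unordered_map<NodeId, std::vector<NodeId>> &Memo) {
      if (auto It = Memo.find(Def); It != Memo.end())
        return It->second;              // already analyzed (possibly empty)
      auto &Result = Memo[Def];         // mark Def as analyzed up front
      if (auto T = IsTransmitter.find(Def); T != IsTransmitter.end() && T->second)
        Result.push_back(Def);
      if (auto C = ChildDefs.find(Def); C != ChildDefs.end())
        for (NodeId Child : C->second) {
          if (Child == Def)
            continue;                   // a def that feeds itself adds nothing new
          for (NodeId T : findTransmitters(Child, ChildDefs, IsTransmitter, Memo))
            Result.push_back(T);        // inherit the child's transmitters
        }
      return Result;
    }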
+  DenseMap<NodeId, std::vector<NodeId>> Transmitters; + +  // Analyze all machine instructions to find gadgets and LFENCEs, adding +  // each interesting value to `Nodes` +  auto AnalyzeDef = [&](NodeAddr<DefNode *> SourceDef) { +    SmallSet<NodeId, 8> UsesVisited, DefsVisited; +    std::function<void(NodeAddr<DefNode *>)> AnalyzeDefUseChain = +        [&](NodeAddr<DefNode *> Def) { +          if (Transmitters.find(Def.Id) != Transmitters.end()) +            return; // Already analyzed `Def` + +          // Use RDF to find all the uses of `Def` +          rdf::NodeSet Uses; +          RegisterRef DefReg = DFG.getPRI().normalize(Def.Addr->getRegRef(DFG)); +          for (auto UseID : L.getAllReachedUses(DefReg, Def)) { +            auto Use = DFG.addr<UseNode *>(UseID); +            if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node +              NodeAddr<PhiNode *> Phi = Use.Addr->getOwner(DFG); +              for (auto I : L.getRealUses(Phi.Id)) { +                if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) { +                  for (auto UA : I.second) +                    Uses.emplace(UA.first); +                } +              } +            } else { // not a phi node +              Uses.emplace(UseID); +            } +          } + +          // For each use of `Def`, we want to know whether: +          // (1) The use can leak the Def'ed value, +          // (2) The use can further propagate the Def'ed value to more defs +          for (auto UseID : Uses) { +            if (!UsesVisited.insert(UseID).second) +              continue; // Already visited this use of `Def` + +            auto Use = DFG.addr<UseNode *>(UseID); +            assert(!(Use.Addr->getFlags() & NodeAttrs::PhiRef)); +            MachineOperand &UseMO = Use.Addr->getOp(); +            MachineInstr &UseMI = *UseMO.getParent(); +            assert(UseMO.isReg()); + +            // We naively assume that an instruction propagates any loaded +            // uses to all defs unless the instruction is a call, in which +            // case all arguments will be treated as gadget sources during +            // analysis of the callee function. +            if (UseMI.isCall()) +              continue; + +            // Check whether this use can transmit (leak) its value. +            if (instrUsesRegToAccessMemory(UseMI, UseMO.getReg()) || +                (!NoConditionalBranches && +                 instrUsesRegToBranch(UseMI, UseMO.getReg()))) { +              Transmitters[Def.Id].push_back(Use.Addr->getOwner(DFG).Id); +              if (UseMI.mayLoad()) +                continue; // Found a transmitting load -- no need to continue +                          // traversing its defs (i.e., this load will become +                          // a new gadget source anyways). +            } + +            // Check whether the use propagates to more defs. +            NodeAddr<InstrNode *> Owner{Use.Addr->getOwner(DFG)}; +            rdf::NodeList AnalyzedChildDefs; +            for (auto &ChildDef : +                 Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) { +              if (!DefsVisited.insert(ChildDef.Id).second) +                continue; // Already visited this def +              if (Def.Addr->getAttrs() & NodeAttrs::Dead) +                continue; +              if (Def.Id == ChildDef.Id) +                continue; // `Def` uses itself (e.g., increment loop counter) + +              AnalyzeDefUseChain(ChildDef); + +              // `Def` inherits all of its child defs' transmitters. 
+              for (auto TransmitterId : Transmitters[ChildDef.Id]) +                Transmitters[Def.Id].push_back(TransmitterId); +            } +          } + +          // Note that this statement adds `Def.Id` to the map if no +          // transmitters were found for `Def`. +          auto &DefTransmitters = Transmitters[Def.Id]; + +          // Remove duplicate transmitters +          llvm::sort(DefTransmitters); +          DefTransmitters.erase( +              std::unique(DefTransmitters.begin(), DefTransmitters.end()), +              DefTransmitters.end()); +        }; + +    // Find all of the transmitters +    AnalyzeDefUseChain(SourceDef); +    auto &SourceDefTransmitters = Transmitters[SourceDef.Id]; +    if (SourceDefTransmitters.empty()) +      return; // No transmitters for `SourceDef` + +    MachineInstr *Source = SourceDef.Addr->getFlags() & NodeAttrs::PhiRef +                               ? MachineGadgetGraph::ArgNodeSentinel +                               : SourceDef.Addr->getOp().getParent(); +    auto GadgetSource = MaybeAddNode(Source); +    // Each transmitter is a sink for `SourceDef`. +    for (auto TransmitterId : SourceDefTransmitters) { +      MachineInstr *Sink = DFG.addr<StmtNode *>(TransmitterId).Addr->getCode(); +      auto GadgetSink = MaybeAddNode(Sink); +      // Add the gadget edge to the graph. +      Builder.addEdge(MachineGadgetGraph::GadgetEdgeSentinel, +                      GadgetSource.first, GadgetSink.first); +      ++GadgetCount; +    } +  }; + +  LLVM_DEBUG(dbgs() << "Analyzing def-use chains to find gadgets\n"); +  // Analyze function arguments +  NodeAddr<BlockNode *> EntryBlock = DFG.getFunc().Addr->getEntryBlock(DFG); +  for (NodeAddr<PhiNode *> ArgPhi : +       EntryBlock.Addr->members_if(DataFlowGraph::IsPhi, DFG)) { +    NodeList Defs = ArgPhi.Addr->members_if(DataFlowGraph::IsDef, DFG); +    llvm::for_each(Defs, AnalyzeDef); +  } +  // Analyze every instruction in MF +  for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) { +    for (NodeAddr<StmtNode *> SA : +         BA.Addr->members_if(DataFlowGraph::IsCode<NodeAttrs::Stmt>, DFG)) { +      MachineInstr *MI = SA.Addr->getCode(); +      if (isFence(MI)) { +        MaybeAddNode(MI); +        ++FenceCount; +      } else if (MI->mayLoad()) { +        NodeList Defs = SA.Addr->members_if(DataFlowGraph::IsDef, DFG); +        llvm::for_each(Defs, AnalyzeDef); +      } +    } +  } +  LLVM_DEBUG(dbgs() << "Found " << FenceCount << " fences\n"); +  LLVM_DEBUG(dbgs() << "Found " << GadgetCount << " gadgets\n"); +  if (GadgetCount == 0) +    return nullptr; +  NumGadgets += GadgetCount; + +  // Traverse CFG to build the rest of the graph +  SmallSet<MachineBasicBlock *, 8> BlocksVisited; +  std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG = +      [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { +        unsigned LoopDepth = MLI.getLoopDepth(MBB); +        if (!MBB->empty()) { +          // Always add the first instruction in each block +          auto NI = MBB->begin(); +          auto BeginBB = MaybeAddNode(&*NI); +          Builder.addEdge(ParentDepth, GI, BeginBB.first); +          if (!BlocksVisited.insert(MBB).second) +            return; + +          // Add any instructions within the block that are gadget components +          GI = BeginBB.first; +          while (++NI != MBB->end()) { +            auto Ref = NodeMap.find(&*NI); +            if (Ref != NodeMap.end()) { +              Builder.addEdge(LoopDepth, GI, 
Ref->getSecond()); +              GI = Ref->getSecond(); +            } +          } + +          // Always add the terminator instruction, if one exists +          auto T = MBB->getFirstTerminator(); +          if (T != MBB->end()) { +            auto EndBB = MaybeAddNode(&*T); +            if (EndBB.second) +              Builder.addEdge(LoopDepth, GI, EndBB.first); +            GI = EndBB.first; +          } +        } +        for (MachineBasicBlock *Succ : MBB->successors()) +          TraverseCFG(Succ, GI, LoopDepth); +      }; +  // ArgNodeSentinel is a pseudo-instruction that represents MF args in the +  // GadgetGraph +  GraphIter ArgNode = MaybeAddNode(MachineGadgetGraph::ArgNodeSentinel).first; +  TraverseCFG(&MF.front(), ArgNode, 0); +  std::unique_ptr<MachineGadgetGraph> G{Builder.get(FenceCount, GadgetCount)}; +  LLVM_DEBUG(dbgs() << "Found " << G->nodes_size() << " nodes\n"); +  return G; +} + +// Returns the number of remaining gadget edges that could not be eliminated +int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( +    MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */, +    MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const { +  if (G.NumFences > 0) { +    // Eliminate fences and CFG edges that ingress and egress the fence, as +    // they are trivially mitigated. +    for (const auto &E : G.edges()) { +      const MachineGadgetGraph::Node *Dest = E.getDest(); +      if (isFence(Dest->getValue())) { +        ElimNodes.insert(*Dest); +        ElimEdges.insert(E); +        for (const auto &DE : Dest->edges()) +          ElimEdges.insert(DE); +      } +    } +  } + +  // Find and eliminate gadget edges that have been mitigated. +  int MitigatedGadgets = 0, RemainingGadgets = 0; +  MachineGadgetGraph::NodeSet ReachableNodes{G}; +  for (const auto &RootN : G.nodes()) { +    if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) +      continue; // skip this node if it isn't a gadget source + +    // Find all of the nodes that are CFG-reachable from RootN using DFS +    ReachableNodes.clear(); +    std::function<void(const MachineGadgetGraph::Node *, bool)> +        FindReachableNodes = +            [&](const MachineGadgetGraph::Node *N, bool FirstNode) { +              if (!FirstNode) +                ReachableNodes.insert(*N); +              for (const auto &E : N->edges()) { +                const MachineGadgetGraph::Node *Dest = E.getDest(); +                if (MachineGadgetGraph::isCFGEdge(E) && +                    !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest)) +                  FindReachableNodes(Dest, false); +              } +            }; +    FindReachableNodes(&RootN, true); + +    // Any gadget whose sink is unreachable has been mitigated +    for (const auto &E : RootN.edges()) { +      if (MachineGadgetGraph::isGadgetEdge(E)) { +        if (ReachableNodes.contains(*E.getDest())) { +          // This gadget's sink is reachable +          ++RemainingGadgets; +        } else { // This gadget's sink is unreachable, and therefore mitigated +          ++MitigatedGadgets; +          ElimEdges.insert(E); +        } +      } +    } +  } +  return RemainingGadgets; +} + +std::unique_ptr<MachineGadgetGraph> +X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges( +    std::unique_ptr<MachineGadgetGraph> Graph) const { +  MachineGadgetGraph::NodeSet ElimNodes{*Graph}; +  MachineGadgetGraph::EdgeSet ElimEdges{*Graph}; +  int RemainingGadgets = +      elimMitigatedEdgesAndNodes(*Graph, 
ElimEdges, ElimNodes); +  if (ElimEdges.empty() && ElimNodes.empty()) { +    Graph->NumFences = 0; +    Graph->NumGadgets = RemainingGadgets; +  } else { +    Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */, +                               RemainingGadgets); +  } +  return Graph; +} + +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( +    MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const { +  int FencesInserted = 0; + +  do { +    LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); +    Graph = trimMitigatedEdges(std::move(Graph)); +    LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); +    if (Graph->NumGadgets == 0) +      break; + +    LLVM_DEBUG(dbgs() << "Cutting edges...\n"); +    EdgeSet CutEdges{*Graph}; +    auto Nodes = std::make_unique<unsigned int[]>(Graph->nodes_size() + +                                                  1 /* terminator node */); +    auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size()); +    auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size()); +    auto EdgeValues = std::make_unique<int[]>(Graph->edges_size()); +    for (const auto &N : Graph->nodes()) { +      Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin()); +    } +    Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node +    for (const auto &E : Graph->edges()) { +      Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest()); +      EdgeValues[Graph->getEdgeIndex(E)] = E.getValue(); +    } +    OptimizeCut(Nodes.get(), Graph->nodes_size(), Edges.get(), EdgeValues.get(), +                EdgeCuts.get(), Graph->edges_size()); +    for (int I = 0; I < Graph->edges_size(); ++I) +      if (EdgeCuts[I]) +        CutEdges.set(I); +    LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); +    LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); + +    LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n"); +    FencesInserted += insertFences(MF, *Graph, CutEdges); +    LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); +    LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); + +    Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph}, +                               CutEdges); +  } while (true); + +  return FencesInserted; +} + +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic( +    MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const { +  LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); +  Graph = trimMitigatedEdges(std::move(Graph)); +  LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); +  if (Graph->NumGadgets == 0) +    return 0; + +  LLVM_DEBUG(dbgs() << "Cutting edges...\n"); +  MachineGadgetGraph::NodeSet ElimNodes{*Graph}, GadgetSinks{*Graph}; +  MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph}; +  auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) { +    return !ElimEdges.contains(E) && !CutEdges.contains(E) && +           MachineGadgetGraph::isCFGEdge(E); +  }; +  auto IsGadgetEdge = [&ElimEdges, +                       &CutEdges](const MachineGadgetGraph::Edge &E) { +    return !ElimEdges.contains(E) && !CutEdges.contains(E) && +           MachineGadgetGraph::isGadgetEdge(E); +  }; + +  // FIXME: this is O(E^2), we could probably do better. +  do { +    // Find the cheapest CFG edge that will eliminate a gadget (by being +    // egress from a SOURCE node or ingress to a SINK node), and cut it. 
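One round of that greedy choice can be pictured on a toy edge list; a simplified sketch with plain structs standing in for the graph's edges, CFG edge weights playing the role of loop depth, and the already-eliminated/already-cut bookkeeping omitted:

    #include <set>
    #include <vector>

    struct ToyEdge {
      int From, To;
      int Weight;    // CFG edges carry loop depth; gadget edges use a sentinel
      bool IsGadget;
    };

    // Pick the cheapest CFG edge that leaves a gadget SOURCE or enters a gadget
    // SINK, the candidates the heuristic above considers.
    const ToyEdge *pickCheapestCut(const std::vector<ToyEdge> &Edges) {
      std::set<int> Sources, Sinks;
      for (const ToyEdge &E : Edges)
        if (E.IsGadget) {
          Sources.insert(E.From);
          Sinks.insert(E.To);
        }
      const ToyEdge *Cheapest = nullptr;
      for (const ToyEdge &E : Edges) {
        if (E.IsGadget || (!Sources.count(E.From) && !Sinks.count(E.To)))
          continue;
        if (!Cheapest || E.Weight < Cheapest->Weight)
          Cheapest = &E;
      }
      return Cheapest;   // null only once no gadget edges remain
    }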
+    const MachineGadgetGraph::Edge *CheapestSoFar = nullptr; + +    // First, collect all gadget source and sink nodes. +    MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph}; +    for (const auto &N : Graph->nodes()) { +      if (ElimNodes.contains(N)) +        continue; +      for (const auto &E : N.edges()) { +        if (IsGadgetEdge(E)) { +          GadgetSources.insert(N); +          GadgetSinks.insert(*E.getDest()); +        } +      } +    } + +    // Next, look for the cheapest CFG edge which, when cut, is guaranteed to +    // mitigate at least one gadget by either: +    // (a) being egress from a gadget source, or +    // (b) being ingress to a gadget sink. +    for (const auto &N : Graph->nodes()) { +      if (ElimNodes.contains(N)) +        continue; +      for (const auto &E : N.edges()) { +        if (IsCFGEdge(E)) { +          if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) { +            if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue()) +              CheapestSoFar = &E; +          } +        } +      } +    } + +    assert(CheapestSoFar && "Failed to cut an edge"); +    CutEdges.insert(*CheapestSoFar); +    ElimEdges.insert(*CheapestSoFar); +  } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes)); +  LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); +  LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); + +  LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n"); +  int FencesInserted = insertFences(MF, *Graph, CutEdges); +  LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); +  LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); + +  return FencesInserted; +} + +int X86LoadValueInjectionLoadHardeningPass::insertFences( +    MachineFunction &MF, MachineGadgetGraph &G, +    EdgeSet &CutEdges /* in, out */) const { +  int FencesInserted = 0; +  for (const auto &N : G.nodes()) { +    for (const auto &E : N.edges()) { +      if (CutEdges.contains(E)) { +        MachineInstr *MI = N.getValue(), *Prev; +        MachineBasicBlock *MBB;                  // Insert an LFENCE in this MBB +        MachineBasicBlock::iterator InsertionPt; // ...at this point +        if (MI == MachineGadgetGraph::ArgNodeSentinel) { +          // insert LFENCE at beginning of entry block +          MBB = &MF.front(); +          InsertionPt = MBB->begin(); +          Prev = nullptr; +        } else if (MI->isBranch()) { // insert the LFENCE before the branch +          MBB = MI->getParent(); +          InsertionPt = MI; +          Prev = MI->getPrevNode(); +          // Remove all egress CFG edges from this branch because the inserted +          // LFENCE prevents gadgets from crossing the branch. +          for (const auto &E : N.edges()) { +            if (MachineGadgetGraph::isCFGEdge(E)) +              CutEdges.insert(E); +          } +        } else { // insert the LFENCE after the instruction +          MBB = MI->getParent(); +          InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end(); +          Prev = InsertionPt == MBB->end() +                     ? (MBB->empty() ? nullptr : &MBB->back()) +                     : InsertionPt->getPrevNode(); +        } +        // Ensure this insertion is not redundant (two LFENCEs in sequence). 
+        if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) && +            (!Prev || !isFence(Prev))) { +          BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE)); +          ++FencesInserted; +        } +      } +    } +  } +  return FencesInserted; +} + +bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory( +    const MachineInstr &MI, unsigned Reg) const { +  if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE || +      MI.getOpcode() == X86::SFENCE || MI.getOpcode() == X86::LFENCE) +    return false; + +  // FIXME: This does not handle pseudo loading instruction like TCRETURN* +  const MCInstrDesc &Desc = MI.getDesc(); +  int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); +  if (MemRefBeginIdx < 0) { +    LLVM_DEBUG(dbgs() << "Warning: unable to obtain memory operand for loading " +                         "instruction:\n"; +               MI.print(dbgs()); dbgs() << '\n';); +    return false; +  } +  MemRefBeginIdx += X86II::getOperandBias(Desc); + +  const MachineOperand &BaseMO = +      MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); +  const MachineOperand &IndexMO = +      MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); +  return (BaseMO.isReg() && BaseMO.getReg() != X86::NoRegister && +          TRI->regsOverlap(BaseMO.getReg(), Reg)) || +         (IndexMO.isReg() && IndexMO.getReg() != X86::NoRegister && +          TRI->regsOverlap(IndexMO.getReg(), Reg)); +} + +bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch( +    const MachineInstr &MI, unsigned Reg) const { +  if (!MI.isConditionalBranch()) +    return false; +  for (const MachineOperand &Use : MI.uses()) +    if (Use.isReg() && Use.getReg() == Reg) +      return true; +  return false; +} + +INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, +                      "X86 LVI load hardening", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, +                    "X86 LVI load hardening", false, false) + +FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() { +  return new X86LoadValueInjectionLoadHardeningPass(); +} + +namespace { + +/// The `X86LoadValueInjectionLoadHardeningPass` above depends on expensive +/// analysis passes that add complexity to the pipeline. This complexity +/// can cause noticable overhead when no optimizations are enabled, i.e., -O0. +/// The purpose of `X86LoadValueInjectionLoadHardeningUnoptimizedPass` is to +/// provide the same security as the optimized pass, but without adding +/// unnecessary complexity to the LLVM pipeline. +/// +/// The behavior of this pass is simply to insert an LFENCE after every load +/// instruction. 
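In effect, every instruction that may load is immediately followed by an LFENCE, with no attempt to prove a particular load harmless. A toy illustration on plain mnemonic strings (the load test is an ad-hoc stand-in for MachineInstr::mayLoad(), not the pass's real logic):

    #include <string>
    #include <vector>

    std::vector<std::string> hardenAllLoads(const std::vector<std::string> &Insts) {
      std::vector<std::string> Out;
      for (const std::string &I : Insts) {
        Out.push_back(I);
        // Crude stand-in for mayLoad(): an AT&T-syntax mov with a memory
        // source operand, e.g. "mov (%rdi), %rax".
        if (I.rfind("mov (", 0) == 0)
          Out.push_back("lfence");
      }
      return Out;
    }
    // {"mov (%rdi), %rax", "add $1, %rax"} becomes
    // {"mov (%rdi), %rax", "lfence", "add $1, %rax"}.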
+class X86LoadValueInjectionLoadHardeningUnoptimizedPass +    : public MachineFunctionPass { +public: +  X86LoadValueInjectionLoadHardeningUnoptimizedPass() +      : MachineFunctionPass(ID) {} + +  StringRef getPassName() const override { +    return "X86 Load Value Injection (LVI) Load Hardening (Unoptimized)"; +  } +  bool runOnMachineFunction(MachineFunction &MF) override; +  static char ID; +}; + +} // end anonymous namespace + +char X86LoadValueInjectionLoadHardeningUnoptimizedPass::ID = 0; + +bool X86LoadValueInjectionLoadHardeningUnoptimizedPass::runOnMachineFunction( +    MachineFunction &MF) { +  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() +                    << " *****\n"); +  const X86Subtarget *STI = &MF.getSubtarget<X86Subtarget>(); +  if (!STI->useLVILoadHardening()) +    return false; + +  // FIXME: support 32-bit +  if (!STI->is64Bit()) +    report_fatal_error("LVI load hardening is only supported on 64-bit", false); + +  // Don't skip functions with the "optnone" attr but participate in opt-bisect. +  const Function &F = MF.getFunction(); +  if (!F.hasOptNone() && skipFunction(F)) +    return false; + +  bool Modified = false; +  ++NumFunctionsConsidered; + +  const TargetInstrInfo *TII = STI->getInstrInfo(); +  for (auto &MBB : MF) { +    for (auto &MI : MBB) { +      if (!MI.mayLoad() || MI.getOpcode() == X86::LFENCE || +          MI.getOpcode() == X86::MFENCE) +        continue; + +      MachineBasicBlock::iterator InsertionPt = +          MI.getNextNode() ? MI.getNextNode() : MBB.end(); +      BuildMI(MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE)); +      ++NumFences; +      Modified = true; +    } +  } + +  if (Modified) +    ++NumFunctionsMitigated; + +  return Modified; +} + +INITIALIZE_PASS(X86LoadValueInjectionLoadHardeningUnoptimizedPass, PASS_KEY, +                "X86 LVI load hardening", false, false) + +FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningUnoptimizedPass() { +  return new X86LoadValueInjectionLoadHardeningUnoptimizedPass(); +} diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp new file mode 100644 index 000000000000..6e1134a25950 --- /dev/null +++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp @@ -0,0 +1,143 @@ +//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Description: Replaces every `ret` instruction with the sequence: +/// ``` +/// pop <scratch-reg> +/// lfence +/// jmp *<scratch-reg> +/// ``` +/// where `<scratch-reg>` is some available scratch register, according to the +/// calling convention of the function being mitigated. 
+/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include <bitset> + +using namespace llvm; + +#define PASS_KEY "x86-lvi-ret" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation"); +STATISTIC(NumFunctionsConsidered, "Number of functions analyzed"); +STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations " +                                 "were deployed"); + +namespace { + +class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass { +public: +  X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {} +  StringRef getPassName() const override { +    return "X86 Load Value Injection (LVI) Ret-Hardening"; +  } +  bool runOnMachineFunction(MachineFunction &MF) override; + +  static char ID; +}; + +} // end anonymous namespace + +char X86LoadValueInjectionRetHardeningPass::ID = 0; + +bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction( +    MachineFunction &MF) { +  LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() +                    << " *****\n"); +  const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>(); +  if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit()) +    return false; // FIXME: support 32-bit + +  // Don't skip functions with the "optnone" attr but participate in opt-bisect. +  const Function &F = MF.getFunction(); +  if (!F.hasOptNone() && skipFunction(F)) +    return false; + +  ++NumFunctionsConsidered; +  const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); +  const X86InstrInfo *TII = Subtarget->getInstrInfo(); +  unsigned ClobberReg = X86::NoRegister; +  std::bitset<X86::NUM_TARGET_REGS> UnclobberableGR64s; +  UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer +  UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer +  UnclobberableGR64s.set(X86::RAX); // used for function return +  UnclobberableGR64s.set(X86::RDX); // used for function return + +  // We can clobber any register allowed by the function's calling convention. 
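Once a usable scratch register is found (below), the rewrite described in this file's header comment amounts to replacing each ret with a pop into that register, an LFENCE, and an indirect jump through it, so the return address loaded from the stack is fenced before it is consumed. A toy sketch at the level of textual assembly (the helper and the register name are purely illustrative):

    #include <string>
    #include <vector>

    // ret-hardening rewrite: "retq" -> pop / lfence / jmp through a scratch reg.
    std::vector<std::string> hardenReturn(const std::string &ScratchReg) {
      return {
          "popq  %" + ScratchReg,  // pop the return address into the scratch reg
          "lfence",                // fence before the loaded value can be used
          "jmpq  *%" + ScratchReg, // return via an indirect jump
      };
    }
    // hardenReturn("rcx") yields {"popq  %rcx", "lfence", "jmpq  *%rcx"}.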
+  for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR) +    UnclobberableGR64s.set(Reg); +  for (auto &Reg : X86::GR64RegClass) { +    if (!UnclobberableGR64s.test(Reg)) { +      ClobberReg = Reg; +      break; +    } +  } + +  if (ClobberReg != X86::NoRegister) { +    LLVM_DEBUG(dbgs() << "Selected register " +                      << Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg) +                      << " to clobber\n"); +  } else { +    LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n"); +  } + +  bool Modified = false; +  for (auto &MBB : MF) { +    if (MBB.empty()) +      continue; + +    MachineInstr &MI = MBB.back(); +    if (MI.getOpcode() != X86::RETQ) +      continue; + +    if (ClobberReg != X86::NoRegister) { +      MBB.erase_instr(&MI); +      BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r)) +          .addReg(ClobberReg, RegState::Define) +          .setMIFlag(MachineInstr::FrameDestroy); +      BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE)); +      BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r)) +          .addReg(ClobberReg); +    } else { +      // In case there is no available scratch register, we can still read from +      // RSP to assert that RSP points to a valid page. The write to RSP is +      // also helpful because it verifies that the stack's write permissions +      // are intact. +      MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE)); +      addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)), +                   X86::RSP, false, 0) +          .addImm(0) +          ->addRegisterDead(X86::EFLAGS, TRI); +    } + +    ++NumFences; +    Modified = true; +  } + +  if (Modified) +    ++NumFunctionsMitigated; +  return Modified; +} + +INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY, +                "X86 LVI ret hardener", false, false) + +FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() { +  return new X86LoadValueInjectionRetHardeningPass(); +} diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 7f49c6e861d4..f5caaaae4d84 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1220,8 +1220,8 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,        break;      case MachineOperand::MO_Register:        // FIXME: Add retpoline support and remove this. -      if (Subtarget->useRetpolineIndirectCalls()) -        report_fatal_error("Lowering register statepoints with retpoline not " +      if (Subtarget->useIndirectThunkCalls()) +        report_fatal_error("Lowering register statepoints with thunks not "                             "yet implemented.");        CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());        CallOpcode = X86::CALL64r; @@ -1399,9 +1399,9 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,      EmitAndCountInstruction(          MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));      // FIXME: Add retpoline support and remove this. 
-    if (Subtarget->useRetpolineIndirectCalls()) +    if (Subtarget->useIndirectThunkCalls())        report_fatal_error( -          "Lowering patchpoint with retpoline not yet implemented."); +          "Lowering patchpoint with thunks not yet implemented.");      EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));    } diff --git a/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/llvm/lib/Target/X86/X86RetpolineThunks.cpp deleted file mode 100644 index 9085d7f068ac..000000000000 --- a/llvm/lib/Target/X86/X86RetpolineThunks.cpp +++ /dev/null @@ -1,286 +0,0 @@ -//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86  --=====// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// -/// Pass that injects an MI thunk implementing a "retpoline". This is -/// a RET-implemented trampoline that is used to lower indirect calls in a way -/// that prevents speculation on some x86 processors and can be used to mitigate -/// security vulnerabilities due to targeted speculative execution and side -/// channels such as CVE-2017-5715. -/// -/// TODO(chandlerc): All of this code could use better comments and -/// documentation. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "x86-retpoline-thunks" - -static const char ThunkNamePrefix[] = "__llvm_retpoline_"; -static const char R11ThunkName[]    = "__llvm_retpoline_r11"; -static const char EAXThunkName[]    = "__llvm_retpoline_eax"; -static const char ECXThunkName[]    = "__llvm_retpoline_ecx"; -static const char EDXThunkName[]    = "__llvm_retpoline_edx"; -static const char EDIThunkName[]    = "__llvm_retpoline_edi"; - -namespace { -class X86RetpolineThunks : public MachineFunctionPass { -public: -  static char ID; - -  X86RetpolineThunks() : MachineFunctionPass(ID) {} - -  StringRef getPassName() const override { return "X86 Retpoline Thunks"; } - -  bool doInitialization(Module &M) override; -  bool runOnMachineFunction(MachineFunction &F) override; - -  void getAnalysisUsage(AnalysisUsage &AU) const override { -    MachineFunctionPass::getAnalysisUsage(AU); -    AU.addRequired<MachineModuleInfoWrapperPass>(); -    AU.addPreserved<MachineModuleInfoWrapperPass>(); -  } - -private: -  MachineModuleInfo *MMI = nullptr; -  const TargetMachine *TM = nullptr; -  bool Is64Bit = false; -  const X86Subtarget *STI = nullptr; -  const X86InstrInfo *TII = nullptr; - -  bool InsertedThunks = false; - -  void createThunkFunction(Module &M, StringRef Name); -  void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); -  void populateThunk(MachineFunction &MF, unsigned Reg); -}; - -} // end anonymous namespace - -FunctionPass *llvm::createX86RetpolineThunksPass() { -  return new X86RetpolineThunks(); -} 
- -char X86RetpolineThunks::ID = 0; - -bool X86RetpolineThunks::doInitialization(Module &M) { -  InsertedThunks = false; -  return false; -} - -bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { -  LLVM_DEBUG(dbgs() << getPassName() << '\n'); - -  TM = &MF.getTarget();; -  STI = &MF.getSubtarget<X86Subtarget>(); -  TII = STI->getInstrInfo(); -  Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; - -  MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); -  Module &M = const_cast<Module &>(*MMI->getModule()); - -  // If this function is not a thunk, check to see if we need to insert -  // a thunk. -  if (!MF.getName().startswith(ThunkNamePrefix)) { -    // If we've already inserted a thunk, nothing else to do. -    if (InsertedThunks) -      return false; - -    // Only add a thunk if one of the functions has the retpoline feature -    // enabled in its subtarget, and doesn't enable external thunks. -    // FIXME: Conditionalize on indirect calls so we don't emit a thunk when -    // nothing will end up calling it. -    // FIXME: It's a little silly to look at every function just to enumerate -    // the subtargets, but eventually we'll want to look at them for indirect -    // calls, so maybe this is OK. -    if ((!STI->useRetpolineIndirectCalls() && -         !STI->useRetpolineIndirectBranches()) || -        STI->useRetpolineExternalThunk()) -      return false; - -    // Otherwise, we need to insert the thunk. -    // WARNING: This is not really a well behaving thing to do in a function -    // pass. We extract the module and insert a new function (and machine -    // function) directly into the module. -    if (Is64Bit) -      createThunkFunction(M, R11ThunkName); -    else -      for (StringRef Name : -           {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) -        createThunkFunction(M, Name); -    InsertedThunks = true; -    return true; -  } - -  // If this *is* a thunk function, we need to populate it with the correct MI. -  if (Is64Bit) { -    assert(MF.getName() == "__llvm_retpoline_r11" && -           "Should only have an r11 thunk on 64-bit targets"); - -    // __llvm_retpoline_r11: -    //   callq .Lr11_call_target -    // .Lr11_capture_spec: -    //   pause -    //   lfence -    //   jmp .Lr11_capture_spec -    // .align 16 -    // .Lr11_call_target: -    //   movq %r11, (%rsp) -    //   retq -    populateThunk(MF, X86::R11); -  } else { -    // For 32-bit targets we need to emit a collection of thunks for various -    // possible scratch registers as well as a fallback that uses EDI, which is -    // normally callee saved. -    //   __llvm_retpoline_eax: -    //         calll .Leax_call_target -    //   .Leax_capture_spec: -    //         pause -    //         jmp .Leax_capture_spec -    //   .align 16 -    //   .Leax_call_target: -    //         movl %eax, (%esp)  # Clobber return addr -    //         retl -    // -    //   __llvm_retpoline_ecx: -    //   ... # Same setup -    //         movl %ecx, (%esp) -    //         retl -    // -    //   __llvm_retpoline_edx: -    //   ... # Same setup -    //         movl %edx, (%esp) -    //         retl -    // -    //   __llvm_retpoline_edi: -    //   ... 
# Same setup -    //         movl %edi, (%esp) -    //         retl -    if (MF.getName() == EAXThunkName) -      populateThunk(MF, X86::EAX); -    else if (MF.getName() == ECXThunkName) -      populateThunk(MF, X86::ECX); -    else if (MF.getName() == EDXThunkName) -      populateThunk(MF, X86::EDX); -    else if (MF.getName() == EDIThunkName) -      populateThunk(MF, X86::EDI); -    else -      llvm_unreachable("Invalid thunk name on x86-32!"); -  } - -  return true; -} - -void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) { -  assert(Name.startswith(ThunkNamePrefix) && -         "Created a thunk with an unexpected prefix!"); - -  LLVMContext &Ctx = M.getContext(); -  auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); -  Function *F = -      Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); -  F->setVisibility(GlobalValue::HiddenVisibility); -  F->setComdat(M.getOrInsertComdat(Name)); - -  // Add Attributes so that we don't create a frame, unwind information, or -  // inline. -  AttrBuilder B; -  B.addAttribute(llvm::Attribute::NoUnwind); -  B.addAttribute(llvm::Attribute::Naked); -  F->addAttributes(llvm::AttributeList::FunctionIndex, B); - -  // Populate our function a bit so that we can verify. -  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); -  IRBuilder<> Builder(Entry); - -  Builder.CreateRetVoid(); - -  // MachineFunctions/MachineBasicBlocks aren't created automatically for the -  // IR-level constructs we already made. Create them and insert them into the -  // module. -  MachineFunction &MF = MMI->getOrCreateMachineFunction(*F); -  MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry); - -  // Insert EntryMBB into MF. It's not in the module until we do this. -  MF.insert(MF.end(), EntryMBB); -} - -void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, -                                                    unsigned Reg) { -  const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr; -  const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP; -  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0) -      .addReg(Reg); -} - -void X86RetpolineThunks::populateThunk(MachineFunction &MF, -                                       unsigned Reg) { -  // Set MF properties. We never use vregs... -  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); - -  // Grab the entry MBB and erase any other blocks. O0 codegen appears to -  // generate two bbs for the entry block. -  MachineBasicBlock *Entry = &MF.front(); -  Entry->clear(); -  while (MF.size() > 1) -    MF.erase(std::next(MF.begin())); - -  MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); -  MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); -  MCSymbol *TargetSym = MF.getContext().createTempSymbol(); -  MF.push_back(CaptureSpec); -  MF.push_back(CallTarget); - -  const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; -  const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL; - -  Entry->addLiveIn(Reg); -  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym); - -  // The MIR verifier thinks that the CALL in the entry block will fall through -  // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is -  // the successor, but the MIR verifier doesn't know how to cope with that. 
-  Entry->addSuccessor(CaptureSpec); - -  // In the capture loop for speculation, we want to stop the processor from -  // speculating as fast as possible. On Intel processors, the PAUSE instruction -  // will block speculation without consuming any execution resources. On AMD -  // processors, the PAUSE instruction is (essentially) a nop, so we also use an -  // LFENCE instruction which they have advised will stop speculation as well -  // with minimal resource utilization. We still end the capture with a jump to -  // form an infinite loop to fully guarantee that no matter what implementation -  // of the x86 ISA, speculating this code path never escapes. -  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE)); -  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE)); -  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec); -  CaptureSpec->setHasAddressTaken(); -  CaptureSpec->addSuccessor(CaptureSpec); - -  CallTarget->addLiveIn(Reg); -  CallTarget->setHasAddressTaken(); -  CallTarget->setAlignment(Align(16)); -  insertRegReturnAddrClobber(*CallTarget, Reg); -  CallTarget->back().setPreInstrSymbol(MF, TargetSym); -  BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); -} diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index f4e8d30328ca..af5153243c8b 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -421,6 +421,16 @@ protected:    /// than emitting one inside the compiler.    bool UseRetpolineExternalThunk = false; +  /// Prevent generation of indirect call/branch instructions from memory, +  /// and force all indirect call/branch instructions from a register to be +  /// preceded by an LFENCE. Also decompose RET instructions into a +  /// POP+LFENCE+JMP sequence. +  bool UseLVIControlFlowIntegrity = false; + +  /// Insert LFENCE instructions to prevent data speculatively injected into +  /// loads from being used maliciously. +  bool UseLVILoadHardening = false; +    /// Use software floating point for code generation.    bool UseSoftFloat = false; @@ -707,8 +717,21 @@ public:      return UseRetpolineIndirectBranches;    }    bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } + +  // These are generic getters that OR together all of the thunk types +  // supported by the subtarget. Therefore useIndirectThunk*() will return true +  // if any respective thunk feature is enabled. +  bool useIndirectThunkCalls() const { +    return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity(); +  } +  bool useIndirectThunkBranches() const { +    return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); +  } +    bool preferMaskRegisters() const { return PreferMaskRegisters; }    bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } +  bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } +  bool useLVILoadHardening() const { return UseLVILoadHardening; }    unsigned getPreferVectorWidth() const { return PreferVectorWidth; }    unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -853,10 +876,10 @@ public:    /// Return true if the subtarget allows calls to immediate address.    bool isLegalToCallImmediateAddr() const; -  /// If we are using retpolines, we need to expand indirectbr to avoid it +  /// If we are using indirect thunks, we need to expand indirectbr to avoid it    /// lowering to an actual indirect jump.    
bool enableIndirectBrExpand() const override { -    return useRetpolineIndirectBranches(); +    return useIndirectThunkBranches();    }    /// Enable the MachineScheduler pass for all X86 subtargets. diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 7176e46f07b1..9f639ffa22ec 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -82,6 +82,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {    initializeX86SpeculativeLoadHardeningPassPass(PR);    initializeX86FlagsCopyLoweringPassPass(PR);    initializeX86CondBrFoldingPassPass(PR); +  initializeX86LoadValueInjectionLoadHardeningPassPass(PR); +  initializeX86LoadValueInjectionRetHardeningPassPass(PR);    initializeX86OptimizeLEAPassPass(PR);  } @@ -496,6 +498,10 @@ void X86PassConfig::addMachineSSAOptimization() {  void X86PassConfig::addPostRegAlloc() {    addPass(createX86FloatingPointStackifierPass()); +  if (getOptLevel() != CodeGenOpt::None) +    addPass(createX86LoadValueInjectionLoadHardeningPass()); +  else +    addPass(createX86LoadValueInjectionLoadHardeningUnoptimizedPass());  }  void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } @@ -525,7 +531,7 @@ void X86PassConfig::addPreEmitPass2() {    const Triple &TT = TM->getTargetTriple();    const MCAsmInfo *MAI = TM->getMCAsmInfo(); -  addPass(createX86RetpolineThunksPass()); +  addPass(createX86IndirectThunksPass());    // Insert extra int3 instructions after trailing call instructions to avoid    // issues in the unwinder. @@ -542,6 +548,7 @@ void X86PassConfig::addPreEmitPass2() {    // Identify valid longjmp targets for Windows Control Flow Guard.    if (TT.isOSWindows())      addPass(createCFGuardLongjmpPass()); +  addPass(createX86LoadValueInjectionRetHardeningPass());  }  std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index ec976a971e3c..23561c25c50a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1768,7 +1768,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {      Constant *C2;      // C-(C2-X) --> X+(C-C2) -    if (match(Op1, m_Sub(m_Constant(C2), m_Value(X)))) +    if (match(Op1, m_Sub(m_Constant(C2), m_Value(X))) && !isa<ConstantExpr>(C2))        return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));      // C-(X+C2) --> (C-C2)-X | 
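On the InstCombine hunk above: C-(C2-X) --> X+(C-C2) is a plain two's-complement identity, and the new !isa<ConstantExpr>(C2) guard limits the fold to simple constants, presumably because ConstantExpr::getSub over a constant expression would just produce another opaque constant expression instead of a folded immediate. A quick numeric check of the identity itself (unsigned arithmetic to mirror wrap-around):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 10, C2 = 3;
      for (uint32_t X : {0u, 7u, 0xFFFFFFFFu})
        assert(C - (C2 - X) == X + (C - C2)); // e.g. 10 - (3 - 7) == 7 + 7 == 14
      return 0;
    }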
