Diffstat (limited to 'lib/CodeGen')
191 files changed, 17212 insertions, 9353 deletions
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 44345addbcb6..69c368524a74 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -24,7 +24,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
@@ -111,18 +110,13 @@ bool AggressiveAntiDepState::IsLive(unsigned Reg)
   return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
 }
 
-
-
-AggressiveAntiDepBreaker::
-AggressiveAntiDepBreaker(MachineFunction& MFi,
-                         const RegisterClassInfo &RCI,
-                         TargetSubtargetInfo::RegClassVector& CriticalPathRCs) :
-  AntiDepBreaker(), MF(MFi),
-  MRI(MF.getRegInfo()),
-  TII(MF.getTarget().getInstrInfo()),
-  TRI(MF.getTarget().getRegisterInfo()),
-  RegClassInfo(RCI),
-  State(nullptr) {
+AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
+    MachineFunction &MFi, const RegisterClassInfo &RCI,
+    TargetSubtargetInfo::RegClassVector &CriticalPathRCs)
+    : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
+      TII(MF.getSubtarget().getInstrInfo()),
+      TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+      State(nullptr) {
   /* Collect a bitset of all registers that are only broken if they
      are on the critical path. */
   for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
@@ -262,11 +256,8 @@ static void AntiDepEdges(const SUnit *SU, std::vector<const SDep*>& Edges) {
   for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
        P != PE; ++P) {
     if ((P->getKind() == SDep::Anti) || (P->getKind() == SDep::Output)) {
-      unsigned Reg = P->getReg();
-      if (RegSet.count(Reg) == 0) {
+      if (RegSet.insert(P->getReg()).second)
         Edges.push_back(&*P);
-        RegSet.insert(Reg);
-      }
     }
   }
 }
@@ -527,7 +518,7 @@ BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
       BV &= RCBV;
     }
 
-    DEBUG(dbgs() << " " << RC->getName());
+    DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
   }
 
   return BV;
 }
@@ -620,8 +611,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
 
   DEBUG(dbgs() << "\tFind Registers:");
 
-  if (RenameOrder.count(SuperRC) == 0)
-    RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
+  RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
 
   unsigned OrigR = RenameOrder[SuperRC];
   unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);
@@ -683,6 +673,21 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
         goto next_super_reg;
       }
 
+      // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also
+      // defines 'NewReg' via an early-clobber operand.
+      auto Range = RegRefs.equal_range(Reg);
+      for (auto Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+        auto UseMI = Q->second.Operand->getParent();
+        int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI);
+        if (Idx == -1)
+          continue;
+
+        if (UseMI->getOperand(Idx).isEarlyClobber()) {
+          DEBUG(dbgs() << "(ec)");
+          goto next_super_reg;
+        }
+      }
+
       // Record that 'Reg' can be renamed to 'NewReg'.
       RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
     }
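Aside: two hunks above replace a count()-then-insert() sequence with a single insert(), branching on the .second of the returned pair. A minimal stand-alone sketch of the idiom (toy function, not LLVM code):

    #include <set>
    #include <vector>

    // Keep only the first occurrence of each value. std::set::insert returns a
    // pair<iterator, bool>; .second is true only when the element was actually
    // inserted, i.e. this is the first time the value is seen.
    std::vector<int> keepFirstOccurrences(const std::vector<int> &In) {
      std::set<int> Seen;
      std::vector<int> Out;
      for (int V : In)
        if (Seen.insert(V).second)
          Out.push_back(V);
      return Out;
    }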
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 2ab9d89574e4..12cf95b9b4f9 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
-#define LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#ifndef LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
 
 #include "AntiDepBreaker.h"
 #include "llvm/ADT/BitVector.h"
@@ -32,88 +32,83 @@ namespace llvm {
 class RegisterClassInfo;
 
-  /// Class AggressiveAntiDepState
   /// Contains all the state necessary for anti-dep breaking.
   class AggressiveAntiDepState {
   public:
-    /// RegisterReference - Information about a register reference
-    /// within a liverange
+    /// Information about a register reference within a liverange
     typedef struct {
-      /// Operand - The registers operand
+      /// The registers operand
       MachineOperand *Operand;
-      /// RC - The register class
+      /// The register class
       const TargetRegisterClass *RC;
     } RegisterReference;
 
   private:
-    /// NumTargetRegs - Number of non-virtual target registers
-    /// (i.e. TRI->getNumRegs()).
+    /// Number of non-virtual target registers (i.e. TRI->getNumRegs()).
     const unsigned NumTargetRegs;
 
-    /// GroupNodes - Implements a disjoint-union data structure to
+    /// Implements a disjoint-union data structure to
     /// form register groups. A node is represented by an index into
     /// the vector. A node can "point to" itself to indicate that it
    /// is the parent of a group, or point to another node to indicate
     /// that it is a member of the same group as that node.
     std::vector<unsigned> GroupNodes;
 
-    /// GroupNodeIndices - For each register, the index of the GroupNode
+    /// For each register, the index of the GroupNode
     /// currently representing the group that the register belongs to.
     /// Register 0 is always represented by the 0 group, a group
     /// composed of registers that are not eligible for anti-aliasing.
     std::vector<unsigned> GroupNodeIndices;
 
-    /// RegRefs - Map registers to all their references within a live range.
+    /// Map registers to all their references within a live range.
    std::multimap<unsigned, RegisterReference> RegRefs;
 
-    /// KillIndices - The index of the most recent kill (proceding bottom-up),
+    /// The index of the most recent kill (proceding bottom-up),
     /// or ~0u if the register is not live.
     std::vector<unsigned> KillIndices;
 
-    /// DefIndices - The index of the most recent complete def (proceding bottom
+    /// The index of the most recent complete def (proceding bottom
     /// up), or ~0u if the register is live.
     std::vector<unsigned> DefIndices;
 
   public:
     AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
 
-    /// GetKillIndices - Return the kill indices.
+    /// Return the kill indices.
     std::vector<unsigned> &GetKillIndices() { return KillIndices; }
 
-    /// GetDefIndices - Return the define indices.
+    /// Return the define indices.
     std::vector<unsigned> &GetDefIndices() { return DefIndices; }
 
-    /// GetRegRefs - Return the RegRefs map.
+    /// Return the RegRefs map.
     std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
 
-    // GetGroup - Get the group for a register. The returned value is
+    // Get the group for a register. The returned value is
     // the index of the GroupNode representing the group.
     unsigned GetGroup(unsigned Reg);
 
-    // GetGroupRegs - Return a vector of the registers belonging to a
-    // group. If RegRefs is non-NULL then only included referenced registers.
+    // Return a vector of the registers belonging to a group.
+    // If RegRefs is non-NULL then only included referenced registers.
     void GetGroupRegs(
        unsigned Group,
        std::vector<unsigned> &Regs,
       std::multimap<unsigned,
          AggressiveAntiDepState::RegisterReference> *RegRefs);
 
-    // UnionGroups - Union Reg1's and Reg2's groups to form a new
-    // group. Return the index of the GroupNode representing the
-    // group.
+    // Union Reg1's and Reg2's groups to form a new group.
+    // Return the index of the GroupNode representing the group.
     unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
 
-    // LeaveGroup - Remove a register from its current group and place
+    // Remove a register from its current group and place
     // it alone in its own group. Return the index of the GroupNode
     // representing the registers new group.
     unsigned LeaveGroup(unsigned Reg);
 
-    /// IsLive - Return true if Reg is live
+    /// Return true if Reg is live.
     bool IsLive(unsigned Reg);
   };
 
-  /// Class AggressiveAntiDepBreaker
   class AggressiveAntiDepBreaker : public AntiDepBreaker {
     MachineFunction& MF;
     MachineRegisterInfo &MRI;
@@ -121,12 +116,11 @@ class RegisterClassInfo;
     const TargetRegisterInfo *TRI;
     const RegisterClassInfo &RegClassInfo;
 
-    /// CriticalPathSet - The set of registers that should only be
+    /// The set of registers that should only be
     /// renamed if they are on the critical path.
     BitVector CriticalPathSet;
 
-    /// State - The state used to identify and rename anti-dependence
-    /// registers.
+    /// The state used to identify and rename anti-dependence registers.
     AggressiveAntiDepState *State;
 
   public:
@@ -135,11 +129,10 @@ class RegisterClassInfo;
                               TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
     ~AggressiveAntiDepBreaker();
 
-    /// Start - Initialize anti-dep breaking for a new basic block.
+    /// Initialize anti-dep breaking for a new basic block.
     void StartBlock(MachineBasicBlock *BB) override;
 
-    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
-    /// path
+    /// Identifiy anti-dependencies along the critical path
     /// of the ScheduleDAG and break them by renaming registers.
     ///
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
@@ -148,24 +141,24 @@ class RegisterClassInfo;
                                    unsigned InsertPosIndex,
                                    DbgValueVector &DbgValues) override;
 
-    /// Observe - Update liveness information to account for the current
+    /// Update liveness information to account for the current
     /// instruction, which will not be scheduled.
     ///
     void Observe(MachineInstr *MI, unsigned Count,
                  unsigned InsertPosIndex) override;
 
-    /// Finish - Finish anti-dep breaking for a basic block.
+    /// Finish anti-dep breaking for a basic block.
     void FinishBlock() override;
 
   private:
     /// Keep track of a position in the allocation order for each regclass.
     typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType;
 
-    /// IsImplicitDefUse - Return true if MO represents a register
+    /// Return true if MO represents a register
     /// that is both implicitly used and defined in MI
     bool IsImplicitDefUse(MachineInstr *MI, MachineOperand& MO);
 
-    /// GetPassthruRegs - If MI implicitly def/uses a register, then
+    /// If MI implicitly def/uses a register, then
     /// return that register and all subregisters.
     void GetPassthruRegs(MachineInstr *MI, std::set<unsigned>& PassthruRegs);
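Aside: the GroupNodes/GroupNodeIndices comments in the header above describe a disjoint-union (union-find) structure over registers. A minimal sketch of that scheme as the comments describe it (toy code with hypothetical names, not the class's actual implementation):

    #include <numeric>
    #include <vector>

    struct RegGroups {
      // Parent[i] == i marks a group leader; otherwise Parent[i] points at
      // another member of the same group.
      std::vector<unsigned> Parent;

      explicit RegGroups(unsigned NumRegs) : Parent(NumRegs) {
        std::iota(Parent.begin(), Parent.end(), 0u); // each reg in its own group
      }

      // Find the group leader, compressing the path as we go.
      unsigned GetGroup(unsigned Reg) {
        while (Parent[Reg] != Reg) {
          Parent[Reg] = Parent[Parent[Reg]];
          Reg = Parent[Reg];
        }
        return Reg;
      }

      // Merge the two groups and return the index of the combined leader.
      unsigned UnionGroups(unsigned R1, unsigned R2) {
        unsigned G1 = GetGroup(R1), G2 = GetGroup(R2);
        Parent[G2] = G1;
        return G1;
      }
    };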
diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h
index 64ff2a7ce836..1e4eaa76ee7f 100644
--- a/lib/CodeGen/AllocationOrder.h
+++ b/lib/CodeGen/AllocationOrder.h
@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ALLOCATIONORDER_H
-#define LLVM_CODEGEN_ALLOCATIONORDER_H
+#ifndef LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+#define LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCRegisterInfo.h"
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 0eabee30e964..2e8af9eb0048 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -25,12 +25,14 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
 using namespace llvm;
 
-/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
-/// of insertvalue or extractvalue indices that identify a member, return
-/// the linearized index of the start of the member.
-///
+/// Compute the linearized index of a member in a nested aggregate/struct/array
+/// by recursing and accumulating CurIndex as long as there are indices in the
+/// index list.
 unsigned llvm::ComputeLinearIndex(Type *Ty,
                                   const unsigned *Indices,
                                   const unsigned *IndicesEnd,
@@ -49,16 +51,23 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
         return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex);
       CurIndex = ComputeLinearIndex(*EI, nullptr, nullptr, CurIndex);
     }
+    assert(!Indices && "Unexpected out of bound");
     return CurIndex;
   }
   // Given an array type, recursively traverse the elements.
   else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     Type *EltTy = ATy->getElementType();
-    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
-      if (Indices && *Indices == i)
-        return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
-      CurIndex = ComputeLinearIndex(EltTy, nullptr, nullptr, CurIndex);
+    unsigned NumElts = ATy->getNumElements();
+    // Compute the Linear offset when jumping one element of the array
+    unsigned EltLinearOffset = ComputeLinearIndex(EltTy, nullptr, nullptr, 0);
+    if (Indices) {
+      assert(*Indices < NumElts && "Unexpected out of bound");
+      // If the indice is inside the array, compute the index to the requested
+      // elt and recurse inside the element with the end of the indices list
+      CurIndex += EltLinearOffset* *Indices;
+      return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
     }
+    CurIndex += EltLinearOffset*NumElts;
     return CurIndex;
   }
   // We haven't found the type we're looking for, so keep searching.
@@ -106,15 +115,16 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
 }
 
 /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
-GlobalVariable *llvm::ExtractTypeInfo(Value *V) {
+GlobalValue *llvm::ExtractTypeInfo(Value *V) {
   V = V->stripPointerCasts();
-  GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+  GlobalValue *GV = dyn_cast<GlobalValue>(V);
+  GlobalVariable *Var = dyn_cast<GlobalVariable>(V);
 
-  if (GV && GV->getName() == "llvm.eh.catch.all.value") {
-    assert(GV->hasInitializer() &&
+  if (Var && Var->getName() == "llvm.eh.catch.all.value") {
+    assert(Var->hasInitializer() &&
            "The EH catch-all value must have an initializer");
-    Value *Init = GV->getInitializer();
-    GV = dyn_cast<GlobalVariable>(Init);
+    Value *Init = Var->getInitializer();
+    GV = dyn_cast<GlobalValue>(Init);
     if (!GV) V = cast<ConstantPointerNull>(Init);
   }
@@ -508,8 +518,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
       return false;
   }
 
-  return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret,
-                                         *TM.getTargetLowering());
+  return returnTypeIsEligibleForTailCall(
+      ExitBB->getParent(), I, Ret, *TM.getSubtargetImpl()->getTargetLowering());
 }
 
 bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
@@ -606,3 +616,29 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
 
   return true;
 }
+
+bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
+  if (!GV->hasLinkOnceODRLinkage())
+    return false;
+
+  if (GV->hasUnnamedAddr())
+    return true;
+
+  // If it is a non constant variable, it needs to be uniqued across shared
+  // objects.
+  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
+    if (!Var->isConstant())
+      return false;
+  }
+
+  // An alias can point to a variable. We could try to resolve the alias to
+  // decide, but for now just don't hide them.
+  if (isa<GlobalAlias>(GV))
+    return false;
+
+  GlobalStatus GS;
+  if (GlobalStatus::analyzeGlobal(GV, GS))
+    return false;
+
+  return !GS.IsCompared;
+}
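Aside: the ComputeLinearIndex hunk above stops walking every array element and instead computes the linear width of one element once, then scales it by the requested index. A simplified stand-alone model of the computation (toy type representation, not LLVM's Type API; for an array all member widths are equal, so the skip loop below collapses to the diff's EltLinearOffset * *Indices multiplication):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct Ty {
      std::vector<Ty> Members; // empty => scalar leaf, which has linear width 1

      size_t linearWidth() const {
        if (Members.empty())
          return 1;
        size_t W = 0;
        for (const Ty &M : Members)
          W += M.linearWidth();
        return W;
      }
    };

    // Linear index of the leaf selected by an extractvalue-style index path.
    size_t linearIndex(const Ty &T, const std::vector<unsigned> &Path,
                       size_t Pos = 0, size_t Cur = 0) {
      if (Pos == Path.size())
        return Cur;
      unsigned Idx = Path[Pos];
      assert(Idx < T.Members.size() && "index out of bounds");
      for (unsigned i = 0; i < Idx; ++i)   // skip earlier members by width,
        Cur += T.Members[i].linearWidth(); // not by recursing into each one
      return linearIndex(T.Members[Idx], Path, Pos + 1, Cur);
    }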
diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h
index df47f984d578..a61a8efa4da0 100644
--- a/lib/CodeGen/AntiDepBreaker.h
+++ b/lib/CodeGen/AntiDepBreaker.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ANTIDEPBREAKER_H
-#define LLVM_CODEGEN_ANTIDEPBREAKER_H
+#ifndef LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -25,9 +25,8 @@
 
 namespace llvm {
 
-/// AntiDepBreaker - This class works into conjunction with the
-/// post-RA scheduler to rename registers to break register
-/// anti-dependencies.
+/// This class works in conjunction with the post-RA scheduler to rename
+/// registers to break register anti-dependencies (WAR hazards).
 class AntiDepBreaker {
 public:
   typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
@@ -35,29 +34,26 @@ public:
 
   virtual ~AntiDepBreaker();
 
-  /// Start - Initialize anti-dep breaking for a new basic block.
+  /// Initialize anti-dep breaking for a new basic block.
   virtual void StartBlock(MachineBasicBlock *BB) =0;
 
-  /// BreakAntiDependencies - Identifiy anti-dependencies within a
-  /// basic-block region and break them by renaming registers. Return
-  /// the number of anti-dependencies broken.
-  ///
+  /// Identifiy anti-dependencies within a basic-block region and break them by
+  /// renaming registers. Return the number of anti-dependencies broken.
   virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                          MachineBasicBlock::iterator Begin,
                                          MachineBasicBlock::iterator End,
                                          unsigned InsertPosIndex,
                                          DbgValueVector &DbgValues) = 0;
 
-  /// Observe - Update liveness information to account for the current
+  /// Update liveness information to account for the current
   /// instruction, which will not be scheduled.
-  ///
   virtual void Observe(MachineInstr *MI, unsigned Count,
                        unsigned InsertPosIndex) =0;
 
-  /// Finish - Finish anti-dep breaking for a basic block.
+  /// Finish anti-dep breaking for a basic block.
   virtual void FinishBlock() =0;
 
-  /// UpdateDbgValue - Update DBG_VALUE if dependency breaker is updating
+  /// Update DBG_VALUE if dependency breaker is updating
   /// other machine instruction to use NewReg.
   void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
     assert (MI->isDebugValue() && "MI is not DBG_VALUE!");
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 251f5effd6b4..66c6c6335407 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -108,7 +108,7 @@ void ARMException::endFunction(const MachineFunction *) {
 }
 
 void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
-  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
 
   bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
@@ -121,9 +121,9 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
     Entry = TypeInfos.size();
   }
 
-  for (std::vector<const GlobalVariable *>::const_reverse_iterator
+  for (std::vector<const GlobalValue *>::const_reverse_iterator
          I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
-    const GlobalVariable *GV = *I;
+    const GlobalValue *GV = *I;
     if (VerboseAsm)
       Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--));
     Asm->EmitTTypeReference(GV, TTypeEncoding);
diff --git a/lib/CodeGen/AsmPrinter/AddressPool.h b/lib/CodeGen/AsmPrinter/AddressPool.h
index 42757d732fd5..802e050cdf6e 100644
--- a/lib/CodeGen/AsmPrinter/AddressPool.h
+++ b/lib/CodeGen/AsmPrinter/AddressPool.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_ADDRESSPOOL_H__
-#define CODEGEN_ASMPRINTER_ADDRESSPOOL_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
 
 #include "llvm/ADT/DenseMap.h"
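Aside: for readers new to the term, here is the anti-dependence (WAR, write-after-read) hazard that AntiDepBreaker renames registers to remove, in a toy sketch (illustrative code, not LLVM's MachineInstr API):

    #include <string>
    #include <vector>

    struct Instr {
      std::vector<std::string> Uses; // registers read
      std::vector<std::string> Defs; // registers written
    };

    // True if Later writes a register that Earlier reads. Such a pair, e.g.
    //   add r2, r0, r1   ; reads r0
    //   mov r0, #7       ; writes r0
    // may not be reordered; renaming the second def (mov r3, #7) removes the
    // constraint, which is exactly the transformation an anti-dep breaker does.
    bool hasAntiDependence(const Instr &Earlier, const Instr &Later) {
      for (const std::string &D : Later.Defs)
        for (const std::string &U : Earlier.Uses)
          if (D == U)
            return true;
      return false;
    }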
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 424e759caa8c..bbed8081a148 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -14,11 +14,13 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
+#include "Win64Exception.h"
 #include "WinCodeViewLineTables.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/JumpInstrTableInfo.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -49,7 +51,6 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -98,11 +99,10 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
 }
 
 AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
-  : MachineFunctionPass(ID),
-    TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()),
-    OutContext(Streamer.getContext()),
-    OutStreamer(Streamer),
-    LastMI(nullptr), LastFn(0), Counter(~0U), SetCounter(0) {
+    : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
+      MII(tm.getSubtargetImpl()->getInstrInfo()),
+      OutContext(Streamer.getContext()), OutStreamer(Streamer), LastMI(nullptr),
+      LastFn(0), Counter(~0U), SetCounter(0) {
   DD = nullptr; MMI = nullptr; LI = nullptr; MF = nullptr;
   CurrentFnSym = CurrentFnSymForSize = nullptr;
   GCMetadataPrinters = nullptr;
@@ -129,12 +129,12 @@ unsigned AsmPrinter::getFunctionNumber() const {
 }
 
 const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
-  return TM.getTargetLowering()->getObjFileLowering();
+  return TM.getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
 }
 
 /// getDataLayout - Return information about data layout.
 const DataLayout &AsmPrinter::getDataLayout() const {
-  return *TM.getDataLayout();
+  return *TM.getSubtargetImpl()->getDataLayout();
 }
 
 const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
@@ -173,9 +173,9 @@ bool AsmPrinter::doInitialization(Module &M) {
   const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
     .Initialize(OutContext, TM);
 
-  OutStreamer.InitSections();
+  OutStreamer.InitSections(false);
 
-  Mang = new Mangler(TM.getDataLayout());
+  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
 
   // Emit the version-min deplyment target directive if needed.
   //
@@ -210,7 +210,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (auto &I : *MI)
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
-      MP->beginAssembly(*this);
+      MP->beginAssembly(M, *MI, *this);
 
   // Emit module-level inline asm if it exists.
   if (!M.getModuleInlineAsm().empty()) {
@@ -222,11 +222,22 @@ bool AsmPrinter::doInitialization(Module &M) {
   }
 
   if (MAI->doesSupportDebugInformation()) {
+    bool skip_dwarf = false;
     if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment()) {
       Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this),
                                      DbgTimerName,
                                      CodeViewLineTablesGroupName));
-    } else {
+      // FIXME: Don't emit DWARF debug info if there's at least one function
+      // with AddressSanitizer instrumentation.
+      // This is a band-aid fix for PR22032.
+      for (auto &F : M.functions()) {
+        if (F.hasFnAttribute(Attribute::SanitizeAddress)) {
+          skip_dwarf = true;
+          break;
+        }
+      }
+    }
+    if (!skip_dwarf) {
       DD = new DwarfDebug(this, &M);
       Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
     }
@@ -243,8 +254,14 @@ bool AsmPrinter::doInitialization(Module &M) {
   case ExceptionHandling::ARM:
     ES = new ARMException(this);
     break;
-  case ExceptionHandling::WinEH:
-    ES = new Win64Exception(this);
+  case ExceptionHandling::ItaniumWinEH:
+  case ExceptionHandling::MSVC:
+    switch (MAI->getWinEHEncodingType()) {
+    default: llvm_unreachable("unsupported unwinding information encoding");
+    case WinEH::EncodingType::Itanium:
+      ES = new Win64Exception(this);
+      break;
+    }
     break;
   }
   if (ES)
@@ -253,33 +270,10 @@ bool AsmPrinter::doInitialization(Module &M) {
 }
 
 static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
-  GlobalValue::LinkageTypes Linkage = GV->getLinkage();
-  if (Linkage != GlobalValue::LinkOnceODRLinkage)
-    return false;
-
   if (!MAI.hasWeakDefCanBeHiddenDirective())
     return false;
 
-  if (GV->hasUnnamedAddr())
-    return true;
-
-  // This is only used for MachO, so right now it doesn't really matter how
-  // we handle alias. Revisit this once the MachO linker implements aliases.
-  if (isa<GlobalAlias>(GV))
-    return false;
-
-  // If it is a non constant variable, it needs to be uniqued across shared
-  // objects.
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
-    if (!Var->isConstant())
-      return false;
-  }
-
-  GlobalStatus GS;
-  if (!GlobalStatus::analyzeGlobal(GV, GS) && !GS.IsCompared)
-    return true;
-
-  return false;
+  return canBeOmittedFromSymbolTable(GV);
 }
 
 void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
@@ -356,12 +350,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   if (!GV->hasInitializer())   // External globals require no extra code.
     return;
 
+  GVSym->redefineIfPossible();
+  if (GVSym->isDefined() || GVSym->isVariable())
+    report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                       "' is already defined");
+
   if (MAI->hasDotTypeDotSizeDirective())
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
@@ -528,6 +527,10 @@ void AsmPrinter::EmitFunctionHeader() {
     OutStreamer.GetCommentOS() << '\n';
   }
 
+  // Emit the prefix data.
+  if (F->hasPrefixData())
+    EmitGlobalConstant(F->getPrefixData());
+
   // Emit the CurrentFnSym.  This is a virtual function to allow targets to
   // do their wild and crazy things as required.
   EmitFunctionEntryLabel();
@@ -548,21 +551,26 @@ void AsmPrinter::EmitFunctionHeader() {
       HI.Handler->beginFunction(MF);
   }
 
-  // Emit the prefix data.
-  if (F->hasPrefixData())
-    EmitGlobalConstant(F->getPrefixData());
+  // Emit the prologue data.
+  if (F->hasPrologueData())
+    EmitGlobalConstant(F->getPrologueData());
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
 /// function.  This can be overridden by targets as required to do custom stuff.
 void AsmPrinter::EmitFunctionEntryLabel() {
+  CurrentFnSym->redefineIfPossible();
+
   // The function label could have already been emitted if two symbols end up
   // conflicting due to asm renaming.  Detect this and emit an error.
-  if (CurrentFnSym->isUndefined())
-    return OutStreamer.EmitLabel(CurrentFnSym);
+  if (CurrentFnSym->isVariable())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' is a protected alias");
+  if (CurrentFnSym->isDefined())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' label emitted multiple times to assembly file");
 
-  report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
-                     "' label emitted multiple times to assembly file");
+  return OutStreamer.EmitLabel(CurrentFnSym);
 }
 
 /// emitComments - Pretty-print comments for instructions.
@@ -578,20 +586,24 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   // We assume a single instruction only has a spill or reload, not
   // both.
   const MachineMemOperand *MMO;
-  if (TM.getInstrInfo()->isLoadFromStackSlotPostFE(&MI, FI)) {
+  if (TM.getSubtargetImpl()->getInstrInfo()->isLoadFromStackSlotPostFE(&MI,
+                                                                       FI)) {
    if (FrameInfo->isSpillSlotObjectIndex(FI)) {
      MMO = *MI.memoperands_begin();
      CommentOS << MMO->getSize() << "-byte Reload\n";
    }
-  } else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, MMO, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasLoadFromStackSlot(
+                 &MI, MMO, FI)) {
    if (FrameInfo->isSpillSlotObjectIndex(FI))
      CommentOS << MMO->getSize() << "-byte Folded Reload\n";
-  } else if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&MI, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->isStoreToStackSlotPostFE(
+                 &MI, FI)) {
    if (FrameInfo->isSpillSlotObjectIndex(FI)) {
      MMO = *MI.memoperands_begin();
      CommentOS << MMO->getSize() << "-byte Spill\n";
    }
-  } else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, MMO, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasStoreToStackSlot(
                 &MI, MMO, FI)) {
    if (FrameInfo->isSpillSlotObjectIndex(FI))
      CommentOS << MMO->getSize() << "-byte Folded Spill\n";
  }
@@ -605,8 +617,9 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
 /// that is an implicit def.
 void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  OutStreamer.AddComment(Twine("implicit-def: ") +
-                         TM.getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddComment(
+      Twine("implicit-def: ") +
+      TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
   OutStreamer.AddBlankLine();
 }
 
@@ -616,7 +629,7 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
     const MachineOperand &Op = MI->getOperand(i);
     assert(Op.isReg() && "KILL instruction must have only register operands");
     Str += ' ';
-    Str += AP.TM.getRegisterInfo()->getName(Op.getReg());
+    Str += AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Op.getReg());
     Str += (Op.isDef() ? "<def>" : "<kill>");
   }
   AP.OutStreamer.AddComment(Str);
@@ -627,21 +640,27 @@ static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
 /// of DBG_VALUE, returning true if it was able to do so.  A false return
 /// means the target will need to handle MI in EmitInstruction.
 static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
-  // This code handles only the 3-operand target-independent form.
-  if (MI->getNumOperands() != 3)
+  // This code handles only the 4-operand target-independent form.
+  if (MI->getNumOperands() != 4)
     return false;
 
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
   OS << "DEBUG_VALUE: ";
 
-  DIVariable V(MI->getOperand(2).getMetadata());
+  DIVariable V = MI->getDebugVariable();
   if (V.getContext().isSubprogram()) {
     StringRef Name = DISubprogram(V.getContext()).getDisplayName();
     if (!Name.empty())
       OS << Name << ":";
   }
-  OS << V.getName() << " <- ";
+  OS << V.getName();
+
+  DIExpression Expr = MI->getDebugExpression();
+  if (Expr.isVariablePiece())
+    OS << " [piece offset=" << Expr.getPieceOffset()
+       << " size=" << Expr.getPieceSize() << "]";
+  OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
   bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
@@ -672,7 +691,8 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
       Reg = MI->getOperand(0).getReg();
     } else {
       assert(MI->getOperand(0).isFI() && "Unknown operand type");
-      const TargetFrameLowering *TFI = AP.TM.getFrameLowering();
+      const TargetFrameLowering *TFI =
+          AP.TM.getSubtargetImpl()->getFrameLowering();
       Offset += TFI->getFrameIndexReference(*AP.MF,
                                             MI->getOperand(0).getIndex(), Reg);
       Deref = true;
@@ -686,7 +706,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
     }
     if (Deref)
       OS << '[';
-    OS << AP.TM.getRegisterInfo()->getName(Reg);
+    OS << AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Reg);
   }
 
   if (Deref)
@@ -709,8 +729,7 @@ AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
 }
 
 bool AsmPrinter::needsSEHMoves() {
-  return MAI->getExceptionHandlingType() == ExceptionHandling::WinEH &&
-    MF->getFunction()->needsUnwindTableEntry();
+  return MAI->usesWindowsCFI() && MF->getFunction()->needsUnwindTableEntry();
 }
 
 void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
@@ -722,9 +741,6 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
   if (needsCFIMoves() == CFI_M_None)
     return;
 
-  if (MMI->getCompactUnwindEncoding() != 0)
-    OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
-
   const MachineModuleInfo &MMI = MF->getMMI();
   const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
   unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
@@ -732,6 +748,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
   emitCFIInstruction(CFI);
 }
 
+void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
+  // The operands are the MCSymbol and the frame offset of the allocation.
+  MCSymbol *FrameAllocSym = MI.getOperand(0).getMCSymbol();
+  int FrameOffset = MI.getOperand(1).getImm();
+
+  // Emit a symbol assignment.
+  OutStreamer.EmitAssignment(FrameAllocSym,
+                             MCConstantExpr::Create(FrameOffset, OutContext));
+}
+
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::EmitFunctionBody() {
@@ -742,12 +768,10 @@ void AsmPrinter::EmitFunctionBody() {
 
   // Print out code for the function.
   bool HasAnyRealCode = false;
-  const MachineInstr *LastMI = nullptr;
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
     EmitBasicBlockStart(MBB);
     for (auto &MI : MBB) {
-      LastMI = &MI;
 
       // Print the assembly for the instruction.
       if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
@@ -772,6 +796,10 @@ void AsmPrinter::EmitFunctionBody() {
         emitCFIInstruction(MI);
         break;
 
+      case TargetOpcode::FRAME_ALLOC:
+        emitFrameAlloc(MI);
+        break;
+
       case TargetOpcode::EH_LABEL:
       case TargetOpcode::GC_LABEL:
         OutStreamer.EmitLabel(MI.getOperand(0).getMCSymbol());
@@ -804,26 +832,22 @@ void AsmPrinter::EmitFunctionBody() {
         }
       }
     }
-  }
 
-  // If the last instruction was a prolog label, then we have a situation where
-  // we emitted a prolog but no function body. This results in the ending prolog
-  // label equaling the end of function label and an invalid "row" in the
-  // FDE. We need to emit a noop in this situation so that the FDE's rows are
-  // valid.
-  bool RequiresNoop = LastMI && LastMI->isCFIInstruction();
+    EmitBasicBlockEnd(MBB);
+  }
 
   // If the function is empty and the object file uses .subsections_via_symbols,
   // then we need to emit *something* to the function body to prevent the
   // labels from collapsing together.  Just emit a noop.
-  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) || RequiresNoop) {
+  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
     MCInst Noop;
-    TM.getInstrInfo()->getNoopForMachoTarget(Noop);
-    if (Noop.getOpcode()) {
-      OutStreamer.AddComment("avoids zero-length function");
+    TM.getSubtargetImpl()->getInstrInfo()->getNoopForMachoTarget(Noop);
+    OutStreamer.AddComment("avoids zero-length function");
+
+    // Targets can opt-out of emitting the noop here by leaving the opcode
+    // unspecified.
+    if (Noop.getOpcode())
       OutStreamer.EmitInstruction(Noop, getSubtargetInfo());
-    } else  // Target not mc-ized yet.
-      OutStreamer.EmitRawText(StringRef("\tnop\n"));
   }
 
   const Function *F = MF->getFunction();
@@ -869,8 +893,6 @@ void AsmPrinter::EmitFunctionBody() {
   OutStreamer.AddBlankLine();
 }
 
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP);
-
 bool AsmPrinter::doFinalization(Module &M) {
   // Emit global variables.
   for (const auto &G : M.globals())
@@ -895,17 +917,18 @@ bool AsmPrinter::doFinalization(Module &M) {
     unsigned Arch = Triple(getTargetTriple()).getArch();
     bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb);
     MCInst TrapInst;
-    TM.getInstrInfo()->getTrap(TrapInst);
+    TM.getSubtargetImpl()->getInstrInfo()->getTrap(TrapInst);
+    unsigned LogAlignment = llvm::Log2_64(JITI->entryByteAlignment());
+
+    // Emit the right section for these functions.
+    OutStreamer.SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
     for (const auto &KV : JITI->getTables()) {
       uint64_t Count = 0;
       for (const auto &FunPair : KV.second) {
         // Emit the function labels to make this be a function entry point.
         MCSymbol *FunSym =
           OutContext.GetOrCreateSymbol(FunPair.second->getName());
-        OutStreamer.EmitSymbolAttribute(FunSym, MCSA_Global);
-        // FIXME: JumpTableInstrInfo should store information about the required
-        // alignment of table entries and the size of the padding instruction.
-        EmitAlignment(3);
+        EmitAlignment(LogAlignment);
         if (IsThumb)
           OutStreamer.EmitThumbFunc(FunSym);
         if (MAI->hasDotTypeDotSizeDirective())
@@ -920,16 +943,16 @@ bool AsmPrinter::doFinalization(Module &M) {
         const MCSymbolRefExpr *TargetSymRef =
          MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT,
                                  OutContext);
-        TM.getInstrInfo()->getUnconditionalBranch(JumpToFun, TargetSymRef);
+        TM.getSubtargetImpl()->getInstrInfo()->getUnconditionalBranch(
+            JumpToFun, TargetSymRef);
         OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo());
         ++Count;
       }
 
       // Emit enough padding instructions to fill up to the next power of two.
-      // This assumes that the trap instruction takes 8 bytes or fewer.
       uint64_t Remaining = NextPowerOf2(Count) - Count;
       for (uint64_t C = 0; C < Remaining; ++C) {
-        EmitAlignment(3);
+        EmitAlignment(LogAlignment);
         OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo());
       }
 
@@ -976,35 +999,49 @@ bool AsmPrinter::doFinalization(Module &M) {
     }
   }
 
-  if (MAI->hasSetDirective()) {
-    OutStreamer.AddBlankLine();
-    for (const auto &Alias : M.aliases()) {
-      MCSymbol *Name = getSymbol(&Alias);
+  OutStreamer.AddBlankLine();
+  for (const auto &Alias : M.aliases()) {
+    MCSymbol *Name = getSymbol(&Alias);
 
-      if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
-        OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
-      else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
-        OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
-      else
-        assert(Alias.hasLocalLinkage() && "Invalid alias linkage");
+    if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
+      OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
+    else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
+      OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
+    else
+      assert(Alias.hasLocalLinkage() && "Invalid alias linkage");
 
-      EmitVisibility(Name, Alias.getVisibility());
+    EmitVisibility(Name, Alias.getVisibility());
 
-      // Emit the directives as assignments aka .set:
-      OutStreamer.EmitAssignment(Name,
-                                 lowerConstant(Alias.getAliasee(), *this));
-    }
+    // Emit the directives as assignments aka .set:
+    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee()));
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(**--I))
-      MP->finishAssembly(*this);
+      MP->finishAssembly(M, *MI, *this);
 
   // Emit llvm.ident metadata in an '.ident' directive.
   EmitModuleIdents(M);
 
+  // Emit __morestack address if needed for indirect calls.
+  if (MMI->usesMorestackAddr()) {
+    const MCSection *ReadOnlySection =
+        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
+                                                   /*C=*/nullptr);
+    OutStreamer.SwitchSection(ReadOnlySection);
+
+    MCSymbol *AddrSymbol =
+        OutContext.GetOrCreateSymbol(StringRef("__morestack_addr"));
+    OutStreamer.EmitLabel(AddrSymbol);
+
+    const DataLayout &DL = *TM.getSubtargetImpl()->getDataLayout();
+    unsigned PtrSize = DL.getPointerSize(0);
+    OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+                                PtrSize);
+  }
+
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable.  Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -1062,7 +1099,8 @@ void AsmPrinter::EmitConstantPool() {
     const MachineConstantPoolEntry &CPE = CP[i];
     unsigned Align = CPE.getAlignment();
 
-    SectionKind Kind = CPE.getSectionKind(TM.getDataLayout());
+    SectionKind Kind =
+        CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
 
     const Constant *C = nullptr;
     if (!CPE.isMachineConstantPoolEntry())
@@ -1115,7 +1153,8 @@ void AsmPrinter::EmitConstantPool() {
         OutStreamer.EmitZeros(NewOffset - Offset);
 
       Type *Ty = CPE.getType();
-      Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty);
+      Offset = NewOffset +
+               TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty);
       OutStreamer.EmitLabel(Sym);
 
       if (CPE.isMachineConstantPoolEntry())
@@ -1130,7 +1169,7 @@ void AsmPrinter::EmitConstantPool() {
 /// by the current function to the current output stream.
 ///
 void AsmPrinter::EmitJumpTableInfo() {
-  const DataLayout *DL = MF->getTarget().getDataLayout();
+  const DataLayout *DL = MF->getSubtarget().getDataLayout();
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
   if (!MJTI) return;
   if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
@@ -1161,7 +1200,8 @@ void AsmPrinter::EmitJumpTableInfo() {
     JTInDiffSection = true;
   }
 
-  EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getDataLayout())));
+  EmitAlignment(Log2_32(
+      MJTI->getEntryAlignment(*TM.getSubtargetImpl()->getDataLayout())));
 
   // Jump tables in code sections are marked with a data_region directive
   // where that's supported.
@@ -1174,17 +1214,17 @@ void AsmPrinter::EmitJumpTableInfo() {
     // If this jump table was deleted, ignore it.
     if (JTBBs.empty()) continue;
 
-    // For the EK_LabelDifference32 entry, if the target supports .set, emit a
-    // .set directive for each unique entry. This reduces the number of
-    // relocations the assembler will generate for the jump table.
+    // For the EK_LabelDifference32 entry, if using .set avoids a relocation,
+    /// emit a .set directive for each unique entry.
     if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
-        MAI->hasSetDirective()) {
+        MAI->doesSetDirectiveSuppressesReloc()) {
       SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
-      const TargetLowering *TLI = TM.getTargetLowering();
+      const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
      const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
      for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
        const MachineBasicBlock *MBB = JTBBs[ii];
-        if (!EmittedSets.insert(MBB)) continue;
+        if (!EmittedSets.insert(MBB).second)
+          continue;
 
        // .set LJTSet, LBB32-base
        const MCExpr *LHS =
@@ -1223,8 +1263,9 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   case MachineJumpTableInfo::EK_Inline:
     llvm_unreachable("Cannot emit EK_Inline jump table entry");
   case MachineJumpTableInfo::EK_Custom32:
-    Value = TM.getTargetLowering()->LowerCustomJumpTableEntry(MJTI, MBB, UID,
-                                                              OutContext);
+    Value =
+        TM.getSubtargetImpl()->getTargetLowering()->LowerCustomJumpTableEntry(
+            MJTI, MBB, UID, OutContext);
     break;
   case MachineJumpTableInfo::EK_BlockAddress:
     // EK_BlockAddress - Each entry is a plain address of block, e.g.:
@@ -1250,34 +1291,30 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   }
 
   case MachineJumpTableInfo::EK_LabelDifference32: {
-    // EK_LabelDifference32 - Each entry is the address of the block minus
-    // the address of the jump table.  This is used for PIC jump tables where
-    // gprel32 is not supported.  e.g.:
+    // Each entry is the address of the block minus the address of the jump
+    // table. This is used for PIC jump tables where gprel32 is not supported.
+    // e.g.:
     //      .word LBB123 - LJTI1_2
-    // If the .set directive is supported, this is emitted as:
+    // If the .set directive avoids relocations, this is emitted as:
     //      .set L4_5_set_123, LBB123 - LJTI1_2
     //      .word L4_5_set_123
-
-    // If we have emitted set directives for the jump table entries, print
-    // them rather than the entries themselves.  If we're emitting PIC, then
-    // emit the table entries as differences between two text section labels.
-    if (MAI->hasSetDirective()) {
-      // If we used .set, reference the .set's symbol.
+    if (MAI->doesSetDirectiveSuppressesReloc()) {
       Value = MCSymbolRefExpr::Create(GetJTSetSymbol(UID, MBB->getNumber()),
                                       OutContext);
       break;
     }
-    // Otherwise, use the difference as the jump table entry.
     Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
-    const MCExpr *JTI = MCSymbolRefExpr::Create(GetJTISymbol(UID), OutContext);
-    Value = MCBinaryExpr::CreateSub(Value, JTI, OutContext);
+    const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+    const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
+    Value = MCBinaryExpr::CreateSub(Value, Base, OutContext);
     break;
   }
   }
 
   assert(Value && "Unknown entry kind!");
 
-  unsigned EntrySize = MJTI->getEntrySize(*TM.getDataLayout());
+  unsigned EntrySize =
+      MJTI->getEntrySize(*TM.getSubtargetImpl()->getDataLayout());
   OutStreamer.EmitValue(Value, EntrySize);
 }
@@ -1387,7 +1424,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   unsigned Align = Log2_32(DL->getPointerPrefAlignment());
   std::stable_sort(Structors.begin(), Structors.end(),
                    [](const Structor &L,
@@ -1450,9 +1487,9 @@ void AsmPrinter::EmitInt32(int Value) const {
   OutStreamer.EmitIntValue(Value, 4);
 }
 
-/// EmitLabelDifference - Emit something like ".long Hi-Lo" where the size
-/// in bytes of the directive is specified by Size and Hi/Lo specify the
-/// labels.  This implicitly uses .set if it is available.
+/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
+/// is specified by Size and Hi/Lo specify the labels.  This implicitly uses
+/// .set if it avoids relocations.
 void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
                                      unsigned Size) const {
   // Get the Hi-Lo expression.
@@ -1461,7 +1498,7 @@ void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
                             MCSymbolRefExpr::Create(Lo, OutContext),
                             OutContext);
 
-  if (!MAI->hasSetDirective()) {
+  if (!MAI->doesSetDirectiveSuppressesReloc()) {
     OutStreamer.EmitValue(Diff, Size);
     return;
   }
@@ -1472,36 +1509,6 @@ void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
   OutStreamer.EmitSymbolValue(SetLabel, Size);
 }
 
-/// EmitLabelOffsetDifference - Emit something like ".long Hi+Offset-Lo"
-/// where the size in bytes of the directive is specified by Size and Hi/Lo
-/// specify the labels.  This implicitly uses .set if it is available.
-void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
-                                           const MCSymbol *Lo,
-                                           unsigned Size) const {
-
-  // Emit Hi+Offset - Lo
-  // Get the Hi+Offset expression.
-  const MCExpr *Plus =
-    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Hi, OutContext),
-                            MCConstantExpr::Create(Offset, OutContext),
-                            OutContext);
-
-  // Get the Hi+Offset-Lo expression.
-  const MCExpr *Diff =
-    MCBinaryExpr::CreateSub(Plus,
-                            MCSymbolRefExpr::Create(Lo, OutContext),
-                            OutContext);
-
-  if (!MAI->hasSetDirective())
-    OutStreamer.EmitValue(Diff, Size);
-  else {
-    // Otherwise, emit with .set (aka assignment).
-    MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
-    OutStreamer.EmitAssignment(SetLabel, Diff);
-    OutStreamer.EmitSymbolValue(SetLabel, Size);
-  }
-}
-
 /// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
 /// where the size in bytes of the directive is specified by Size and Label
 /// specifies the label.  This implicitly uses .set if it is available.
@@ -1531,24 +1538,27 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
 // if required for correctness.
 //
 void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
-  if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), NumBits);
+  if (GV)
+    NumBits = getGVAlignmentLog2(GV, *TM.getSubtargetImpl()->getDataLayout(),
+                                 NumBits);
 
   if (NumBits == 0) return;   // 1-byte aligned: no need to emit alignment.
 
+  assert(NumBits <
+             static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+         "undefined behavior");
   if (getCurrentSection()->getKind().isText())
-    OutStreamer.EmitCodeAlignment(1 << NumBits);
+    OutStreamer.EmitCodeAlignment(1u << NumBits);
   else
-    OutStreamer.EmitValueToAlignment(1 << NumBits);
+    OutStreamer.EmitValueToAlignment(1u << NumBits);
 }
 
 //===----------------------------------------------------------------------===//
 // Constant emission.
 //===----------------------------------------------------------------------===//
 
-/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
-///
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
-  MCContext &Ctx = AP.OutContext;
+const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
+  MCContext &Ctx = OutContext;
 
   if (CV->isNullValue() || isa<UndefValue>(CV))
     return MCConstantExpr::Create(0, Ctx);
@@ -1557,19 +1567,18 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+    return MCSymbolRefExpr::Create(GetBlockAddressSymbol(BA), Ctx);
 
   const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
   if (!CE) {
     llvm_unreachable("Unknown constant value to lower!");
   }
 
-  if (const MCExpr *RelocExpr =
-          AP.getObjFileLowering().getExecutableRelativeSymbol(CE, *AP.Mang,
-                                                              AP.TM))
+  if (const MCExpr *RelocExpr
+      = getObjFileLowering().getExecutableRelativeSymbol(CE, *Mang, TM))
     return RelocExpr;
 
   switch (CE->getOpcode()) {
@@ -1577,10 +1586,10 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     // If the code isn't optimized, there may be outstanding folding
    // opportunities. Attempt to fold the expression using DataLayout as a
    // last resort before giving up.
-    if (Constant *C =
-          ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+    if (Constant *C = ConstantFoldConstantExpression(
+            CE, TM.getSubtargetImpl()->getDataLayout()))
      if (C != CE)
-        return lowerConstant(C, AP);
+        return lowerConstant(C);
 
    // Otherwise report the problem to the user.
    {
@@ -1588,16 +1597,16 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
      raw_string_ostream OS(S);
      OS << "Unsupported expression in static initializer: ";
      CE->printAsOperand(OS, /*PrintType=*/false,
-                     !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+                     !MF ? nullptr : MF->getFunction()->getParent());
      report_fatal_error(OS.str());
    }
  case Instruction::GetElementPtr: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *TM.getSubtargetImpl()->getDataLayout();
    // Generate a symbolic expression for the byte address
    APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
    cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
 
-    const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
+    const MCExpr *Base = lowerConstant(CE->getOperand(0));
    if (!OffsetAI)
      return Base;
 
@@ -1613,26 +1622,26 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
    // is reasonable to treat their delta as a 32-bit value.
    // FALL THROUGH.
  case Instruction::BitCast:
-    return lowerConstant(CE->getOperand(0), AP);
+    return lowerConstant(CE->getOperand(0));
 
  case Instruction::IntToPtr: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *TM.getSubtargetImpl()->getDataLayout();
    // Handle casts to pointers by changing them into casts to the appropriate
    // integer type.  This promotes constant folding and simplifies this code.
    Constant *Op = CE->getOperand(0);
    Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
                                      false/*ZExt*/);
-    return lowerConstant(Op, AP);
+    return lowerConstant(Op);
  }
 
  case Instruction::PtrToInt: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *TM.getSubtargetImpl()->getDataLayout();
    // Support only foldable casts to/from pointers that can be eliminated by
    // changing the pointer to the appropriately sized integer type.
    Constant *Op = CE->getOperand(0);
    Type *Ty = CE->getType();
 
-    const MCExpr *OpExpr = lowerConstant(Op, AP);
+    const MCExpr *OpExpr = lowerConstant(Op);
 
    // We can emit the pointer value into this slot if the slot is an
    // integer slot equal to the size of the pointer.
@@ -1658,8 +1667,8 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
-    const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
+    const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+    const MCExpr *RHS = lowerConstant(CE->getOperand(1));
     switch (CE->getOpcode()) {
     default: llvm_unreachable("Unknown binary operator constant cast expr");
     case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
@@ -1699,7 +1708,8 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
     if (CI->getBitWidth() > 64) return -1;
 
-    uint64_t Size = TM.getDataLayout()->getTypeAllocSize(V->getType());
+    uint64_t Size =
+        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(V->getType());
     uint64_t Value = CI->getZExtValue();
 
     // Make sure the constant is at least 8 bits long and has a power
@@ -1743,7 +1753,9 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
   // See if we can aggregate this into a .fill, if so, emit it as such.
   int Value = isRepeatedByteSequence(CDS, AP.TM);
   if (Value != -1) {
-    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CDS->getType());
+    uint64_t Bytes =
+        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+            CDS->getType());
     // Don't emit a 1-byte object as a .fill.
     if (Bytes > 1)
       return AP.OutStreamer.EmitFill(Bytes, Value);
@@ -1793,7 +1805,7 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
     }
   }
 
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CDS->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
@@ -1808,7 +1820,9 @@ static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP) {
   int Value = isRepeatedByteSequence(CA, AP.TM);
 
   if (Value != -1) {
-    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CA->getType());
+    uint64_t Bytes =
+        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+            CA->getType());
     AP.OutStreamer.EmitFill(Bytes, Value);
   }
   else {
@@ -1821,7 +1835,7 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
     emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CV->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
@@ -1831,7 +1845,7 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
 
 static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL->getTypeAllocSize(CS->getType());
   const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
@@ -1881,7 +1895,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
 
   // PPC's long double has odd notions of endianness compared to how LLVM
   // handles it: p[0] goes first for *big* endian on PPC.
-  if (AP.TM.getDataLayout()->isBigEndian() &&
+  if (AP.TM.getSubtargetImpl()->getDataLayout()->isBigEndian() &&
      !CFP->getType()->isPPC_FP128Ty()) {
    int Chunk = API.getNumWords() - 1;
@@ -1900,13 +1914,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
  }
 
  // Emit the tail padding for the long double.
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
  AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
                           DL.getTypeStoreSize(CFP->getType()));
 }
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
 
   // Copy the value as we may massage the layout for constants whose bit width
@@ -1952,7 +1966,8 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
 
     // Emit the extra bits after the 64-bits chunks.
 
     // Emit a directive that fills the expected size.
-    uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize(CI->getType());
+    uint64_t Size = AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+        CI->getType());
     Size -= (BitWidth / 64) * 8;
     assert(Size && Size * 8 >= ExtraBitsSize &&
            (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
@@ -1962,7 +1977,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
 }
 
 static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(CV->getType());
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
     return AP.OutStreamer.EmitZeros(Size);
@@ -2022,12 +2037,13 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
 
   // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
   // thread the streamer with EmitValue.
-  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size);
+  AP.OutStreamer.EmitValue(AP.lowerConstant(CV), Size);
 }
 
 /// EmitGlobalConstant - Print a general LLVM constant to the .s file.
 void AsmPrinter::EmitGlobalConstant(const Constant *CV) {
-  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  uint64_t Size =
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
   if (Size)
     emitGlobalConstantImpl(CV, *this);
   else if (MAI->hasSubsectionsViaSymbols()) {
@@ -2056,7 +2072,7 @@ void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
 /// GetTempSymbol - Return the MCSymbol corresponding to the assembler
 /// temporary label with the specified stem and unique ID.
 MCSymbol *AsmPrinter::GetTempSymbol(Twine Name, unsigned ID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
                                       Name + Twine(ID));
 }
@@ -2064,7 +2080,7 @@ MCSymbol *AsmPrinter::GetTempSymbol(Twine Name, unsigned ID) const {
 /// GetTempSymbol - Return an assembler temporary label with the specified
 /// stem.
 MCSymbol *AsmPrinter::GetTempSymbol(Twine Name) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                       Name);
 }
@@ -2080,7 +2096,7 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
 
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol
     (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber())
      + "_" + Twine(CPID));
@@ -2094,7 +2110,7 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const {
 /// GetJTSetSymbol - Return the symbol for the specified jump table .set
 /// FIXME: privatize to AsmPrinter.
 MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol
   (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" +
    Twine(UID) + "_set_" + Twine(MBBID));
@@ -2292,6 +2308,11 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   if (!S.usesMetadata())
     return nullptr;
 
+  assert(!S.useStatepoints() && "statepoints do not currently support custom"
+         " stackmap formats, please see the documentation for a description of"
+         " the default format.  If you really need a custom serialized format,"
+         " please file a bug");
+
   gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
   gcp_map_type::iterator GCPI = GCMap.find(&S);
   if (GCPI != GCMap.end())
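Aside: the EmitAlignment hunk above switches 1 << NumBits to 1u << NumBits and asserts the shift count is in range. A stand-alone sketch of why (hypothetical helper name): shifting by the operand's full bit width or more is undefined behavior, and the unsigned literal keeps the largest valid shifts out of signed-overflow territory.

    #include <cassert>
    #include <limits>

    unsigned alignmentInBytes(unsigned NumBitsLog2) {
      // Reject shift counts >= the bit width of unsigned (undefined behavior).
      assert(NumBitsLog2 <
                 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
             "shift count would be undefined behavior");
      return 1u << NumBitsLog2; // e.g. 3 -> 8-byte alignment
    }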
MCSymbol *AsmPrinter::GetTempSymbol(Twine Name) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+ Name); } @@ -2080,7 +2096,7 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const { /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); return OutContext.GetOrCreateSymbol (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) + "_" + Twine(CPID)); @@ -2094,7 +2110,7 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const { /// GetJTSetSymbol - Return the symbol for the specified jump table .set /// FIXME: privatize to AsmPrinter. MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); return OutContext.GetOrCreateSymbol (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" + Twine(UID) + "_set_" + Twine(MBBID)); @@ -2292,6 +2308,11 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { if (!S.usesMetadata()) return nullptr; + assert(!S.useStatepoints() && "statepoints do not currently support custom" + " stackmap formats, please see the documentation for a description of" + " the default format. If you really need a custom serialized format," + " please file a bug"); + gcp_map_type &GCMap = getGCMap(GCMetadataPrinters); gcp_map_type::iterator GCPI = GCMap.find(&S); if (GCPI != GCMap.end()) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index 02cd12be0453..00681f6ce8cd 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "ByteStreamer.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/ADT/SmallBitVector.h" +#include "DwarfExpression.h" #include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSection.h" @@ -27,10 +27,30 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" +void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) { + BS.EmitInt8( + Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op) + : dwarf::OperationEncodingString(Op)); +} + +void DebugLocDwarfExpression::EmitSigned(int Value) { + BS.EmitSLEB128(Value, Twine(Value)); +} + +void DebugLocDwarfExpression::EmitUnsigned(unsigned Value) { + BS.EmitULEB128(Value, Twine(Value)); +} + +bool DebugLocDwarfExpression::isFrameRegister(unsigned MachineReg) { + // This information is not available while emitting .debug_loc entries. 
+ return false; +} + //===----------------------------------------------------------------------===// // Dwarf Emission Helper Routines //===----------------------------------------------------------------------===// @@ -130,7 +150,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const { default: llvm_unreachable("Invalid encoded value."); case dwarf::DW_EH_PE_absptr: - return TM.getDataLayout()->getPointerSize(); + return TM.getSubtargetImpl()->getDataLayout()->getPointerSize(); case dwarf::DW_EH_PE_udata2: return 2; case dwarf::DW_EH_PE_udata4: @@ -186,60 +206,6 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label, EmitLabelDifference(Label, SectionLabel, 4); } -/// Emit a dwarf register operation. -static void emitDwarfRegOp(ByteStreamer &Streamer, int Reg) { - assert(Reg >= 0); - if (Reg < 32) { - Streamer.EmitInt8(dwarf::DW_OP_reg0 + Reg, - dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg)); - } else { - Streamer.EmitInt8(dwarf::DW_OP_regx, "DW_OP_regx"); - Streamer.EmitULEB128(Reg, Twine(Reg)); - } -} - -/// Emit an (double-)indirect dwarf register operation. -static void emitDwarfRegOpIndirect(ByteStreamer &Streamer, int Reg, int Offset, - bool Deref) { - assert(Reg >= 0); - if (Reg < 32) { - Streamer.EmitInt8(dwarf::DW_OP_breg0 + Reg, - dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg)); - } else { - Streamer.EmitInt8(dwarf::DW_OP_bregx, "DW_OP_bregx"); - Streamer.EmitULEB128(Reg, Twine(Reg)); - } - Streamer.EmitSLEB128(Offset); - if (Deref) - Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref"); -} - -/// Emit a dwarf register operation for describing -/// - a small value occupying only part of a register or -/// - a small register representing only part of a value. -static void emitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits, - unsigned OffsetInBits) { - assert(SizeInBits > 0 && "zero-sized piece"); - unsigned SizeOfByte = 8; - if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { - Streamer.EmitInt8(dwarf::DW_OP_bit_piece, "DW_OP_bit_piece"); - Streamer.EmitULEB128(SizeInBits, Twine(SizeInBits)); - Streamer.EmitULEB128(OffsetInBits, Twine(OffsetInBits)); - } else { - Streamer.EmitInt8(dwarf::DW_OP_piece, "DW_OP_piece"); - unsigned ByteSize = SizeInBits / SizeOfByte; - Streamer.EmitULEB128(ByteSize, Twine(ByteSize)); - } -} - -/// Emit a shift-right dwarf expression. -static void emitDwarfOpShr(ByteStreamer &Streamer, - unsigned ShiftBy) { - Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu"); - Streamer.EmitULEB128(ShiftBy); - Streamer.EmitInt8(dwarf::DW_OP_shr, "DW_OP_shr"); -} - // Some targets do not provide a DWARF register number for every // register. This function attempts to emit a DWARF register by // emitting a piece of a super-register or by piecing together @@ -249,112 +215,47 @@ void AsmPrinter::EmitDwarfRegOpPiece(ByteStreamer &Streamer, unsigned PieceSizeInBits, unsigned PieceOffsetInBits) const { assert(MLoc.isReg() && "MLoc must be a register"); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false); - - // If this is a valid register number, emit it. - if (Reg >= 0) { - emitDwarfRegOp(Streamer, Reg); - emitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits); - return; - } - - // Walk up the super-register chain until we find a valid number. - // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0. 
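The deleted static helpers above hand-encoded the two DWARF register forms; the same rule now lives in the DwarfExpression classes this patch introduces. A self-contained sketch of the encoding with the standard opcode values written out (the raw byte-vector interface is an assumption of this sketch, not ByteStreamer's API):

```cpp
#include <cstdint>
#include <vector>

// DWARF gives registers 0-31 a compact one-byte DW_OP_reg0+N form;
// larger numbers pay for DW_OP_regx plus a ULEB128-encoded register
// number, which is why the helper branches on Reg < 32.
static const uint8_t DW_OP_reg0 = 0x50;
static const uint8_t DW_OP_regx = 0x90;

static void emitULEB128(std::vector<uint8_t> &Out, uint64_t Value) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value)
      Byte |= 0x80; // continuation bit: more groups follow
    Out.push_back(Byte);
  } while (Value);
}

void emitDwarfRegOp(std::vector<uint8_t> &Out, unsigned Reg) {
  if (Reg < 32) {
    Out.push_back(DW_OP_reg0 + Reg);
  } else {
    Out.push_back(DW_OP_regx);
    emitULEB128(Out, Reg);
  }
}
```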
- for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) { - Reg = TRI->getDwarfRegNum(*SR, false); - if (Reg >= 0) { - unsigned Idx = TRI->getSubRegIndex(*SR, MLoc.getReg()); - unsigned Size = TRI->getSubRegIdxSize(Idx); - unsigned Offset = TRI->getSubRegIdxOffset(Idx); - OutStreamer.AddComment("super-register"); - emitDwarfRegOp(Streamer, Reg); - if (PieceOffsetInBits == Offset) { - emitDwarfOpPiece(Streamer, Size, Offset); - } else { - // If this is part of a variable in a sub-register at a - // non-zero offset, we need to manually shift the value into - // place, since the DW_OP_piece describes the part of the - // variable, not the position of the subregister. - emitDwarfOpPiece(Streamer, Size, PieceOffsetInBits); - if (Offset) - emitDwarfOpShr(Streamer, Offset); - } - return; - } - } - - // Otherwise, attempt to find a covering set of sub-register numbers. - // For example, Q0 on ARM is a composition of D0+D1. - // - // Keep track of the current position so we can emit the more - // efficient DW_OP_piece. - unsigned CurPos = PieceOffsetInBits; - // The size of the register in bits, assuming 8 bits per byte. - unsigned RegSize = TRI->getMinimalPhysRegClass(MLoc.getReg())->getSize() * 8; - // Keep track of the bits in the register we already emitted, so we - // can avoid emitting redundant aliasing subregs. - SmallBitVector Coverage(RegSize, false); - for (MCSubRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) { - unsigned Idx = TRI->getSubRegIndex(MLoc.getReg(), *SR); - unsigned Size = TRI->getSubRegIdxSize(Idx); - unsigned Offset = TRI->getSubRegIdxOffset(Idx); - Reg = TRI->getDwarfRegNum(*SR, false); - - // Intersection between the bits we already emitted and the bits - // covered by this subregister. - SmallBitVector Intersection(RegSize, false); - Intersection.set(Offset, Offset + Size); - Intersection ^= Coverage; - - // If this sub-register has a DWARF number and we haven't covered - // its range, emit a DWARF piece for it. - if (Reg >= 0 && Intersection.any()) { - OutStreamer.AddComment("sub-register"); - emitDwarfRegOp(Streamer, Reg); - emitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset); - CurPos = Offset + Size; - - // Mark it as emitted. - Coverage.set(Offset, Offset + Size); - } - } + DebugLocDwarfExpression Expr(*this, Streamer); + Expr.AddMachineRegPiece(MLoc.getReg(), PieceSizeInBits, PieceOffsetInBits); +} - if (CurPos == PieceOffsetInBits) { - // FIXME: We have no reasonable way of handling errors in here. - Streamer.EmitInt8(dwarf::DW_OP_nop, - "nop (could not find a dwarf register number)"); - } +void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer, + unsigned PieceSizeInBits, + unsigned PieceOffsetInBits) const { + DebugLocDwarfExpression Expr(*this, Streamer); + Expr.AddOpPiece(PieceSizeInBits, PieceOffsetInBits); } /// EmitDwarfRegOp - Emit dwarf register operation. void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer, const MachineLocation &MLoc, bool Indirect) const { - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + DebugLocDwarfExpression Expr(*this, Streamer); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false); if (Reg < 0) { // We assume that pointers are always in an addressable register. - if (Indirect || MLoc.isIndirect()) { + if (Indirect || MLoc.isIndirect()) // FIXME: We have no reasonable way of handling errors in here. The // caller might be in the middle of a dwarf expression. 
We should // probably assert that Reg >= 0 once debug info generation is more // mature. - Streamer.EmitInt8(dwarf::DW_OP_nop, - "nop (invalid dwarf register number for indirect loc)"); - return; - } + return Expr.EmitOp(dwarf::DW_OP_nop, + "nop (could not find a dwarf register number)"); // Attempt to find a valid super- or sub-register. - return EmitDwarfRegOpPiece(Streamer, MLoc); + if (!Expr.AddMachineRegPiece(MLoc.getReg())) + Expr.EmitOp(dwarf::DW_OP_nop, + "nop (could not find a dwarf register number)"); + return; } if (MLoc.isIndirect()) - emitDwarfRegOpIndirect(Streamer, Reg, MLoc.getOffset(), Indirect); + Expr.AddRegIndirect(Reg, MLoc.getOffset(), Indirect); else if (Indirect) - emitDwarfRegOpIndirect(Streamer, Reg, 0, false); + Expr.AddRegIndirect(Reg, 0, false); else - emitDwarfRegOp(Streamer, Reg); + Expr.AddReg(Reg); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h index 2825367abd8f..31867dd01d05 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h +++ b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H__ -#define CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H #include "llvm/Support/DataTypes.h" diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 46ee0c856bda..f6ce4a08db5b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -63,7 +64,7 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) { if (LocInfo->getNumOperands() != 0) if (const ConstantInt *CI = - dyn_cast<ConstantInt>(LocInfo->getOperand(ErrorLine))) + mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine))) LocCookie = CI->getZExtValue(); } @@ -89,6 +90,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, assert(MCAI && "No MCAsmInfo"); if (!MCAI->useIntegratedAssembler() && !OutStreamer.isIntegratedAssemblerRequired()) { + emitInlineAsmStart(TM.getSubtarget<MCSubtargetInfo>()); OutStreamer.EmitRawText(Str); emitInlineAsmEnd(TM.getSubtarget<MCSubtargetInfo>(), nullptr); return; @@ -110,14 +112,14 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, HasDiagHandler = true; } - MemoryBuffer *Buffer; + std::unique_ptr<MemoryBuffer> Buffer; if (isNullTerminated) Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>"); else Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>"); // Tell SrcMgr about this buffer, it takes ownership of the buffer. 
- SrcMgr.AddNewSourceBuffer(Buffer, SMLoc()); + SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); std::unique_ptr<MCAsmParser> Parser( createMCAsmParser(SrcMgr, OutContext, OutStreamer, *MAI)); @@ -146,7 +148,12 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, " we don't have an asm parser for this target\n"); Parser->setAssemblerDialect(Dialect); Parser->setTargetParser(*TAP.get()); + if (MF) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + TAP->SetFrameRegister(TRI->getFrameRegister(*MF)); + } + emitInlineAsmStart(STIOrig); // Don't implicitly switch to the text section before the asm. int Res = Parser->Run(/*NoInitialTextSection*/ true, /*NoFinalize*/ true); @@ -462,7 +469,8 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { if (MI->getOperand(i-1).isMetadata() && (LocMD = MI->getOperand(i-1).getMetadata()) && LocMD->getNumOperands() != 0) { - if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) { + if (const ConstantInt *CI = + mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) { LocCookie = CI->getZExtValue(); break; } @@ -500,7 +508,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { /// for their own strange codes. void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS, const char *Code) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); if (!strcmp(Code, "private")) { OS << DL->getPrivateGlobalPrefix(); } else if (!strcmp(Code, "comment")) { @@ -561,5 +569,7 @@ bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, return true; } +void AsmPrinter::emitInlineAsmStart(const MCSubtargetInfo &StartInfo) const {} + void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, const MCSubtargetInfo *EndInfo) const {} diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h index 6c01d65cc1aa..42be1149e973 100644 --- a/lib/CodeGen/AsmPrinter/ByteStreamer.h +++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h @@ -12,13 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_BYTESTREAMER_H -#define LLVM_CODEGEN_BYTESTREAMER_H +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H +#include "DIEHash.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/MC/MCStreamer.h" -#include "DIEHash.h" namespace llvm { class ByteStreamer { diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index f555f212a978..01d2c7220ab9 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -9,7 +9,9 @@ add_llvm_library(LLVMAsmPrinter DIEHash.cpp DwarfAccelTable.cpp DwarfCFIException.cpp + DwarfCompileUnit.cpp DwarfDebug.cpp + DwarfExpression.cpp DwarfFile.cpp DwarfStringPool.cpp DwarfUnit.cpp diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index c3dcd9c8c19d..64ba56bdacf7 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -11,13 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "DIE.h" +#include "llvm/CodeGen/DIE.h" +#include "DwarfCompileUnit.h" #include "DwarfDebug.h" #include "DwarfUnit.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCAsmInfo.h" +#include 
"llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" @@ -370,6 +372,29 @@ void DIEString::print(raw_ostream &O) const { // DIEEntry Implementation //===----------------------------------------------------------------------===// +/// Emit something like ".long Hi+Offset-Lo" where the size in bytes of the +/// directive is specified by Size and Hi/Lo specify the labels. +static void emitLabelOffsetDifference(MCStreamer &Streamer, const MCSymbol *Hi, + uint64_t Offset, const MCSymbol *Lo, + unsigned Size) { + MCContext &Context = Streamer.getContext(); + + // Emit Hi+Offset - Lo + // Get the Hi+Offset expression. + const MCExpr *Plus = + MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Hi, Context), + MCConstantExpr::Create(Offset, Context), Context); + + // Get the Hi+Offset-Lo expression. + const MCExpr *Diff = MCBinaryExpr::CreateSub( + Plus, MCSymbolRefExpr::Create(Lo, Context), Context); + + // Otherwise, emit with .set (aka assignment). + MCSymbol *SetLabel = Context.CreateTempSymbol(); + Streamer.EmitAssignment(SetLabel, Diff); + Streamer.EmitSymbolValue(SetLabel, Size); +} + /// EmitValue - Emit debug information entry offset. /// void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const { @@ -388,9 +413,9 @@ void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const { AP->EmitLabelPlusOffset(CU->getSectionSym(), Addr, DIEEntry::getRefAddrSize(AP)); else - AP->EmitLabelOffsetDifference(CU->getSectionSym(), Addr, - CU->getSectionSym(), - DIEEntry::getRefAddrSize(AP)); + emitLabelOffsetDifference(AP->OutStreamer, CU->getSectionSym(), Addr, + CU->getSectionSym(), + DIEEntry::getRefAddrSize(AP)); } else AP->EmitInt32(Entry.getOffset()); } diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h deleted file mode 100644 index ef05f1707810..000000000000 --- a/lib/CodeGen/AsmPrinter/DIE.h +++ /dev/null @@ -1,587 +0,0 @@ -//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Data structures for DWARF info entries. -// -//===----------------------------------------------------------------------===// - -#ifndef CODEGEN_ASMPRINTER_DIE_H__ -#define CODEGEN_ASMPRINTER_DIE_H__ - -#include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Dwarf.h" -#include <vector> - -namespace llvm { -class AsmPrinter; -class MCExpr; -class MCSymbol; -class raw_ostream; -class DwarfTypeUnit; - -//===--------------------------------------------------------------------===// -/// DIEAbbrevData - Dwarf abbreviation data, describes one attribute of a -/// Dwarf abbreviation. -class DIEAbbrevData { - /// Attribute - Dwarf attribute code. - /// - dwarf::Attribute Attribute; - - /// Form - Dwarf form code. - /// - dwarf::Form Form; - -public: - DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {} - - // Accessors. - dwarf::Attribute getAttribute() const { return Attribute; } - dwarf::Form getForm() const { return Form; } - - /// Profile - Used to gather unique data for the abbreviation folding set. 
- /// - void Profile(FoldingSetNodeID &ID) const; -}; - -//===--------------------------------------------------------------------===// -/// DIEAbbrev - Dwarf abbreviation, describes the organization of a debug -/// information object. -class DIEAbbrev : public FoldingSetNode { - /// Unique number for node. - /// - unsigned Number; - - /// Tag - Dwarf tag code. - /// - dwarf::Tag Tag; - - /// Children - Whether or not this node has children. - /// - // This cheats a bit in all of the uses since the values in the standard - // are 0 and 1 for no children and children respectively. - bool Children; - - /// Data - Raw data bytes for abbreviation. - /// - SmallVector<DIEAbbrevData, 12> Data; - -public: - DIEAbbrev(dwarf::Tag T, bool C) : Tag(T), Children(C), Data() {} - - // Accessors. - dwarf::Tag getTag() const { return Tag; } - unsigned getNumber() const { return Number; } - bool hasChildren() const { return Children; } - const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; } - void setChildrenFlag(bool hasChild) { Children = hasChild; } - void setNumber(unsigned N) { Number = N; } - - /// AddAttribute - Adds another set of attribute information to the - /// abbreviation. - void AddAttribute(dwarf::Attribute Attribute, dwarf::Form Form) { - Data.push_back(DIEAbbrevData(Attribute, Form)); - } - - /// Profile - Used to gather unique data for the abbreviation folding set. - /// - void Profile(FoldingSetNodeID &ID) const; - - /// Emit - Print the abbreviation using the specified asm printer. - /// - void Emit(AsmPrinter *AP) const; - -#ifndef NDEBUG - void print(raw_ostream &O); - void dump(); -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIE - A structured debug information entry. Has an abbreviation which -/// describes its organization. -class DIEValue; - -class DIE { -protected: - /// Offset - Offset in debug info section. - /// - unsigned Offset; - - /// Size - Size of instance + children. - /// - unsigned Size; - - /// Abbrev - Buffer for constructing abbreviation. - /// - DIEAbbrev Abbrev; - - /// Children DIEs. - /// - // This can't be a vector<DIE> because pointer validity is requirent for the - // Parent pointer and DIEEntry. - // It can't be a list<DIE> because some clients need pointer validity before - // the object has been added to any child list - // (eg: DwarfUnit::constructVariableDIE). These aren't insurmountable, but may - // be more convoluted than beneficial. - std::vector<std::unique_ptr<DIE>> Children; - - DIE *Parent; - - /// Attribute values. - /// - SmallVector<DIEValue *, 12> Values; - -protected: - DIE() - : Offset(0), Size(0), Abbrev((dwarf::Tag)0, dwarf::DW_CHILDREN_no), - Parent(nullptr) {} - -public: - explicit DIE(dwarf::Tag Tag) - : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no), - Parent(nullptr) {} - - // Accessors. - DIEAbbrev &getAbbrev() { return Abbrev; } - const DIEAbbrev &getAbbrev() const { return Abbrev; } - unsigned getAbbrevNumber() const { return Abbrev.getNumber(); } - dwarf::Tag getTag() const { return Abbrev.getTag(); } - unsigned getOffset() const { return Offset; } - unsigned getSize() const { return Size; } - const std::vector<std::unique_ptr<DIE>> &getChildren() const { - return Children; - } - const SmallVectorImpl<DIEValue *> &getValues() const { return Values; } - DIE *getParent() const { return Parent; } - /// Climb up the parent chain to get the compile or type unit DIE this DIE - /// belongs to. 
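As the removed header's own comment notes, DIE keeps its children in std::vector<std::unique_ptr<DIE>> because Parent pointers and DIEEntry references need node addresses that never move. A minimal sketch of that ownership wiring (Node is a stand-in for DIE, not the real class):

```cpp
#include <cassert>
#include <memory>
#include <vector>

// Ownership lives in the parent's vector of unique_ptr; each child
// keeps a raw back-pointer, which stays valid because the heap nodes
// themselves never relocate.
struct Node {
  Node *Parent = nullptr;
  std::vector<std::unique_ptr<Node>> Children;

  void addChild(std::unique_ptr<Node> Child) {
    assert(!Child->Parent && "node already has a parent");
    Child->Parent = this;
    Children.push_back(std::move(Child));
  }
};
```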
- const DIE *getUnit() const; - /// Similar to getUnit, returns null when DIE is not added to an - /// owner yet. - const DIE *getUnitOrNull() const; - void setOffset(unsigned O) { Offset = O; } - void setSize(unsigned S) { Size = S; } - - /// addValue - Add a value and attributes to a DIE. - /// - void addValue(dwarf::Attribute Attribute, dwarf::Form Form, DIEValue *Value) { - Abbrev.AddAttribute(Attribute, Form); - Values.push_back(Value); - } - - /// addChild - Add a child to the DIE. - /// - void addChild(std::unique_ptr<DIE> Child) { - assert(!Child->getParent()); - Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes); - Child->Parent = this; - Children.push_back(std::move(Child)); - } - - /// findAttribute - Find a value in the DIE with the attribute given, - /// returns NULL if no such attribute exists. - DIEValue *findAttribute(dwarf::Attribute Attribute) const; - -#ifndef NDEBUG - void print(raw_ostream &O, unsigned IndentCount = 0) const; - void dump(); -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEValue - A debug information entry value. Some of these roughly correlate -/// to DWARF attribute classes. -/// -class DIEValue { - virtual void anchor(); - -public: - enum Type { - isInteger, - isString, - isExpr, - isLabel, - isDelta, - isEntry, - isTypeSignature, - isBlock, - isLoc, - isLocList, - }; - -protected: - /// Ty - Type of data stored in the value. - /// - Type Ty; - - explicit DIEValue(Type T) : Ty(T) {} - virtual ~DIEValue() {} - -public: - // Accessors - Type getType() const { return Ty; } - - /// EmitValue - Emit value via the Dwarf writer. - /// - virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const = 0; - - /// SizeOf - Return the size of a value in bytes. - /// - virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const = 0; - -#ifndef NDEBUG - virtual void print(raw_ostream &O) const = 0; - void dump() const; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEInteger - An integer value DIE. -/// -class DIEInteger : public DIEValue { - uint64_t Integer; - -public: - explicit DIEInteger(uint64_t I) : DIEValue(isInteger), Integer(I) {} - - /// BestForm - Choose the best form for integer. - /// - static dwarf::Form BestForm(bool IsSigned, uint64_t Int) { - if (IsSigned) { - const int64_t SignedInt = Int; - if ((char)Int == SignedInt) - return dwarf::DW_FORM_data1; - if ((short)Int == SignedInt) - return dwarf::DW_FORM_data2; - if ((int)Int == SignedInt) - return dwarf::DW_FORM_data4; - } else { - if ((unsigned char)Int == Int) - return dwarf::DW_FORM_data1; - if ((unsigned short)Int == Int) - return dwarf::DW_FORM_data2; - if ((unsigned int)Int == Int) - return dwarf::DW_FORM_data4; - } - return dwarf::DW_FORM_data8; - } - - /// EmitValue - Emit integer of appropriate size. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - uint64_t getValue() const { return Integer; } - - /// SizeOf - Determine size of integer value in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *I) { return I->getType() == isInteger; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEExpr - An expression DIE. 
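DIEInteger::BestForm above picks the narrowest DW_FORM_dataN that round-trips the value; the signed and unsigned cases differ because a truncation that is lossless for an unsigned value may sign-extend differently. An equivalent sketch with fixed-width casts (the Form enum is a placeholder for the dwarf::DW_FORM_data* constants):

```cpp
#include <cstdint>

enum class Form { Data1, Data2, Data4, Data8 };

// Choose the smallest fixed-size data form that preserves the value.
Form bestForm(bool IsSigned, uint64_t Int) {
  if (IsSigned) {
    int64_t S = static_cast<int64_t>(Int);
    if (static_cast<int8_t>(S) == S)  return Form::Data1;
    if (static_cast<int16_t>(S) == S) return Form::Data2;
    if (static_cast<int32_t>(S) == S) return Form::Data4;
  } else {
    if (static_cast<uint8_t>(Int) == Int)  return Form::Data1;
    if (static_cast<uint16_t>(Int) == Int) return Form::Data2;
    if (static_cast<uint32_t>(Int) == Int) return Form::Data4;
  }
  return Form::Data8;
}
```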
-// -class DIEExpr : public DIEValue { - const MCExpr *Expr; - -public: - explicit DIEExpr(const MCExpr *E) : DIEValue(isExpr), Expr(E) {} - - /// EmitValue - Emit expression value. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// getValue - Get MCExpr. - /// - const MCExpr *getValue() const { return Expr; } - - /// SizeOf - Determine size of expression value in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { return E->getType() == isExpr; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIELabel - A label DIE. -// -class DIELabel : public DIEValue { - const MCSymbol *Label; - -public: - explicit DIELabel(const MCSymbol *L) : DIEValue(isLabel), Label(L) {} - - /// EmitValue - Emit label value. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// getValue - Get MCSymbol. - /// - const MCSymbol *getValue() const { return Label; } - - /// SizeOf - Determine size of label value in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *L) { return L->getType() == isLabel; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEDelta - A simple label difference DIE. -/// -class DIEDelta : public DIEValue { - const MCSymbol *LabelHi; - const MCSymbol *LabelLo; - -public: - DIEDelta(const MCSymbol *Hi, const MCSymbol *Lo) - : DIEValue(isDelta), LabelHi(Hi), LabelLo(Lo) {} - - /// EmitValue - Emit delta value. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of delta value in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *D) { return D->getType() == isDelta; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEString - A container for string values. -/// -class DIEString : public DIEValue { - const DIEValue *Access; - const StringRef Str; - -public: - DIEString(const DIEValue *Acc, const StringRef S) - : DIEValue(isString), Access(Acc), Str(S) {} - - /// getString - Grab the string out of the object. - StringRef getString() const { return Str; } - - /// EmitValue - Emit delta value. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of delta value in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *D) { return D->getType() == isString; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEEntry - A pointer to another debug information entry. An instance of -/// this class can also be used as a proxy for a debug information entry not -/// yet defined (ie. types.) 
-class DIEEntry : public DIEValue { - DIE &Entry; - -public: - explicit DIEEntry(DIE &E) : DIEValue(isEntry), Entry(E) { - } - - DIE &getEntry() const { return Entry; } - - /// EmitValue - Emit debug information entry offset. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of debug information entry in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override { - return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP) - : sizeof(int32_t); - } - - /// Returns size of a ref_addr entry. - static unsigned getRefAddrSize(AsmPrinter *AP); - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { return E->getType() == isEntry; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// \brief A signature reference to a type unit. -class DIETypeSignature : public DIEValue { - const DwarfTypeUnit &Unit; - -public: - explicit DIETypeSignature(const DwarfTypeUnit &Unit) - : DIEValue(isTypeSignature), Unit(Unit) {} - - /// \brief Emit type unit signature. - void EmitValue(AsmPrinter *Asm, dwarf::Form Form) const override; - - /// Returns size of a ref_sig8 entry. - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override { - assert(Form == dwarf::DW_FORM_ref_sig8); - return 8; - } - - // \brief Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { - return E->getType() == isTypeSignature; - } -#ifndef NDEBUG - void print(raw_ostream &O) const override; - void dump() const; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIELoc - Represents an expression location. -// -class DIELoc : public DIEValue, public DIE { - mutable unsigned Size; // Size in bytes excluding size header. -public: - DIELoc() : DIEValue(isLoc), Size(0) {} - - /// ComputeSize - Calculate the size of the location expression. - /// - unsigned ComputeSize(AsmPrinter *AP) const; - - /// BestForm - Choose the best form for data. - /// - dwarf::Form BestForm(unsigned DwarfVersion) const { - if (DwarfVersion > 3) - return dwarf::DW_FORM_exprloc; - // Pre-DWARF4 location expressions were blocks and not exprloc. - if ((unsigned char)Size == Size) - return dwarf::DW_FORM_block1; - if ((unsigned short)Size == Size) - return dwarf::DW_FORM_block2; - if ((unsigned int)Size == Size) - return dwarf::DW_FORM_block4; - return dwarf::DW_FORM_block; - } - - /// EmitValue - Emit location data. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of location data in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { return E->getType() == isLoc; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIEBlock - Represents a block of values. -// -class DIEBlock : public DIEValue, public DIE { - mutable unsigned Size; // Size in bytes excluding size header. -public: - DIEBlock() : DIEValue(isBlock), Size(0) {} - - /// ComputeSize - Calculate the size of the location expression. - /// - unsigned ComputeSize(AsmPrinter *AP) const; - - /// BestForm - Choose the best form for data. 
- /// - dwarf::Form BestForm() const { - if ((unsigned char)Size == Size) - return dwarf::DW_FORM_block1; - if ((unsigned short)Size == Size) - return dwarf::DW_FORM_block2; - if ((unsigned int)Size == Size) - return dwarf::DW_FORM_block4; - return dwarf::DW_FORM_block; - } - - /// EmitValue - Emit location data. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of location data in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { return E->getType() == isBlock; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -//===--------------------------------------------------------------------===// -/// DIELocList - Represents a pointer to a location list in the debug_loc -/// section. -// -class DIELocList : public DIEValue { - // Index into the .debug_loc vector. - size_t Index; - -public: - DIELocList(size_t I) : DIEValue(isLocList), Index(I) {} - - /// getValue - Grab the current index out. - size_t getValue() const { return Index; } - - /// EmitValue - Emit location data. - /// - void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override; - - /// SizeOf - Determine size of location data in bytes. - /// - unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override; - - // Implement isa/cast/dyncast. - static bool classof(const DIEValue *E) { return E->getType() == isLocList; } - -#ifndef NDEBUG - void print(raw_ostream &O) const override; -#endif -}; - -} // end llvm namespace - -#endif diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index c2fad59aa4e1..1e2ba2ccb9a4 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -13,11 +13,11 @@ #include "ByteStreamer.h" #include "DIEHash.h" -#include "DIE.h" #include "DwarfDebug.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Endian.h" @@ -261,7 +261,7 @@ void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag, return; } - // otherwise, b) use the letter 'T' as a the marker, ... + // otherwise, b) use the letter 'T' as the marker, ... 
addULEB128('T'); addULEB128(Attribute); diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h index 175d66063d49..ac014b727b75 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/lib/CodeGen/AsmPrinter/DIEHash.h @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DIEHASH_H__ -#define CODEGEN_ASMPRINTER_DIEHASH_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H -#include "DIE.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/Support/MD5.h" namespace llvm { diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp index a66d08e501ac..0c2a5e551ad0 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp @@ -8,25 +8,25 @@ //===----------------------------------------------------------------------===// #include "DbgValueHistoryCalculator.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <map> -#include <set> +using namespace llvm; #define DEBUG_TYPE "dwarfdebug" -namespace llvm { - // \brief If @MI is a DBG_VALUE with debug value described by a // defined register, returns the number of this register. // In the other case, returns 0. static unsigned isDescribedByReg(const MachineInstr &MI) { assert(MI.isDebugValue()); - assert(MI.getNumOperands() == 3); + assert(MI.getNumOperands() == 4); // If location of variable is described using a register (directly or // indirecltly), this register is always a first operand. return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0; @@ -36,7 +36,7 @@ void DbgValueHistoryMap::startInstrRange(const MDNode *Var, const MachineInstr &MI) { // Instruction range should start with a DBG_VALUE instruction for the // variable. - assert(MI.isDebugValue() && MI.getDebugVariable() == Var); + assert(MI.isDebugValue() && "not a DBG_VALUE"); auto &Ranges = VarInstrRanges[Var]; if (!Ranges.empty() && Ranges.back().second == nullptr && Ranges.back().first->isIdenticalTo(&MI)) { @@ -96,6 +96,19 @@ static void addRegDescribedVar(RegDescribedVarsMap &RegVars, VarSet.push_back(Var); } +// \brief Terminate the location range for variables described by register at +// @I by inserting @ClobberingInstr to their history. +static void clobberRegisterUses(RegDescribedVarsMap &RegVars, + RegDescribedVarsMap::iterator I, + DbgValueHistoryMap &HistMap, + const MachineInstr &ClobberingInstr) { + // Iterate over all variables described by this register and add this + // instruction to their history, clobbering it. + for (const auto &Var : I->second) + HistMap.endInstrRange(Var, ClobberingInstr); + RegVars.erase(I); +} + // \brief Terminate the location range for variables described by register // @RegNo by inserting @ClobberingInstr to their history. static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo, @@ -104,22 +117,26 @@ static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo, const auto &I = RegVars.find(RegNo); if (I == RegVars.end()) return; - // Iterate over all variables described by this register and add this - // instruction to their history, clobbering it. 
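The startInstrRange/endInstrRange pair above maintains, per variable, a list of [begin, end) instruction ranges in which a DBG_VALUE location holds, with a null end meaning the range is still open; clobbering a register closes every open range of the variables that register currently describes. A reduced sketch of that bookkeeping (Instr and the integer variable key are stand-ins for MachineInstr and the MDNode* key):

```cpp
#include <cassert>
#include <map>
#include <utility>
#include <vector>

struct Instr; // stand-in for MachineInstr

using InstrRange = std::pair<const Instr *, const Instr *>;
using HistoryMap = std::map<int /*VarID*/, std::vector<InstrRange>>;

// Open a new range; it stays live until explicitly clobbered.
void startRange(HistoryMap &Hist, int Var, const Instr *Begin) {
  Hist[Var].push_back({Begin, nullptr});
}

// Close the most recent open range at the clobbering instruction.
void endRange(HistoryMap &Hist, int Var, const Instr *End) {
  auto &Ranges = Hist[Var];
  assert(!Ranges.empty() && !Ranges.back().second && "no open range");
  Ranges.back().second = End;
}
```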
- for (const auto &Var : I->second) - HistMap.endInstrRange(Var, ClobberingInstr); - RegVars.erase(I); + clobberRegisterUses(RegVars, I, HistMap, ClobberingInstr); } -// \brief Collect all registers clobbered by @MI and insert them to @Regs. -static void collectClobberedRegisters(const MachineInstr &MI, +// \brief Collect all registers clobbered by @MI and apply the functor +// @Func to their RegNo. +// @Func should be a functor with a void(unsigned) signature. We're +// not using std::function here for performance reasons. It has a +// small but measurable impact. By using a functor instead of a +// std::set& here, we can avoid the overhead of constructing +// temporaries in calculateDbgValueHistory, which has a significant +// performance impact. +template<typename Callable> +static void applyToClobberedRegisters(const MachineInstr &MI, const TargetRegisterInfo *TRI, - std::set<unsigned> &Regs) { + Callable Func) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef() || !MO.getReg()) continue; for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) - Regs.insert(*AI); + Func(*AI); } } @@ -133,11 +150,12 @@ static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) { // as the return instruction. DebugLoc LastLoc = LastMI->getDebugLoc(); auto Res = LastMI; - for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)); I != MBB.rend(); - ++I) { + for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)), + E = MBB.rend(); + I != E; ++I) { if (I->getDebugLoc() != LastLoc) return Res; - Res = std::prev(I.base()); + Res = &*I; } // If all instructions have the same debug location, assume whole MBB is // an epilogue. @@ -145,25 +163,26 @@ static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) { } // \brief Collect registers that are modified in the function body (their -// contents is changed only in the prologue and epilogue). +// contents is changed outside of the prologue and epilogue). static void collectChangingRegs(const MachineFunction *MF, const TargetRegisterInfo *TRI, - std::set<unsigned> &Regs) { + BitVector &Regs) { for (const auto &MBB : *MF) { auto FirstEpilogueInst = getFirstEpilogueInst(MBB); - bool IsInEpilogue = false; + for (const auto &MI : MBB) { - IsInEpilogue |= &MI == FirstEpilogueInst; - if (!MI.getFlag(MachineInstr::FrameSetup) && !IsInEpilogue) - collectClobberedRegisters(MI, TRI, Regs); + if (&MI == FirstEpilogueInst) + break; + if (!MI.getFlag(MachineInstr::FrameSetup)) + applyToClobberedRegisters(MI, TRI, [&](unsigned r) { Regs.set(r); }); } } } -void calculateDbgValueHistory(const MachineFunction *MF, - const TargetRegisterInfo *TRI, - DbgValueHistoryMap &Result) { - std::set<unsigned> ChangingRegs; +void llvm::calculateDbgValueHistory(const MachineFunction *MF, + const TargetRegisterInfo *TRI, + DbgValueHistoryMap &Result) { + BitVector ChangingRegs(TRI->getNumRegs()); collectChangingRegs(MF, TRI, ChangingRegs); RegDescribedVarsMap RegVars; @@ -172,17 +191,18 @@ void calculateDbgValueHistory(const MachineFunction *MF, if (!MI.isDebugValue()) { // Not a DBG_VALUE instruction. It may clobber registers which describe // some variables. 
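The applyToClobberedRegisters rewrite above deliberately takes the callback as a template parameter rather than std::function; per the patch's own comment, the type-erasure cost of std::function is small but measurable on this hot path, while a template lets the lambda inline away. The shape of the pattern in isolation:

```cpp
#include <cstdio>

// Callable is deduced per call site, so the lambda below is invoked
// directly with no virtual dispatch or heap allocation.
template <typename Callable>
void forEachClobberedReg(const unsigned *Regs, unsigned N, Callable Func) {
  for (unsigned i = 0; i != N; ++i)
    Func(Regs[i]);
}

int main() {
  unsigned Clobbers[] = {3, 7, 42};
  forEachClobberedReg(Clobbers, 3,
                      [](unsigned R) { std::printf("clobber r%u\n", R); });
}
```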
- std::set<unsigned> MIClobberedRegs; - collectClobberedRegisters(MI, TRI, MIClobberedRegs); - for (unsigned RegNo : MIClobberedRegs) { - if (ChangingRegs.count(RegNo)) + applyToClobberedRegisters(MI, TRI, [&](unsigned RegNo) { + if (ChangingRegs.test(RegNo)) clobberRegisterUses(RegVars, RegNo, Result, MI); - } + }); continue; } assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!"); - const MDNode *Var = MI.getDebugVariable(); + // Use the base variable (without any DW_OP_piece expressions) + // as index into History. The full variables including the + // piece expressions are attached to the MI. + DIVariable Var = MI.getDebugVariable(); if (unsigned PrevReg = Result.getRegisterForVar(Var)) dropRegDescribedVar(RegVars, PrevReg, Var); @@ -196,11 +216,12 @@ void calculateDbgValueHistory(const MachineFunction *MF, // Make sure locations for register-described variables are valid only // until the end of the basic block (unless it's the last basic block, in // which case let their liveness run off to the end of the function). - if (!MBB.empty() && &MBB != &MF->back()) { - for (unsigned RegNo : ChangingRegs) - clobberRegisterUses(RegVars, RegNo, Result, MBB.back()); + if (!MBB.empty() && &MBB != &MF->back()) { + for (auto I = RegVars.begin(), E = RegVars.end(); I != E;) { + auto CurElem = I++; // CurElem can be erased below. + if (ChangingRegs.test(CurElem->first)) + clobberRegisterUses(RegVars, CurElem, Result, MBB.back()); + } } } } - -} diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h index b9177f05950e..4b6200717d26 100644 --- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h +++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_ -#define CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" @@ -28,9 +28,11 @@ class DbgValueHistoryMap { // range. If end is not specified, location is valid until the start // instruction of the next instruction range, or until the end of the // function. +public: typedef std::pair<const MachineInstr *, const MachineInstr *> InstrRange; typedef SmallVector<InstrRange, 4> InstrRanges; typedef MapVector<const MDNode *, InstrRanges> InstrRangesMap; +private: InstrRangesMap VarInstrRanges; public: diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h index 3beb799203b7..b4fcada59ecc 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -7,14 +7,14 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H__ -#define CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H #include "llvm/IR/Constants.h" -#include "llvm/MC/MachineLocation.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MachineLocation.h" namespace llvm { -class DwarfCompileUnit; class MDNode; /// \brief This struct describes location entries emitted in the .debug_loc /// section. @@ -26,25 +26,30 @@ class DebugLocEntry { public: /// A single location or constant. 
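The same hunks swap std::set<unsigned> for a BitVector sized to TRI->getNumRegs(): physical register numbers are small and dense, so a flat bit array gives allocation-free O(1) test and set. A sketch of the idea (llvm::BitVector provides this in-tree; DenseRegSet here is only illustrative):

```cpp
#include <vector>

// One bit per physical register; no per-insert allocation, constant
// time membership tests, cheap to clear and reuse.
class DenseRegSet {
  std::vector<bool> Bits;
public:
  explicit DenseRegSet(unsigned NumRegs) : Bits(NumRegs, false) {}
  void set(unsigned Reg) { Bits[Reg] = true; }
  bool test(unsigned Reg) const { return Bits[Reg]; }
};
```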
struct Value { - Value(const MDNode *Var, int64_t i) - : Variable(Var), EntryKind(E_Integer) { + Value(const MDNode *Var, const MDNode *Expr, int64_t i) + : Variable(Var), Expression(Expr), EntryKind(E_Integer) { Constant.Int = i; } - Value(const MDNode *Var, const ConstantFP *CFP) - : Variable(Var), EntryKind(E_ConstantFP) { + Value(const MDNode *Var, const MDNode *Expr, const ConstantFP *CFP) + : Variable(Var), Expression(Expr), EntryKind(E_ConstantFP) { Constant.CFP = CFP; } - Value(const MDNode *Var, const ConstantInt *CIP) - : Variable(Var), EntryKind(E_ConstantInt) { + Value(const MDNode *Var, const MDNode *Expr, const ConstantInt *CIP) + : Variable(Var), Expression(Expr), EntryKind(E_ConstantInt) { Constant.CIP = CIP; } - Value(const MDNode *Var, MachineLocation Loc) - : Variable(Var), EntryKind(E_Location), Loc(Loc) { + Value(const MDNode *Var, const MDNode *Expr, MachineLocation Loc) + : Variable(Var), Expression(Expr), EntryKind(E_Location), Loc(Loc) { + assert(DIVariable(Var).Verify()); + assert(DIExpression(Expr).Verify()); } // The variable to which this location entry corresponds. const MDNode *Variable; + // Any complex address location expression for this Value. + const MDNode *Expression; + // Type of entry that this represents. enum EntryType { E_Location, E_Integer, E_ConstantFP, E_ConstantInt }; enum EntryType EntryKind; @@ -59,23 +64,6 @@ public: // Or a location in the machine frame. MachineLocation Loc; - bool operator==(const Value &other) const { - if (EntryKind != other.EntryKind) - return false; - - switch (EntryKind) { - case E_Location: - return Loc == other.Loc; - case E_Integer: - return Constant.Int == other.Constant.Int; - case E_ConstantFP: - return Constant.CFP == other.Constant.CFP; - case E_ConstantInt: - return Constant.CIP == other.Constant.CIP; - } - llvm_unreachable("unhandled EntryKind"); - } - bool isLocation() const { return EntryKind == E_Location; } bool isInt() const { return EntryKind == E_Integer; } bool isConstantFP() const { return EntryKind == E_ConstantFP; } @@ -84,40 +72,114 @@ public: const ConstantFP *getConstantFP() const { return Constant.CFP; } const ConstantInt *getConstantInt() const { return Constant.CIP; } MachineLocation getLoc() const { return Loc; } - const MDNode *getVariable() const { return Variable; } + const MDNode *getVariableNode() const { return Variable; } + DIVariable getVariable() const { return DIVariable(Variable); } + bool isVariablePiece() const { return getExpression().isVariablePiece(); } + DIExpression getExpression() const { return DIExpression(Expression); } + friend bool operator==(const Value &, const Value &); + friend bool operator<(const Value &, const Value &); }; + private: - /// A list of locations/constants belonging to this entry. + /// A nonempty list of locations/constants belonging to this entry, + /// sorted by offset. SmallVector<Value, 1> Values; - /// The compile unit that this location entry is referenced by. - const DwarfCompileUnit *Unit; - public: - DebugLocEntry() : Begin(nullptr), End(nullptr), Unit(nullptr) {} - DebugLocEntry(const MCSymbol *B, const MCSymbol *E, - Value Val, const DwarfCompileUnit *U) - : Begin(B), End(E), Unit(U) { + DebugLocEntry(const MCSymbol *B, const MCSymbol *E, Value Val) + : Begin(B), End(E) { Values.push_back(std::move(Val)); } + /// \brief If this and Next are describing different pieces of the same + // variable, merge them by appending Next's values to the current + // list of values. + // Return true if the merge was successful. 
+ bool MergeValues(const DebugLocEntry &Next) { + if (Begin == Next.Begin) { + DIExpression Expr(Values[0].Expression); + DIVariable Var(Values[0].Variable); + DIExpression NextExpr(Next.Values[0].Expression); + DIVariable NextVar(Next.Values[0].Variable); + if (Var == NextVar && Expr.isVariablePiece() && + NextExpr.isVariablePiece()) { + addValues(Next.Values); + End = Next.End; + return true; + } + } + return false; + } + /// \brief Attempt to merge this DebugLocEntry with Next and return /// true if the merge was successful. Entries can be merged if they /// share the same Loc/Constant and if Next immediately follows this /// Entry. - bool Merge(const DebugLocEntry &Next) { + bool MergeRanges(const DebugLocEntry &Next) { + // If this and Next are describing the same variable, merge them. if ((End == Next.Begin && Values == Next.Values)) { End = Next.End; return true; } return false; } + const MCSymbol *getBeginSym() const { return Begin; } const MCSymbol *getEndSym() const { return End; } - const DwarfCompileUnit *getCU() const { return Unit; } - const ArrayRef<Value> getValues() const { return Values; } - void addValue(Value Val) { Values.push_back(Val); } + ArrayRef<Value> getValues() const { return Values; } + void addValues(ArrayRef<DebugLocEntry::Value> Vals) { + Values.append(Vals.begin(), Vals.end()); + sortUniqueValues(); + assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value V){ + return V.isVariablePiece(); + }) && "value must be a piece"); + } + + // Sort the pieces by offset. + // Remove any duplicate entries by dropping all but the first. + void sortUniqueValues() { + std::sort(Values.begin(), Values.end()); + Values.erase(std::unique(Values.begin(), Values.end(), + [](const Value &A, const Value &B) { + return A.getVariable() == B.getVariable() && + A.getExpression() == B.getExpression(); + }), + Values.end()); + } }; +/// Compare two Values for equality. +inline bool operator==(const DebugLocEntry::Value &A, + const DebugLocEntry::Value &B) { + if (A.EntryKind != B.EntryKind) + return false; + + if (A.Expression != B.Expression) + return false; + + if (A.Variable != B.Variable) + return false; + + switch (A.EntryKind) { + case DebugLocEntry::Value::E_Location: + return A.Loc == B.Loc; + case DebugLocEntry::Value::E_Integer: + return A.Constant.Int == B.Constant.Int; + case DebugLocEntry::Value::E_ConstantFP: + return A.Constant.CFP == B.Constant.CFP; + case DebugLocEntry::Value::E_ConstantInt: + return A.Constant.CIP == B.Constant.CIP; + } + llvm_unreachable("unhandled EntryKind"); +} + +/// Compare two pieces based on their offset. 
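sortUniqueValues above is the classic sort + unique + erase idiom: order the pieces by offset, then drop all but the first of any duplicates. In the real code the duplicate key is the variable plus its expression; this sketch reduces the key to the offset alone:

```cpp
#include <algorithm>
#include <vector>

struct Piece {
  unsigned Offset; // bit offset of this piece within the variable
  int Payload;     // stand-in for the location/constant value
};

void sortUnique(std::vector<Piece> &Pieces) {
  std::sort(Pieces.begin(), Pieces.end(),
            [](const Piece &A, const Piece &B) { return A.Offset < B.Offset; });
  // std::unique keeps the first element of each run of duplicates.
  Pieces.erase(std::unique(Pieces.begin(), Pieces.end(),
                           [](const Piece &A, const Piece &B) {
                             return A.Offset == B.Offset;
                           }),
               Pieces.end());
}
```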
+inline bool operator<(const DebugLocEntry::Value &A, + const DebugLocEntry::Value &B) { + return A.getExpression().getPieceOffset() < + B.getExpression().getPieceOffset(); } + +} + #endif diff --git a/lib/CodeGen/AsmPrinter/DebugLocList.h b/lib/CodeGen/AsmPrinter/DebugLocList.h index 7a51c7bd5b0c..0f1d2edd425c 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocList.h +++ b/lib/CodeGen/AsmPrinter/DebugLocList.h @@ -7,16 +7,18 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DEBUGLOCLIST_H__ -#define CODEGEN_ASMPRINTER_DEBUGLOCLIST_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H -#include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/SmallVector.h" #include "DebugLocEntry.h" +#include "llvm/ADT/SmallVector.h" namespace llvm { +class DwarfCompileUnit; +class MCSymbol; struct DebugLocList { MCSymbol *Label; + DwarfCompileUnit *CU; SmallVector<DebugLocEntry, 4> List; }; } diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index e9527c458af3..a71f35e1232d 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -12,11 +12,12 @@ //===----------------------------------------------------------------------===// #include "DwarfAccelTable.h" -#include "DIE.h" +#include "DwarfCompileUnit.h" #include "DwarfDebug.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -174,7 +175,8 @@ void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) { // Walk through the buckets and emit the full data for each element in // the bucket. For the string case emit the dies and the various offsets. // Terminate each HashData bucket with 0. -void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfFile *D) { +void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfDebug *D, + MCSymbol *StrSym) { uint64_t PrevHash = UINT64_MAX; for (size_t i = 0, e = Buckets.size(); i < e; ++i) { for (HashList::const_iterator HI = Buckets[i].begin(), @@ -183,13 +185,14 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfFile *D) { // Remember to emit the label for our offset. Asm->OutStreamer.EmitLabel((*HI)->Sym); Asm->OutStreamer.AddComment((*HI)->Str); - Asm->EmitSectionOffset((*HI)->Data.StrSym, - D->getStringPool().getSectionSymbol()); + Asm->EmitSectionOffset((*HI)->Data.StrSym, StrSym); Asm->OutStreamer.AddComment("Num DIEs"); Asm->EmitInt32((*HI)->Data.Values.size()); for (HashDataContents *HD : (*HI)->Data.Values) { // Emit the DIE offset - Asm->EmitInt32(HD->Die->getOffset()); + DwarfCompileUnit *CU = D->lookupUnit(HD->Die->getUnit()); + assert(CU && "Accelerated DIE should belong to a CU."); + Asm->EmitInt32(HD->Die->getOffset() + CU->getDebugInfoOffset()); // If we have multiple Atoms emit that info too. // FIXME: A bit of a hack, we either emit only one atom or all info. if (HeaderData.Atoms.size() > 1) { @@ -206,7 +209,8 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfFile *D) { } // Emit the entire data structure to the output file. -void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfFile *D) { +void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfDebug *D, + MCSymbol *StrSym) { // Emit the header. 
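The one functional change in the DwarfAccelTable::EmitData hunk below is offset rebasing: a DIE's stored offset is relative to its unit, but the accelerator table needs a .debug_info-section-relative value, hence the added CU->getDebugInfoOffset() term. As plain arithmetic:

```cpp
#include <cstdint>

// Rebase a unit-relative DIE offset to be relative to the start of
// the .debug_info section, as the accelerator table expects.
uint32_t sectionRelativeOffset(uint32_t DieOffsetInUnit,
                               uint32_t UnitOffsetInSection) {
  return UnitOffsetInSection + DieOffsetInUnit;
}
```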
EmitHeader(Asm); @@ -220,7 +224,7 @@ void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfFile *D) { EmitOffsets(Asm, SecBegin); // Emit the hash data. - EmitData(Asm, D); + EmitData(Asm, D, StrSym); } #ifndef NDEBUG diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h index a3cc95f75b52..4a6085b46fc5 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__ -#define CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H -#include "DIE.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" @@ -62,7 +62,7 @@ namespace llvm { class AsmPrinter; -class DwarfFile; +class DwarfDebug; class DwarfAccelTable { @@ -140,7 +140,7 @@ public: private: struct TableHeaderData { uint32_t die_offset_base; - SmallVector<Atom, 1> Atoms; + SmallVector<Atom, 3> Atoms; TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0) : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {} @@ -223,7 +223,7 @@ private: void EmitBuckets(AsmPrinter *); void EmitHashes(AsmPrinter *); void EmitOffsets(AsmPrinter *, MCSymbol *); - void EmitData(AsmPrinter *, DwarfFile *D); + void EmitData(AsmPrinter *, DwarfDebug *D, MCSymbol *StrSym); // Allocator for HashData and HashDataContents. BumpPtrAllocator Allocator; @@ -248,7 +248,7 @@ public: void AddName(StringRef Name, MCSymbol *StrSym, const DIE *Die, char Flags = 0); void FinalizeTable(AsmPrinter *, StringRef); - void Emit(AsmPrinter *, MCSymbol *, DwarfFile *); + void Emit(AsmPrinter *, MCSymbol *, DwarfDebug *, MCSymbol *StrSym); #ifndef NDEBUG void print(raw_ostream &O); void dump() { print(dbgs()); } diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 74215aa695dd..0dc52dae8f39 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -51,7 +51,7 @@ void DwarfCFIException::endModule() { if (moveTypeModule == AsmPrinter::CFI_M_Debug) Asm->OutStreamer.EmitCFISections(false, true); - if (!Asm->MAI->isExceptionHandlingDwarf()) + if (!Asm->MAI->usesItaniumLSDAForExceptions()) return; const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp new file mode 100644 index 000000000000..b4dba9c59236 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -0,0 +1,836 @@ +#include "DwarfCompileUnit.h" +#include "DwarfExpression.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instruction.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +namespace llvm { + +DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node, + AsmPrinter *A, DwarfDebug *DW, + DwarfFile *DWU) + : DwarfUnit(UID, 
dwarf::DW_TAG_compile_unit, Node, A, DW, DWU), + Skeleton(nullptr), LabelBegin(nullptr), BaseAddress(nullptr) { + insertDIE(Node, &getUnitDie()); +} + +/// addLabelAddress - Add a dwarf label attribute data and value using +/// DW_FORM_addr or DW_FORM_GNU_addr_index. +/// +void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label) { + + // Don't use the address pool in non-fission or in the skeleton unit itself. + // FIXME: Once GDB supports this, it's probably worthwhile using the address + // pool from the skeleton - maybe even in non-fission (possibly fewer + // relocations by sharing them in the pool, but we have other ideas about how + // to reduce the number of relocations as well/instead). + if (!DD->useSplitDwarf() || !Skeleton) + return addLocalLabelAddress(Die, Attribute, Label); + + if (Label) + DD->addArangeLabel(SymbolCU(this, Label)); + + unsigned idx = DD->getAddressPool().getIndex(Label); + DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx); + Die.addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value); +} + +void DwarfCompileUnit::addLocalLabelAddress(DIE &Die, + dwarf::Attribute Attribute, + const MCSymbol *Label) { + if (Label) + DD->addArangeLabel(SymbolCU(this, Label)); + + Die.addValue(Attribute, dwarf::DW_FORM_addr, + Label ? (DIEValue *)new (DIEValueAllocator) DIELabel(Label) + : new (DIEValueAllocator) DIEInteger(0)); +} + +unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName, + StringRef DirName) { + // If we print assembly, we can't separate .file entries according to + // compile units. Thus all files will belong to the default compile unit. + + // FIXME: add a better feature test than hasRawTextSupport. Even better, + // extend .file to support this. + return Asm->OutStreamer.EmitDwarfFileDirective( + 0, DirName, FileName, + Asm->OutStreamer.hasRawTextSupport() ? 0 : getUniqueID()); +} + +// Return const expression if value is a GEP to access merged global +// constant. e.g. +// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0) +static const ConstantExpr *getMergedGlobalExpr(const Value *V) { + const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V); + if (!CE || CE->getNumOperands() != 3 || + CE->getOpcode() != Instruction::GetElementPtr) + return nullptr; + + // First operand points to a global struct. + Value *Ptr = CE->getOperand(0); + if (!isa<GlobalValue>(Ptr) || + !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType())) + return nullptr; + + // Second operand is zero. + const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1)); + if (!CI || !CI->isZero()) + return nullptr; + + // Third operand is offset. + if (!isa<ConstantInt>(CE->getOperand(2))) + return nullptr; + + return CE; +} + +/// getOrCreateGlobalVariableDIE - get or create global variable DIE. +DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(DIGlobalVariable GV) { + // Check for pre-existence. + if (DIE *Die = getDIE(GV)) + return Die; + + assert(GV.isGlobalVariable()); + + DIScope GVContext = GV.getContext(); + DIType GTy = DD->resolve(GV.getType()); + + // Construct the context before querying for the existence of the DIE in + // case such construction creates the DIE. + DIE *ContextDIE = getOrCreateContextDIE(GVContext); + + // Add to map. 
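// Editor's aside on the "construct the context first" comment above:
// building a context DIE can recursively create the DIE for the entity
// itself, so the DwarfUnit creation helpers order context construction
// before the final create-and-add step. A sketch of the shape used
// elsewhere (e.g. for type DIEs), with placeholder names:
//
//   DIE *ContextDIE = getOrCreateContextDIE(Context); // may create our DIE
//   if (DIE *Existing = getDIE(Entity))               // so re-check after
//     return Existing;
//   return &createAndAddDIE(Tag, *ContextDIE, Entity);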
+ DIE *VariableDIE = &createAndAddDIE(GV.getTag(), *ContextDIE, GV); + DIScope DeclContext; + + if (DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration()) { + DeclContext = resolve(SDMDecl.getContext()); + assert(SDMDecl.isStaticMember() && "Expected static member decl"); + assert(GV.isDefinition()); + // We need the declaration DIE that is in the static member's class. + DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl); + addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE); + } else { + DeclContext = GV.getContext(); + // Add name and type. + addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName()); + addType(*VariableDIE, GTy); + + // Add scoping info. + if (!GV.isLocalToUnit()) + addFlag(*VariableDIE, dwarf::DW_AT_external); + + // Add line number info. + addSourceLine(*VariableDIE, GV); + } + + if (!GV.isDefinition()) + addFlag(*VariableDIE, dwarf::DW_AT_declaration); + + // Add location. + bool addToAccelTable = false; + bool isGlobalVariable = GV.getGlobal() != nullptr; + if (isGlobalVariable) { + addToAccelTable = true; + DIELoc *Loc = new (DIEValueAllocator) DIELoc(); + const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal()); + if (GV.getGlobal()->isThreadLocal()) { + // FIXME: Make this work with -gsplit-dwarf. + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size + addUInt(*Loc, dwarf::DW_FORM_data1, + PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. + addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); + } + // 3) followed by a custom OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address); + } else { + DD->addArangeLabel(SymbolCU(this, Sym)); + addOpAddress(*Loc, Sym); + } + + addBlock(*VariableDIE, dwarf::DW_AT_location, Loc); + // Add the linkage name. + StringRef LinkageName = GV.getLinkageName(); + if (!LinkageName.empty()) + // From DWARF4: DIEs to which DW_AT_linkage_name may apply include: + // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and + // TAG_variable. + addString(*VariableDIE, + DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name + : dwarf::DW_AT_MIPS_linkage_name, + GlobalValue::getRealLinkageName(LinkageName)); + } else if (const ConstantInt *CI = + dyn_cast_or_null<ConstantInt>(GV.getConstant())) { + addConstantValue(*VariableDIE, CI, GTy); + } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV.getConstant())) { + addToAccelTable = true; + // GV is a merged global. 
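// Editor's illustration of the location expression assembled below for a
// merged global (symbol and offset are hypothetical). Given
//   i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 2)
// the resulting DWARF location description is:
//
//   DW_OP_addr   _MergedGlobals  // base address of the merged blob
//   DW_OP_constu 2               // byte offset from getIndexedOffset()
//   DW_OP_plus                   // address of the original global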
+    DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+    Value *Ptr = CE->getOperand(0);
+    MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
+    DD->addArangeLabel(SymbolCU(this, Sym));
+    addOpAddress(*Loc, Sym);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+    addUInt(*Loc, dwarf::DW_FORM_udata,
+            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+  }
+
+  if (addToAccelTable) {
+    DD->addAccelName(GV.getName(), *VariableDIE);
+
+    // If the linkage name is different than the name, go ahead and output
+    // that as well into the name table.
+    if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
+      DD->addAccelName(GV.getLinkageName(), *VariableDIE);
+  }
+
+  addGlobalName(GV.getName(), *VariableDIE, DeclContext);
+  return VariableDIE;
+}
+
+void DwarfCompileUnit::addRange(RangeSpan Range) {
+  bool SameAsPrevCU = this == DD->getPrevCU();
+  DD->setPrevCU(this);
+  // If we have no current ranges, just add the range and return. Otherwise,
+  // compare the current section and CU against the previous section and CU
+  // we emitted into (the ones the last subprogram was contained within). If
+  // these are the same then extend our current range, otherwise add this as
+  // a new range.
+  if (CURanges.empty() || !SameAsPrevCU ||
+      (&CURanges.back().getEnd()->getSection() !=
+       &Range.getEnd()->getSection())) {
+    CURanges.push_back(Range);
+    return;
+  }
+
+  CURanges.back().setEnd(Range.getEnd());
+}
+
+void DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+                                       const MCSymbol *Label,
+                                       const MCSymbol *Sec) {
+  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+    addLabel(Die, Attribute,
+             DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                        : dwarf::DW_FORM_data4,
+             Label);
+  else
+    addSectionDelta(Die, Attribute, Label, Sec);
+}
+
+void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) {
+  // Define start line table label for each Compile Unit.
+  MCSymbol *LineTableStartSym =
+      Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID());
+
+  stmtListIndex = UnitDie.getValues().size();
+
+  // DW_AT_stmt_list is an offset of line number information for this
+  // compile unit in debug_line section. For split dwarf this is
+  // left in the skeleton CU and so not included.
+  // The line table entries are not always emitted in assembly, so it
+  // is not okay to use line_table_start here.
+  addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym,
+                  DwarfLineSectionSym);
+}
+
+void DwarfCompileUnit::applyStmtList(DIE &D) {
+  D.addValue(dwarf::DW_AT_stmt_list,
+             UnitDie.getAbbrev().getData()[stmtListIndex].getForm(),
+             UnitDie.getValues()[stmtListIndex]);
+}
+
+void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
+                                       const MCSymbol *End) {
+  assert(Begin && "Begin label should not be null!");
+  assert(End && "End label should not be null!");
+  assert(Begin->isDefined() && "Invalid starting label");
+  assert(End->isDefined() && "Invalid end label");
+
+  addLabelAddress(D, dwarf::DW_AT_low_pc, Begin);
+  if (DD->getDwarfVersion() < 4)
+    addLabelAddress(D, dwarf::DW_AT_high_pc, End);
+  else
+    addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin);
+}
+
+// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
+// and DW_AT_high_pc attributes. If there are global variables in this
+// scope then create and insert DIEs for these variables.
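// Editor's note on attachLowHighPC (defined above): before DWARF 4,
// DW_AT_high_pc had to be an address; DWARF 4 allows a constant offset from
// DW_AT_low_pc, which saves a relocation. In assembly terms, roughly
// (labels hypothetical):
//
//   .quad .Lfunc_begin0                // DW_AT_low_pc, any version
//   .quad .Lfunc_end0                  // DW_AT_high_pc, DWARF < 4
//   .long .Lfunc_end0-.Lfunc_begin0    // DW_AT_high_pc, DWARF >= 4 (delta)
//
// which is why the code branches on DD->getDwarfVersion().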
+DIE &DwarfCompileUnit::updateSubprogramScopeDIE(DISubprogram SP) { + DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes()); + + attachLowHighPC(*SPDie, DD->getFunctionBeginSym(), DD->getFunctionEndSym()); + if (!DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim( + *DD->getCurrentFunction())) + addFlag(*SPDie, dwarf::DW_AT_APPLE_omit_frame_ptr); + + // Only include DW_AT_frame_base in full debug info + if (!includeMinimalInlineScopes()) { + const TargetRegisterInfo *RI = + Asm->TM.getSubtargetImpl()->getRegisterInfo(); + MachineLocation Location(RI->getFrameRegister(*Asm->MF)); + if (RI->isPhysicalRegister(Location.getReg())) + addAddress(*SPDie, dwarf::DW_AT_frame_base, Location); + } + + // Add name to the name table, we do this here because we're guaranteed + // to have concrete versions of our DW_TAG_subprogram nodes. + DD->addSubprogramNames(SP, *SPDie); + + return *SPDie; +} + +// Construct a DIE for this scope. +void DwarfCompileUnit::constructScopeDIE( + LexicalScope *Scope, SmallVectorImpl<std::unique_ptr<DIE>> &FinalChildren) { + if (!Scope || !Scope->getScopeNode()) + return; + + DIScope DS(Scope->getScopeNode()); + + assert((Scope->getInlinedAt() || !DS.isSubprogram()) && + "Only handle inlined subprograms here, use " + "constructSubprogramScopeDIE for non-inlined " + "subprograms"); + + SmallVector<std::unique_ptr<DIE>, 8> Children; + + // We try to create the scope DIE first, then the children DIEs. This will + // avoid creating un-used children then removing them later when we find out + // the scope DIE is null. + std::unique_ptr<DIE> ScopeDIE; + if (Scope->getParent() && DS.isSubprogram()) { + ScopeDIE = constructInlinedScopeDIE(Scope); + if (!ScopeDIE) + return; + // We create children when the scope DIE is not null. + createScopeChildrenDIE(Scope, Children); + } else { + // Early exit when we know the scope DIE is going to be null. + if (DD->isLexicalScopeDIENull(Scope)) + return; + + unsigned ChildScopeCount; + + // We create children here when we know the scope DIE is not going to be + // null and the children will be added to the scope DIE. + createScopeChildrenDIE(Scope, Children, &ChildScopeCount); + + // Skip imported directives in gmlt-like data. + if (!includeMinimalInlineScopes()) { + // There is no need to emit empty lexical block DIE. + for (const auto &E : DD->findImportedEntitiesForScope(DS)) + Children.push_back( + constructImportedEntityDIE(DIImportedEntity(E.second))); + } + + // If there are only other scopes as children, put them directly in the + // parent instead, as this scope would serve no purpose. + if (Children.size() == ChildScopeCount) { + FinalChildren.insert(FinalChildren.end(), + std::make_move_iterator(Children.begin()), + std::make_move_iterator(Children.end())); + return; + } + ScopeDIE = constructLexicalScopeDIE(Scope); + assert(ScopeDIE && "Scope DIE should not be null."); + } + + // Add children + for (auto &I : Children) + ScopeDIE->addChild(std::move(I)); + + FinalChildren.push_back(std::move(ScopeDIE)); +} + +void DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Hi, const MCSymbol *Lo) { + DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo); + Die.addValue(Attribute, DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4, + Value); +} + +void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE, + SmallVector<RangeSpan, 2> Range) { + // Emit offset in .debug_range as a relocatable label. 
emitDIE will handle + // emitting it appropriately. + auto *RangeSectionSym = DD->getRangeSectionSym(); + + RangeSpanList List( + Asm->GetTempSymbol("debug_ranges", DD->getNextRangeNumber()), + std::move(Range)); + + // Under fission, ranges are specified by constant offsets relative to the + // CU's DW_AT_GNU_ranges_base. + if (isDwoUnit()) + addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), + RangeSectionSym); + else + addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(), + RangeSectionSym); + + // Add the range list to the set of ranges to be emitted. + (Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List)); +} + +void DwarfCompileUnit::attachRangesOrLowHighPC( + DIE &Die, SmallVector<RangeSpan, 2> Ranges) { + if (Ranges.size() == 1) { + const auto &single = Ranges.front(); + attachLowHighPC(Die, single.getStart(), single.getEnd()); + } else + addScopeRangeList(Die, std::move(Ranges)); +} + +void DwarfCompileUnit::attachRangesOrLowHighPC( + DIE &Die, const SmallVectorImpl<InsnRange> &Ranges) { + SmallVector<RangeSpan, 2> List; + List.reserve(Ranges.size()); + for (const InsnRange &R : Ranges) + List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first), + DD->getLabelAfterInsn(R.second))); + attachRangesOrLowHighPC(Die, std::move(List)); +} + +// This scope represents inlined body of a function. Construct DIE to +// represent this concrete inlined copy of the function. +std::unique_ptr<DIE> +DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) { + assert(Scope->getScopeNode()); + DIScope DS(Scope->getScopeNode()); + DISubprogram InlinedSP = getDISubprogram(DS); + // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram + // was inlined from another compile unit. + DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP]; + assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); + + auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_inlined_subroutine); + addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE); + + attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges()); + + // Add the call site information to the DIE. + DILocation DL(Scope->getInlinedAt()); + addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None, + getOrCreateSourceID(DL.getFilename(), DL.getDirectory())); + addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber()); + + // Add name to the name table, we do this here because we're guaranteed + // to have concrete versions of our DW_TAG_inlined_subprogram nodes. + DD->addSubprogramNames(InlinedSP, *ScopeDIE); + + return ScopeDIE; +} + +// Construct new DW_TAG_lexical_block for this scope and attach +// DW_AT_low_pc/DW_AT_high_pc labels. +std::unique_ptr<DIE> +DwarfCompileUnit::constructLexicalScopeDIE(LexicalScope *Scope) { + if (DD->isLexicalScopeDIENull(Scope)) + return nullptr; + + auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_lexical_block); + if (Scope->isAbstractScope()) + return ScopeDIE; + + attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges()); + + return ScopeDIE; +} + +/// constructVariableDIE - Construct a DIE for the given DbgVariable. +std::unique_ptr<DIE> DwarfCompileUnit::constructVariableDIE(DbgVariable &DV, + bool Abstract) { + auto D = constructVariableDIEImpl(DV, Abstract); + DV.setDIE(*D); + return D; +} + +std::unique_ptr<DIE> +DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, + bool Abstract) { + // Define variable debug information entry. 
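// Editor's overview of the location sources tried below, in order (the
// DBG_VALUE shape is illustrative):
//
//   1. A .debug_loc list:  DV.getDotDebugLocOffset() != ~0U
//      -> DW_AT_location emitted as a loclistptr (see addLocationList).
//   2. A DBG_VALUE insn:   e.g. DBG_VALUE %RDI, 0, !var, !expr
//      operand 0 is a register/immediate, operand 1 an offset; an
//      immediate operand 1 marks a register-indirect (memory) value.
//   3. A frame index:      DV.getFrameIndex() != ~0, lowered through
//      TargetFrameLowering::getFrameIndexReference() to reg+offset.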
+  auto VariableDie = make_unique<DIE>(DV.getTag());
+
+  if (Abstract) {
+    applyVariableAttributes(DV, *VariableDie);
+    return VariableDie;
+  }
+
+  // Add variable address.
+
+  unsigned Offset = DV.getDotDebugLocOffset();
+  if (Offset != ~0U) {
+    addLocationList(*VariableDie, dwarf::DW_AT_location, Offset);
+    return VariableDie;
+  }
+
+  // Check if variable is described by a DBG_VALUE instruction.
+  if (const MachineInstr *DVInsn = DV.getMInsn()) {
+    assert(DVInsn->getNumOperands() == 4);
+    if (DVInsn->getOperand(0).isReg()) {
+      const MachineOperand RegOp = DVInsn->getOperand(0);
+      // If the second operand is an immediate, this is an indirect value.
+      if (DVInsn->getOperand(1).isImm()) {
+        MachineLocation Location(RegOp.getReg(),
+                                 DVInsn->getOperand(1).getImm());
+        addVariableAddress(DV, *VariableDie, Location);
+      } else if (RegOp.getReg())
+        addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg()));
+    } else if (DVInsn->getOperand(0).isImm())
+      addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
+    else if (DVInsn->getOperand(0).isFPImm())
+      addConstantFPValue(*VariableDie, DVInsn->getOperand(0));
+    else if (DVInsn->getOperand(0).isCImm())
+      addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(),
+                       DV.getType());
+
+    return VariableDie;
+  }
+
+  // .. else use frame index.
+  int FI = DV.getFrameIndex();
+  if (FI != ~0) {
+    unsigned FrameReg = 0;
+    const TargetFrameLowering *TFI =
+        Asm->TM.getSubtargetImpl()->getFrameLowering();
+    int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+    MachineLocation Location(FrameReg, Offset);
+    addVariableAddress(DV, *VariableDie, Location);
+  }
+
+  return VariableDie;
+}
+
+std::unique_ptr<DIE> DwarfCompileUnit::constructVariableDIE(
+    DbgVariable &DV, const LexicalScope &Scope, DIE *&ObjectPointer) {
+  auto Var = constructVariableDIE(DV, Scope.isAbstractScope());
+  if (DV.isObjectPointer())
+    ObjectPointer = Var.get();
+  return Var;
+}
+
+DIE *DwarfCompileUnit::createScopeChildrenDIE(
+    LexicalScope *Scope, SmallVectorImpl<std::unique_ptr<DIE>> &Children,
+    unsigned *ChildScopeCount) {
+  DIE *ObjectPointer = nullptr;
+
+  for (DbgVariable *DV : DU->getScopeVariables().lookup(Scope))
+    Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer));
+
+  unsigned ChildCountWithoutScopes = Children.size();
+
+  for (LexicalScope *LS : Scope->getChildren())
+    constructScopeDIE(LS, Children);
+
+  if (ChildScopeCount)
+    *ChildScopeCount = Children.size() - ChildCountWithoutScopes;
+
+  return ObjectPointer;
+}
+
+void DwarfCompileUnit::constructSubprogramScopeDIE(LexicalScope *Scope) {
+  assert(Scope && Scope->getScopeNode());
+  assert(!Scope->getInlinedAt());
+  assert(!Scope->isAbstractScope());
+  DISubprogram Sub(Scope->getScopeNode());
+
+  assert(Sub.isSubprogram());
+
+  DD->getProcessedSPNodes().insert(Sub);
+
+  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub);
+
+  // If this is a variadic function, add an unspecified parameter.
+  DITypeArray FnArgs = Sub.getType().getTypeArray();
+
+  // Collect lexical scope children first.
+  // ObjectPointer might be a (non-argument) local variable if it's a
+  // block's synthetic this pointer.
+  if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, ScopeDIE))
+    addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+  // If we have a single element of null, it is a function that returns void.
+  // If we have more than one element and the last one is null, it is a
+  // variadic function.
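// Editor's worked example (hypothetical signatures): for
//   void f();                 the subroutine type array is { null }
//   int  printf(char *, ...); the array is { int, char *, null }
// so "more than one element with a trailing null", tested below, singles
// out variadic functions rather than functions merely returning void.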
+ if (FnArgs.getNumElements() > 1 && + !FnArgs.getElement(FnArgs.getNumElements() - 1) && + !includeMinimalInlineScopes()) + ScopeDIE.addChild(make_unique<DIE>(dwarf::DW_TAG_unspecified_parameters)); +} + +DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, + DIE &ScopeDIE) { + // We create children when the scope DIE is not null. + SmallVector<std::unique_ptr<DIE>, 8> Children; + DIE *ObjectPointer = createScopeChildrenDIE(Scope, Children); + + // Add children + for (auto &I : Children) + ScopeDIE.addChild(std::move(I)); + + return ObjectPointer; +} + +void +DwarfCompileUnit::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { + DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()]; + if (AbsDef) + return; + + DISubprogram SP(Scope->getScopeNode()); + + DIE *ContextDIE; + + if (includeMinimalInlineScopes()) + ContextDIE = &getUnitDie(); + // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with + // the important distinction that the DIDescriptor is not associated with the + // DIE (since the DIDescriptor will be associated with the concrete DIE, if + // any). It could be refactored to some common utility function. + else if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { + ContextDIE = &getUnitDie(); + getOrCreateSubprogramDIE(SPDecl); + } else + ContextDIE = getOrCreateContextDIE(resolve(SP.getContext())); + + // Passing null as the associated DIDescriptor because the abstract definition + // shouldn't be found by lookup. + AbsDef = + &createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, DIDescriptor()); + applySubprogramAttributesToDefinition(SP, *AbsDef); + + if (!includeMinimalInlineScopes()) + addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); + if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, *AbsDef)) + addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer); +} + +std::unique_ptr<DIE> +DwarfCompileUnit::constructImportedEntityDIE(const DIImportedEntity &Module) { + assert(Module.Verify() && + "Use one of the MDNode * overloads to handle invalid metadata"); + std::unique_ptr<DIE> IMDie = make_unique<DIE>((dwarf::Tag)Module.getTag()); + insertDIE(Module, IMDie.get()); + DIE *EntityDie; + DIDescriptor Entity = resolve(Module.getEntity()); + if (Entity.isNameSpace()) + EntityDie = getOrCreateNameSpace(DINameSpace(Entity)); + else if (Entity.isSubprogram()) + EntityDie = getOrCreateSubprogramDIE(DISubprogram(Entity)); + else if (Entity.isType()) + EntityDie = getOrCreateTypeDIE(DIType(Entity)); + else if (Entity.isGlobalVariable()) + EntityDie = getOrCreateGlobalVariableDIE(DIGlobalVariable(Entity)); + else + EntityDie = getDIE(Entity); + assert(EntityDie); + addSourceLine(*IMDie, Module.getLineNumber(), + Module.getContext().getFilename(), + Module.getContext().getDirectory()); + addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie); + StringRef Name = Module.getName(); + if (!Name.empty()) + addString(*IMDie, dwarf::DW_AT_name, Name); + + return IMDie; +} + +void DwarfCompileUnit::finishSubprogramDefinition(DISubprogram SP) { + DIE *D = getDIE(SP); + if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) { + if (D) + // If this subprogram has an abstract definition, reference that + addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); + } else { + if (!D && !includeMinimalInlineScopes()) + // Lazily construct the subprogram if we didn't see either concrete or + // inlined versions during codegen. 
(except in -gmlt where we want
+      // to omit these entirely)
+      D = getOrCreateSubprogramDIE(SP);
+    if (D)
+      // And attach the attributes
+      applySubprogramAttributesToDefinition(SP, *D);
+  }
+}
+void DwarfCompileUnit::collectDeadVariables(DISubprogram SP) {
+  assert(SP.isSubprogram() && "CU's subprogram list contains a non-subprogram");
+  assert(SP.isDefinition() &&
+         "CU's subprogram list contains a subprogram declaration");
+  DIArray Variables = SP.getVariables();
+  if (Variables.getNumElements() == 0)
+    return;
+
+  DIE *SPDIE = DU->getAbstractSPDies().lookup(SP);
+  if (!SPDIE)
+    SPDIE = getDIE(SP);
+  assert(SPDIE);
+  for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
+    DIVariable DV(Variables.getElement(vi));
+    assert(DV.isVariable());
+    DbgVariable NewVar(DV, DIExpression(nullptr), DD);
+    auto VariableDie = constructVariableDIE(NewVar);
+    applyVariableAttributes(NewVar, *VariableDie);
+    SPDIE->addChild(std::move(VariableDie));
+  }
+}
+
+void DwarfCompileUnit::emitHeader(const MCSymbol *ASectionSym) const {
+  // Don't bother labeling the .dwo unit, as its offset isn't used.
+  if (!Skeleton)
+    Asm->OutStreamer.EmitLabel(LabelBegin);
+
+  DwarfUnit::emitHeader(ASectionSym);
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+                                     DIScope Context) {
+  if (includeMinimalInlineScopes())
+    return;
+  std::string FullName = getParentContextString(Context) + Name.str();
+  GlobalNames[FullName] = &Die;
+}
+
+/// Add a new global type to the unit.
+void DwarfCompileUnit::addGlobalType(DIType Ty, const DIE &Die,
+                                     DIScope Context) {
+  if (includeMinimalInlineScopes())
+    return;
+  std::string FullName = getParentContextString(Context) + Ty.getName().str();
+  GlobalTypes[FullName] = &Die;
+}
+
+/// addVariableAddress - Add DW_AT_location attribute for a
+/// DbgVariable based on provided MachineLocation.
+void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
+                                          MachineLocation Location) {
+  if (DV.variableHasComplexAddress())
+    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else if (DV.isBlockByrefVariable())
+    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else
+    addAddress(Die, dwarf::DW_AT_location, Location,
+               DV.getVariable().isIndirect());
+}
+
+/// Add an address attribute to a die based on the location provided.
+void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
+                                  const MachineLocation &Location,
+                                  bool Indirect) {
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+
+  bool validReg;
+  if (Location.isReg() && !Indirect)
+    validReg = addRegisterOpPiece(*Loc, Location.getReg());
+  else
+    validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  if (!validReg)
+    return;
+
+  if (!Location.isReg() && Indirect)
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, Loc);
+}
+
+/// Start with the address based on the location provided, and generate the
+/// DWARF information necessary to find the actual variable given the extra
+/// address information encoded in the DbgVariable, beginning from the
+/// starting location. Add the DWARF information to the die.
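// Editor's sketch of the two paths the implementation below takes through
// the new DIEDwarfExpression builder (interface as introduced by this
// patch):
//
//   // reg+offset: emit a base-register dereference first, then the user's
//   // complex expression on top of it.
//   DwarfExpr.AddMachineRegIndirect(Reg, Offset);
//   DwarfExpr.AddExpression(Expr);
//
//   // plain register: let the expression drive the lowering, adding a
//   // trailing DW_OP_deref only for indirect variables.
//   DwarfExpr.AddMachineRegExpression(Expr, Reg);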
+void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
+                                         dwarf::Attribute Attribute,
+                                         const MachineLocation &Location) {
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+  DIExpression Expr = DV.getExpression();
+  if (Location.getOffset()) {
+    if (DwarfExpr.AddMachineRegIndirect(Location.getReg(),
+                                        Location.getOffset())) {
+      DwarfExpr.AddExpression(Expr);
+      assert(!DV.getVariable().isIndirect()
+             && "double indirection not handled");
+    }
+  } else {
+    if (DwarfExpr.AddMachineRegExpression(Expr, Location.getReg()))
+      if (DV.getVariable().isIndirect())
+        DwarfExpr.EmitOp(dwarf::DW_OP_deref);
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, Loc);
+}
+
+/// Add a Dwarf loclistptr attribute data and value.
+void DwarfCompileUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
+                                       unsigned Index) {
+  DIEValue *Value = new (DIEValueAllocator) DIELocList(Index);
+  dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                                : dwarf::DW_FORM_data4;
+  Die.addValue(Attribute, Form, Value);
+}
+
+void DwarfCompileUnit::applyVariableAttributes(const DbgVariable &Var,
+                                               DIE &VariableDie) {
+  StringRef Name = Var.getName();
+  if (!Name.empty())
+    addString(VariableDie, dwarf::DW_AT_name, Name);
+  addSourceLine(VariableDie, Var.getVariable());
+  addType(VariableDie, Var.getType());
+  if (Var.isArtificial())
+    addFlag(VariableDie, dwarf::DW_AT_artificial);
+}
+
+/// Add a Dwarf expression attribute data and value.
+void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
+                               const MCExpr *Expr) {
+  DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
+  Die.addValue((dwarf::Attribute)0, Form, Value);
+}
+
+void DwarfCompileUnit::applySubprogramAttributesToDefinition(DISubprogram SP,
+                                                             DIE &SPDie) {
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext());
+  applySubprogramAttributes(SP, SPDie, includeMinimalInlineScopes());
+  addGlobalName(SP.getName(), SPDie, Context);
+}
+
+bool DwarfCompileUnit::isDwoUnit() const {
+  return DD->useSplitDwarf() && Skeleton;
+}
+
+bool DwarfCompileUnit::includeMinimalInlineScopes() const {
+  return getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly ||
+         (DD->useSplitDwarf() && !Skeleton);
+}
+} // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 000000000000..91164bc7aa2c
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,250 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF compile units.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H + +#include "DwarfUnit.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/Support/Dwarf.h" + +namespace llvm { + +class AsmPrinter; +class DIE; +class DwarfDebug; +class DwarfFile; +class MCSymbol; +class LexicalScope; + +class DwarfCompileUnit : public DwarfUnit { + /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding + /// the need to search for it in applyStmtList. + unsigned stmtListIndex; + + /// Skeleton unit associated with this unit. + DwarfCompileUnit *Skeleton; + + /// A label at the start of the non-dwo section related to this unit. + MCSymbol *SectionSym; + + /// The start of the unit within its section. + MCSymbol *LabelBegin; + + /// GlobalNames - A map of globally visible named entities for this unit. + StringMap<const DIE *> GlobalNames; + + /// GlobalTypes - A map of globally visible types for this unit. + StringMap<const DIE *> GlobalTypes; + + // List of range lists for a given compile unit, separate from the ranges for + // the CU itself. + SmallVector<RangeSpanList, 1> CURangeLists; + + // List of ranges for a given compile unit. + SmallVector<RangeSpan, 2> CURanges; + + // The base address of this unit, if any. Used for relative references in + // ranges/locs. + const MCSymbol *BaseAddress; + + /// \brief Construct a DIE for the given DbgVariable without initializing the + /// DbgVariable's DIE reference. + std::unique_ptr<DIE> constructVariableDIEImpl(const DbgVariable &DV, + bool Abstract); + + bool isDwoUnit() const override; + + bool includeMinimalInlineScopes() const; + +public: + DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, + DwarfDebug *DW, DwarfFile *DWU); + + DwarfCompileUnit *getSkeleton() const { + return Skeleton; + } + + void initStmtList(MCSymbol *DwarfLineSectionSym); + + /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. + void applyStmtList(DIE &D); + + /// getOrCreateGlobalVariableDIE - get or create global variable DIE. + DIE *getOrCreateGlobalVariableDIE(DIGlobalVariable GV); + + /// addLabelAddress - Add a dwarf label attribute data and value using + /// either DW_FORM_addr or DW_FORM_GNU_addr_index. + void addLabelAddress(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label); + + /// addLocalLabelAddress - Add a dwarf label attribute data and value using + /// DW_FORM_addr only. + void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label); + + /// addSectionDelta - Add a label delta attribute data and value. + void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, + const MCSymbol *Lo); + + DwarfCompileUnit &getCU() override { return *this; } + + unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; + + /// addRange - Add an address range to the list of ranges for this unit. + void addRange(RangeSpan Range); + + void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End); + + /// addSectionLabel - Add a Dwarf section label attribute data and value. + /// + void addSectionLabel(DIE &Die, dwarf::Attribute Attribute, + const MCSymbol *Label, const MCSymbol *Sec); + + /// \brief Find DIE for the given subprogram and attach appropriate + /// DW_AT_low_pc and DW_AT_high_pc attributes. 
If there are global + /// variables in this scope then create and insert DIEs for these + /// variables. + DIE &updateSubprogramScopeDIE(DISubprogram SP); + + void constructScopeDIE(LexicalScope *Scope, + SmallVectorImpl<std::unique_ptr<DIE>> &FinalChildren); + + /// \brief A helper function to construct a RangeSpanList for a given + /// lexical scope. + void addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range); + + void attachRangesOrLowHighPC(DIE &D, SmallVector<RangeSpan, 2> Ranges); + + void attachRangesOrLowHighPC(DIE &D, + const SmallVectorImpl<InsnRange> &Ranges); + /// \brief This scope represents inlined body of a function. Construct + /// DIE to represent this concrete inlined copy of the function. + std::unique_ptr<DIE> constructInlinedScopeDIE(LexicalScope *Scope); + + /// \brief Construct new DW_TAG_lexical_block for this scope and + /// attach DW_AT_low_pc/DW_AT_high_pc labels. + std::unique_ptr<DIE> constructLexicalScopeDIE(LexicalScope *Scope); + + /// constructVariableDIE - Construct a DIE for the given DbgVariable. + std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV, + bool Abstract = false); + + std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV, + const LexicalScope &Scope, + DIE *&ObjectPointer); + + /// A helper function to create children of a Scope DIE. + DIE *createScopeChildrenDIE(LexicalScope *Scope, + SmallVectorImpl<std::unique_ptr<DIE>> &Children, + unsigned *ChildScopeCount = nullptr); + + /// \brief Construct a DIE for this subprogram scope. + void constructSubprogramScopeDIE(LexicalScope *Scope); + + DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE); + + void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + + /// \brief Construct import_module DIE. + std::unique_ptr<DIE> + constructImportedEntityDIE(const DIImportedEntity &Module); + + void finishSubprogramDefinition(DISubprogram SP); + + void collectDeadVariables(DISubprogram SP); + + /// Set the skeleton unit associated with this unit. + void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; } + + MCSymbol *getSectionSym() const { + assert(Section); + return SectionSym; + } + + /// Pass in the SectionSym even though we could recreate it in every compile + /// unit (type units will have actually distinct symbols once they're in + /// comdat sections). + void initSection(const MCSection *Section, MCSymbol *SectionSym) { + DwarfUnit::initSection(Section); + this->SectionSym = SectionSym; + + // Don't bother labeling the .dwo unit, as its offset isn't used. + if (!Skeleton) + LabelBegin = + Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); + } + + unsigned getLength() { + return sizeof(uint32_t) + // Length field + getHeaderSize() + UnitDie.getSize(); + } + + void emitHeader(const MCSymbol *ASectionSym) const override; + + MCSymbol *getLabelBegin() const { + assert(Section); + return LabelBegin; + } + + /// Add a new global name to the compile unit. + void addGlobalName(StringRef Name, DIE &Die, DIScope Context) override; + + /// Add a new global type to the compile unit. + void addGlobalType(DIType Ty, const DIE &Die, DIScope Context) override; + + const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; } + const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } + + /// Add DW_AT_location attribute for a DbgVariable based on provided + /// MachineLocation. 
+ void addVariableAddress(const DbgVariable &DV, DIE &Die, + MachineLocation Location); + /// Add an address attribute to a die based on the location provided. + void addAddress(DIE &Die, dwarf::Attribute Attribute, + const MachineLocation &Location, bool Indirect = false); + + /// Start with the address based on the location provided, and generate the + /// DWARF information necessary to find the actual variable (navigating the + /// extra location information encoded in the type) based on the starting + /// location. Add the DWARF information to the die. + void addComplexAddress(const DbgVariable &DV, DIE &Die, + dwarf::Attribute Attribute, + const MachineLocation &Location); + + /// Add a Dwarf loclistptr attribute data and value. + void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index); + void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie); + + /// Add a Dwarf expression attribute data and value. + void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr); + + void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie); + + /// getRangeLists - Get the vector of range lists. + const SmallVectorImpl<RangeSpanList> &getRangeLists() const { + return (Skeleton ? Skeleton : this)->CURangeLists; + } + + /// getRanges - Get the list of ranges for this unit. + const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; } + SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); } + + void setBaseAddress(const MCSymbol *Base) { BaseAddress = Base; } + const MCSymbol *getBaseAddress() const { return BaseAddress; } +}; + +} // end llvm namespace + +#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ac1c0ffb3cc0..a587b46c4ba8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -11,15 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "ByteStreamer.h" #include "DwarfDebug.h" -#include "DIE.h" +#include "ByteStreamer.h" #include "DIEHash.h" +#include "DwarfCompileUnit.h" +#include "DwarfExpression.h" #include "DwarfUnit.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/Constants.h" @@ -48,6 +50,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "dwarfdebug" @@ -150,7 +153,7 @@ DIType DbgVariable::getType() const { if (tag == dwarf::DW_TAG_pointer_type) subType = resolve(DIDerivedType(Ty).getTypeDerivedFrom()); - DIArray Elements = DICompositeType(subType).getTypeArray(); + DIArray Elements = DICompositeType(subType).getElements(); for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { DIDerivedType DT(Elements.getElement(i)); if (getName() == DT.getName()) @@ -166,10 +169,11 @@ static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = { DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)}; DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) - : Asm(A), MMI(Asm->MMI), FirstCU(nullptr), PrevLabel(nullptr), - GlobalRangeCount(0), InfoHolder(A, "info_string", DIEValueAllocator), + : Asm(A), MMI(Asm->MMI), PrevLabel(nullptr), GlobalRangeCount(0), + InfoHolder(A, *this, 
"info_string", DIEValueAllocator), UsedNonDefaultText(false), - SkeletonHolder(A, "skel_string", DIEValueAllocator), + SkeletonHolder(A, *this, "skel_string", DIEValueAllocator), + IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()), AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, @@ -189,8 +193,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) // Turn on accelerator tables for Darwin by default, pubnames by // default for non-Darwin, and handle split dwarf. - bool IsDarwin = Triple(A->getTargetTriple()).isOSDarwin(); - if (DwarfAccelTables == Default) HasDwarfAccelTables = IsDarwin; else @@ -309,26 +311,6 @@ bool DwarfDebug::isSubprogramContext(const MDNode *Context) { return false; } -// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc -// and DW_AT_high_pc attributes. If there are global variables in this -// scope then create and insert DIEs for these variables. -DIE &DwarfDebug::updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, - DISubprogram SP) { - DIE *SPDie = SPCU.getOrCreateSubprogramDIE(SP); - - attachLowHighPC(SPCU, *SPDie, FunctionBeginSym, FunctionEndSym); - - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - MachineLocation Location(RI->getFrameRegister(*Asm->MF)); - SPCU.addAddress(*SPDie, dwarf::DW_AT_frame_base, Location); - - // Add name to the name table, we do this here because we're guaranteed - // to have concrete versions of our DW_TAG_subprogram nodes. - addSubprogramNames(SP, *SPDie); - - return *SPDie; -} - /// Check whether we should create a DIE for the given Scope, return true /// if we don't create a DIE (the corresponding DIE is null). bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) { @@ -345,271 +327,30 @@ bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) { // We don't create a DIE if we have a single Range and the end label // is null. - SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin(); - MCSymbol *End = getLabelAfterInsn(RI->second); - return !End; -} - -static void addSectionLabel(AsmPrinter &Asm, DwarfUnit &U, DIE &D, - dwarf::Attribute A, const MCSymbol *L, - const MCSymbol *Sec) { - if (Asm.MAI->doesDwarfUseRelocationsAcrossSections()) - U.addSectionLabel(D, A, L); - else - U.addSectionDelta(D, A, L, Sec); -} - -void DwarfDebug::addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE, - const SmallVectorImpl<InsnRange> &Range) { - // Emit offset in .debug_range as a relocatable label. emitDIE will handle - // emitting it appropriately. - MCSymbol *RangeSym = Asm->GetTempSymbol("debug_ranges", GlobalRangeCount++); - - // Under fission, ranges are specified by constant offsets relative to the - // CU's DW_AT_GNU_ranges_base. - if (useSplitDwarf()) - TheCU.addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, RangeSym, - DwarfDebugRangeSectionSym); - else - addSectionLabel(*Asm, TheCU, ScopeDIE, dwarf::DW_AT_ranges, RangeSym, - DwarfDebugRangeSectionSym); - - RangeSpanList List(RangeSym); - for (const InsnRange &R : Range) { - RangeSpan Span(getLabelBeforeInsn(R.first), getLabelAfterInsn(R.second)); - List.addRange(std::move(Span)); - } - - // Add the range list to the set of ranges to be emitted. 
- TheCU.addRangeList(std::move(List)); -} - -void DwarfDebug::attachRangesOrLowHighPC(DwarfCompileUnit &TheCU, DIE &Die, - const SmallVectorImpl<InsnRange> &Ranges) { - assert(!Ranges.empty()); - if (Ranges.size() == 1) - attachLowHighPC(TheCU, Die, getLabelBeforeInsn(Ranges.front().first), - getLabelAfterInsn(Ranges.front().second)); - else - addScopeRangeList(TheCU, Die, Ranges); -} - -// Construct new DW_TAG_lexical_block for this scope and attach -// DW_AT_low_pc/DW_AT_high_pc labels. -std::unique_ptr<DIE> -DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { - if (isLexicalScopeDIENull(Scope)) - return nullptr; - - auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_lexical_block); - if (Scope->isAbstractScope()) - return ScopeDIE; - - attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges()); - - return ScopeDIE; + return !getLabelAfterInsn(Ranges.front().second); } -// This scope represents inlined body of a function. Construct DIE to -// represent this concrete inlined copy of the function. -std::unique_ptr<DIE> -DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { - assert(Scope->getScopeNode()); - DIScope DS(Scope->getScopeNode()); - DISubprogram InlinedSP = getDISubprogram(DS); - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram - // was inlined from another compile unit. - DIE *OriginDIE = AbstractSPDies[InlinedSP]; - assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); - - auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_inlined_subroutine); - TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE); - - attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges()); - - InlinedSubprogramDIEs.insert(OriginDIE); - - // Add the call site information to the DIE. - DILocation DL(Scope->getInlinedAt()); - TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None, - TheCU.getOrCreateSourceID(DL.getFilename(), DL.getDirectory())); - TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber()); - - // Add name to the name table, we do this here because we're guaranteed - // to have concrete versions of our DW_TAG_inlined_subprogram nodes. - addSubprogramNames(InlinedSP, *ScopeDIE); - - return ScopeDIE; -} - -static std::unique_ptr<DIE> constructVariableDIE(DwarfCompileUnit &TheCU, - DbgVariable &DV, - const LexicalScope &Scope, - DIE *&ObjectPointer) { - auto Var = TheCU.constructVariableDIE(DV, Scope.isAbstractScope()); - if (DV.isObjectPointer()) - ObjectPointer = Var.get(); - return Var; -} - -DIE *DwarfDebug::createScopeChildrenDIE( - DwarfCompileUnit &TheCU, LexicalScope *Scope, - SmallVectorImpl<std::unique_ptr<DIE>> &Children) { - DIE *ObjectPointer = nullptr; - - // Collect arguments for current function. - if (LScopes.isCurrentFunctionScope(Scope)) { - for (DbgVariable *ArgDV : CurrentFnArguments) - if (ArgDV) - Children.push_back( - constructVariableDIE(TheCU, *ArgDV, *Scope, ObjectPointer)); - - // If this is a variadic function, add an unspecified parameter. - DISubprogram SP(Scope->getScopeNode()); - DIArray FnArgs = SP.getType().getTypeArray(); - if (FnArgs.getElement(FnArgs.getNumElements() - 1) - .isUnspecifiedParameter()) { - Children.push_back( - make_unique<DIE>(dwarf::DW_TAG_unspecified_parameters)); - } - } - - // Collect lexical scope children first. 
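// Editor's note, looking ahead to the forBothCUs helper introduced just
// below: with split DWARF some work must apply to both the full unit and
// its skeleton, and the helper replaces ad-hoc mirroring through
// getSkeleton(). Typical use from later in this patch:
//
//   forBothCUs(*CU, [&](DwarfCompileUnit &U) {
//     U.constructAbstractSubprogramScopeDIE(Scope);
//   });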
- for (DbgVariable *DV : ScopeVariables.lookup(Scope)) - Children.push_back(constructVariableDIE(TheCU, *DV, *Scope, ObjectPointer)); - - for (LexicalScope *LS : Scope->getChildren()) - if (std::unique_ptr<DIE> Nested = constructScopeDIE(TheCU, LS)) - Children.push_back(std::move(Nested)); - return ObjectPointer; +template <typename Func> void forBothCUs(DwarfCompileUnit &CU, Func F) { + F(CU); + if (auto *SkelCU = CU.getSkeleton()) + F(*SkelCU); } -void DwarfDebug::createAndAddScopeChildren(DwarfCompileUnit &TheCU, - LexicalScope *Scope, DIE &ScopeDIE) { - // We create children when the scope DIE is not null. - SmallVector<std::unique_ptr<DIE>, 8> Children; - if (DIE *ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children)) - TheCU.addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer); - - // Add children - for (auto &I : Children) - ScopeDIE.addChild(std::move(I)); -} - -void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { +void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); - DISubprogram SP(Scope->getScopeNode()); + const MDNode *SP = Scope->getScopeNode(); ProcessedSPNodes.insert(SP); - DIE *&AbsDef = AbstractSPDies[SP]; - if (AbsDef) - return; - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - DwarfCompileUnit &SPCU = *SPMap[SP]; - DIE *ContextDIE; - - // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with - // the important distinction that the DIDescriptor is not associated with the - // DIE (since the DIDescriptor will be associated with the concrete DIE, if - // any). It could be refactored to some common utility function. - if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { - ContextDIE = &SPCU.getUnitDie(); - SPCU.getOrCreateSubprogramDIE(SPDecl); - } else - ContextDIE = SPCU.getOrCreateContextDIE(resolve(SP.getContext())); - - // Passing null as the associated DIDescriptor because the abstract definition - // shouldn't be found by lookup. - AbsDef = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, - DIDescriptor()); - SPCU.applySubprogramAttributesToDefinition(SP, *AbsDef); - - SPCU.addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); - createAndAddScopeChildren(SPCU, Scope, *AbsDef); -} - -DIE &DwarfDebug::constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { - assert(Scope && Scope->getScopeNode()); - assert(!Scope->getInlinedAt()); - assert(!Scope->isAbstractScope()); - DISubprogram Sub(Scope->getScopeNode()); - - assert(Sub.isSubprogram()); - - ProcessedSPNodes.insert(Sub); - - DIE &ScopeDIE = updateSubprogramScopeDIE(TheCU, Sub); - - createAndAddScopeChildren(TheCU, Scope, ScopeDIE); - - return ScopeDIE; -} - -// Construct a DIE for this scope. -std::unique_ptr<DIE> DwarfDebug::constructScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { - if (!Scope || !Scope->getScopeNode()) - return nullptr; - - DIScope DS(Scope->getScopeNode()); - - assert((Scope->getInlinedAt() || !DS.isSubprogram()) && - "Only handle inlined subprograms here, use " - "constructSubprogramScopeDIE for non-inlined " - "subprograms"); - - SmallVector<std::unique_ptr<DIE>, 8> Children; - - // We try to create the scope DIE first, then the children DIEs. 
This will - // avoid creating un-used children then removing them later when we find out - // the scope DIE is null. - std::unique_ptr<DIE> ScopeDIE; - if (Scope->getParent() && DS.isSubprogram()) { - ScopeDIE = constructInlinedScopeDIE(TheCU, Scope); - if (!ScopeDIE) - return nullptr; - // We create children when the scope DIE is not null. - createScopeChildrenDIE(TheCU, Scope, Children); - } else { - // Early exit when we know the scope DIE is going to be null. - if (isLexicalScopeDIENull(Scope)) - return nullptr; - - // We create children here when we know the scope DIE is not going to be - // null and the children will be added to the scope DIE. - createScopeChildrenDIE(TheCU, Scope, Children); - - // There is no need to emit empty lexical block DIE. - std::pair<ImportedEntityMap::const_iterator, - ImportedEntityMap::const_iterator> Range = - std::equal_range(ScopesWithImportedEntities.begin(), - ScopesWithImportedEntities.end(), - std::pair<const MDNode *, const MDNode *>(DS, nullptr), - less_first()); - if (Children.empty() && Range.first == Range.second) - return nullptr; - ScopeDIE = constructLexicalScopeDIE(TheCU, Scope); - assert(ScopeDIE && "Scope DIE should not be null."); - for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second; - ++i) - constructImportedEntityDIE(TheCU, i->second, *ScopeDIE); - } - - // Add children - for (auto &I : Children) - ScopeDIE->addChild(std::move(I)); - - return ScopeDIE; + auto &CU = SPMap[SP]; + forBothCUs(*CU, [&](DwarfCompileUnit &CU) { + CU.constructAbstractSubprogramScopeDIE(Scope); + }); } void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { @@ -630,6 +371,8 @@ DwarfCompileUnit &DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { DwarfCompileUnit &NewCU = *OwnedUnit; DIE &Die = NewCU.getUnitDie(); InfoHolder.addUnit(std::move(OwnedUnit)); + if (useSplitDwarf()) + NewCU.setSkeleton(constructSkeletonCU(NewCU)); // LTO with assembly output shares a single line table amongst multiple CUs. 
// To avoid the compilation directory being ambiguous, let the line table @@ -666,14 +409,10 @@ DwarfCompileUnit &DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, dwarf::DW_FORM_data1, RVer); - if (!FirstCU) - FirstCU = &NewCU; - - if (useSplitDwarf()) { + if (useSplitDwarf()) NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection(), DwarfInfoDWOSectionSym); - NewCU.setSkeleton(constructSkeletonCU(NewCU)); - } else + else NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection(), DwarfInfoSectionSym); @@ -682,44 +421,12 @@ DwarfCompileUnit &DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { return NewCU; } -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const MDNode *N) { +void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, + const MDNode *N) { DIImportedEntity Module(N); assert(Module.Verify()); if (DIE *D = TheCU.getOrCreateContextDIE(Module.getContext())) - constructImportedEntityDIE(TheCU, Module, *D); -} - -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const MDNode *N, DIE &Context) { - DIImportedEntity Module(N); - assert(Module.Verify()); - return constructImportedEntityDIE(TheCU, Module, Context); -} - -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const DIImportedEntity &Module, - DIE &Context) { - assert(Module.Verify() && - "Use one of the MDNode * overloads to handle invalid metadata"); - DIE &IMDie = TheCU.createAndAddDIE(Module.getTag(), Context, Module); - DIE *EntityDie; - DIDescriptor Entity = resolve(Module.getEntity()); - if (Entity.isNameSpace()) - EntityDie = TheCU.getOrCreateNameSpace(DINameSpace(Entity)); - else if (Entity.isSubprogram()) - EntityDie = TheCU.getOrCreateSubprogramDIE(DISubprogram(Entity)); - else if (Entity.isType()) - EntityDie = TheCU.getOrCreateTypeDIE(DIType(Entity)); - else - EntityDie = TheCU.getDIE(Entity); - TheCU.addSourceLine(IMDie, Module.getLineNumber(), - Module.getContext().getFilename(), - Module.getContext().getDirectory()); - TheCU.addDIEEntry(IMDie, dwarf::DW_AT_import, *EntityDie); - StringRef Name = Module.getName(); - if (!Name.empty()) - TheCU.addString(IMDie, dwarf::DW_AT_name, Name); + D->addChild(TheCU.constructImportedEntityDIE(Module)); } // Emit all Dwarf sections that should come prior to the content. Create @@ -731,8 +438,8 @@ void DwarfDebug::beginModule() { const Module *M = MMI->getModule(); - // If module has named metadata anchors then use them, otherwise scan the - // module using debug info finder to collect debug info. 
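// Editor's note: debug-info emission is driven entirely by the llvm.dbg.cu
// named metadata; modules without it are skipped. The lookup pattern used
// below, as a minimal sketch:
//
//   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
//   if (!CU_Nodes)
//     return; // no debug info to emit for this module
//   for (MDNode *N : CU_Nodes->operands())
//     constructDwarfCompileUnit(DICompileUnit(N)); // roughly one unit each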
+ FunctionDIs = makeSubprogramMap(*M); + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -755,13 +462,18 @@ void DwarfDebug::beginModule() { ScopesWithImportedEntities.end(), less_first()); DIArray GVs = CUNode.getGlobalVariables(); for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) - CU.createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i))); + CU.getOrCreateGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i))); DIArray SPs = CUNode.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) SPMap.insert(std::make_pair(SPs.getElement(i), &CU)); DIArray EnumTypes = CUNode.getEnumTypes(); - for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i) - CU.getOrCreateTypeDIE(EnumTypes.getElement(i)); + for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i) { + DIType Ty(EnumTypes.getElement(i)); + // The enum types array by design contains pointers to + // MDNodes rather than DIRefs. Unique them here. + DIType UniqueTy(resolve(Ty.getRef())); + CU.getOrCreateTypeDIE(UniqueTy); + } DIArray RetainedTypes = CUNode.getRetainedTypes(); for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i) { DIType Ty(RetainedTypes.getElement(i)); @@ -773,7 +485,7 @@ void DwarfDebug::beginModule() { // Emit imported_modules last so that the relevant context is already // available. for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i) - constructImportedEntityDIE(CU, ImportedEntities.getElement(i)); + constructAndAddImportedEntityDIE(CU, ImportedEntities.getElement(i)); } // Tell MMI that we have debug info. @@ -786,9 +498,7 @@ void DwarfDebug::beginModule() { void DwarfDebug::finishVariableDefinitions() { for (const auto &Var : ConcreteVariables) { DIE *VariableDie = Var->getDIE(); - // FIXME: There shouldn't be any variables without DIEs. - if (!VariableDie) - continue; + assert(VariableDie); // FIXME: Consider the time-space tradeoff of just storing the unit pointer // in the ConcreteVariables list, rather than looking it up again here. // DIE::getUnit isn't simple - it walks parent pointers, etc. @@ -804,36 +514,10 @@ void DwarfDebug::finishVariableDefinitions() { } void DwarfDebug::finishSubprogramDefinitions() { - const Module *M = MMI->getModule(); - - NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); - for (MDNode *N : CU_Nodes->operands()) { - DICompileUnit TheCU(N); - // Construct subprogram DIE and add variables DIEs. - DwarfCompileUnit *SPCU = - static_cast<DwarfCompileUnit *>(CUMap.lookup(TheCU)); - DIArray Subprograms = TheCU.getSubprograms(); - for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) { - DISubprogram SP(Subprograms.getElement(i)); - // Perhaps the subprogram is in another CU (such as due to comdat - // folding, etc), in which case ignore it here. - if (SPMap[SP] != SPCU) - continue; - DIE *D = SPCU->getDIE(SP); - if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) { - if (D) - // If this subprogram has an abstract definition, reference that - SPCU->addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); - } else { - if (!D) - // Lazily construct the subprogram if we didn't see either concrete or - // inlined versions during codegen. 
- D = SPCU->getOrCreateSubprogramDIE(SP); - // And attach the attributes - SPCU->applySubprogramAttributesToDefinition(SP, *D); - } - } - } + for (const auto &P : SPMap) + forBothCUs(*P.second, [&](DwarfCompileUnit &CU) { + CU.finishSubprogramDefinition(DISubprogram(P.first)); + }); } @@ -853,26 +537,7 @@ void DwarfDebug::collectDeadVariables() { DISubprogram SP(Subprograms.getElement(i)); if (ProcessedSPNodes.count(SP) != 0) continue; - assert(SP.isSubprogram() && - "CU's subprogram list contains a non-subprogram"); - assert(SP.isDefinition() && - "CU's subprogram list contains a subprogram declaration"); - DIArray Variables = SP.getVariables(); - if (Variables.getNumElements() == 0) - continue; - - DIE *SPDIE = AbstractSPDies.lookup(SP); - if (!SPDIE) - SPDIE = SPCU->getDIE(SP); - assert(SPDIE); - for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) { - DIVariable DV(Variables.getElement(vi)); - assert(DV.isVariable()); - DbgVariable NewVar(DV, this); - auto VariableDie = SPCU->constructVariableDIE(NewVar); - SPCU->applyVariableAttributes(NewVar, *VariableDie); - SPDIE->addChild(std::move(VariableDie)); - } + SPCU->collectDeadVariables(SP); } } } @@ -888,66 +553,52 @@ void DwarfDebug::finalizeModuleInfo() { // Handle anything that needs to be done on a per-unit basis after // all other generation. - for (const auto &TheU : getUnits()) { + for (const auto &P : CUMap) { + auto &TheCU = *P.second; // Emit DW_AT_containing_type attribute to connect types with their // vtable holding type. - TheU->constructContainingTypeDIEs(); + TheCU.constructContainingTypeDIEs(); // Add CU specific attributes if we need to add any. - if (TheU->getUnitDie().getTag() == dwarf::DW_TAG_compile_unit) { - // If we're splitting the dwarf out now that we've got the entire - // CU then add the dwo id to it. - DwarfCompileUnit *SkCU = - static_cast<DwarfCompileUnit *>(TheU->getSkeleton()); - if (useSplitDwarf()) { - // Emit a unique identifier for this CU. - uint64_t ID = DIEHash(Asm).computeCUSignature(TheU->getUnitDie()); - TheU->addUInt(TheU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, - dwarf::DW_FORM_data8, ID); - SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, - dwarf::DW_FORM_data8, ID); - - // We don't keep track of which addresses are used in which CU so this - // is a bit pessimistic under LTO. - if (!AddrPool.isEmpty()) - addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(), - dwarf::DW_AT_GNU_addr_base, DwarfAddrSectionSym, - DwarfAddrSectionSym); - if (!TheU->getRangeLists().empty()) - addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(), - dwarf::DW_AT_GNU_ranges_base, - DwarfDebugRangeSectionSym, DwarfDebugRangeSectionSym); - } + // If we're splitting the dwarf out now that we've got the entire + // CU then add the dwo id to it. + auto *SkCU = TheCU.getSkeleton(); + if (useSplitDwarf()) { + // Emit a unique identifier for this CU. + uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie()); + TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id, + dwarf::DW_FORM_data8, ID); + SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, + dwarf::DW_FORM_data8, ID); + + // We don't keep track of which addresses are used in which CU so this + // is a bit pessimistic under LTO. 
+ if (!AddrPool.isEmpty())
+ SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_addr_base,
+ DwarfAddrSectionSym, DwarfAddrSectionSym);
+ if (!SkCU->getRangeLists().empty())
+ SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
+ DwarfDebugRangeSectionSym,
+ DwarfDebugRangeSectionSym);
+ }
- // If we have code split among multiple sections or non-contiguous
- // ranges of code then emit a DW_AT_ranges attribute on the unit that will
- // remain in the .o file, otherwise add a DW_AT_low_pc.
- // FIXME: We should use ranges allow reordering of code ala
- // .subsections_via_symbols in mach-o. This would mean turning on
- // ranges for all subprogram DIEs for mach-o.
- DwarfCompileUnit &U =
- SkCU ? *SkCU : static_cast<DwarfCompileUnit &>(*TheU);
- unsigned NumRanges = TheU->getRanges().size();
- if (NumRanges) {
- if (NumRanges > 1) {
- addSectionLabel(*Asm, U, U.getUnitDie(), dwarf::DW_AT_ranges,
- Asm->GetTempSymbol("cu_ranges", U.getUniqueID()),
- DwarfDebugRangeSectionSym);
-
- // A DW_AT_low_pc attribute may also be specified in combination with
- // DW_AT_ranges to specify the default base address for use in
- // location lists (see Section 2.6.2) and range lists (see Section
- // 2.17.3).
- U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
- 0);
- } else {
- RangeSpan &Range = TheU->getRanges().back();
- U.addLocalLabelAddress(U.getUnitDie(), dwarf::DW_AT_low_pc,
- Range.getStart());
- U.addLabelDelta(U.getUnitDie(), dwarf::DW_AT_high_pc, Range.getEnd(),
- Range.getStart());
- }
- }
+ // If we have code split among multiple sections or non-contiguous
+ // ranges of code then emit a DW_AT_ranges attribute on the unit that will
+ // remain in the .o file, otherwise add a DW_AT_low_pc.
+ // FIXME: We should use ranges to allow reordering of code ala
+ // .subsections_via_symbols in mach-o. This would mean turning on
+ // ranges for all subprogram DIEs for mach-o.
+ DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+ if (unsigned NumRanges = TheCU.getRanges().size()) {
+ if (NumRanges > 1)
+ // A DW_AT_low_pc attribute may also be specified in combination with
+ // DW_AT_ranges to specify the default base address for use in
+ // location lists (see Section 2.6.2) and range lists (see Section
+ // 2.17.3).
+ U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+ else
+ TheCU.setBaseAddress(TheCU.getRanges().front().getStart());
+ U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
+ }
}

@@ -1009,7 +660,10 @@ void DwarfDebug::endModule() {
assert(CurFn == nullptr);
assert(CurMI == nullptr);

- if (!FirstCU)
+ // If we aren't actually generating debug info (check beginModule -
+ // conditionalized on !DisableDebugInfoPrinting and the presence of the
+ // llvm.dbg.cu metadata node)
+ if (!DwarfInfoSectionSym)
return;

// End any existing sections.
@@ -1063,9 +717,6 @@
// clean up.
SPMap.clear();
AbstractVariables.clear();
-
- // Reset these for the next Module if we have one.
- FirstCU = nullptr;
}

// Find abstract variable, if any, associated with Var.
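// [Editorial aside] A standalone sketch of the single-vs-multiple-range
// decision in finalizeModuleInfo above, with hypothetical plain types in
// place of DwarfCompileUnit and RangeSpan. One contiguous range: record its
// start as the CU base address and later emit DW_AT_low_pc/DW_AT_high_pc.
// Several ranges: emit DW_AT_ranges plus a zero DW_AT_low_pc so range and
// location lists still have a well-defined base.
#include <cstdint>
#include <vector>

struct AddrRange { uint64_t Start, End; };

enum class CUAddrForm { LowHighPC, RangesWithZeroBase };

static CUAddrForm chooseAddrForm(const std::vector<AddrRange> &Ranges,
                                 uint64_t &Base) {
  if (Ranges.size() == 1) {
    Base = Ranges.front().Start; // default base for loc/range lists
    return CUAddrForm::LowHighPC;
  }
  Base = 0; // a zero DW_AT_low_pc accompanies DW_AT_ranges
  return CUAddrForm::RangesWithZeroBase;
}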
@@ -1091,8 +742,8 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(const DIVariable &DV) { void DwarfDebug::createAbstractVariable(const DIVariable &Var, LexicalScope *Scope) { - auto AbsDbgVariable = make_unique<DbgVariable>(Var, this); - addScopeVariable(Scope, AbsDbgVariable.get()); + auto AbsDbgVariable = make_unique<DbgVariable>(Var, DIExpression(), this); + InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); AbstractVariables[Var] = std::move(AbsDbgVariable); } @@ -1116,55 +767,35 @@ DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(const DIVariable &DV, createAbstractVariable(Cleansed, Scope); } -// If Var is a current function argument then add it to CurrentFnArguments list. -bool DwarfDebug::addCurrentFnArgument(DbgVariable *Var, LexicalScope *Scope) { - if (!LScopes.isCurrentFunctionScope(Scope)) - return false; - DIVariable DV = Var->getVariable(); - if (DV.getTag() != dwarf::DW_TAG_arg_variable) - return false; - unsigned ArgNo = DV.getArgNumber(); - if (ArgNo == 0) - return false; - - size_t Size = CurrentFnArguments.size(); - if (Size == 0) - CurrentFnArguments.resize(CurFn->getFunction()->arg_size()); - // llvm::Function argument size is not good indicator of how many - // arguments does the function have at source level. - if (ArgNo > Size) - CurrentFnArguments.resize(ArgNo * 2); - CurrentFnArguments[ArgNo - 1] = Var; - return true; -} - // Collect variable information from side table maintained by MMI. void DwarfDebug::collectVariableInfoFromMMITable( - SmallPtrSet<const MDNode *, 16> &Processed) { + SmallPtrSetImpl<const MDNode *> &Processed) { for (const auto &VI : MMI->getVariableDbgInfo()) { if (!VI.Var) continue; Processed.insert(VI.Var); - DIVariable DV(VI.Var); LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc); // If variable scope is not found then skip this variable. if (!Scope) continue; + DIVariable DV(VI.Var); + DIExpression Expr(VI.Expr); ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); - ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this)); + ConcreteVariables.push_back(make_unique<DbgVariable>(DV, Expr, this)); DbgVariable *RegVar = ConcreteVariables.back().get(); RegVar->setFrameIndex(VI.Slot); - addScopeVariable(Scope, RegVar); + InfoHolder.addScopeVariable(Scope, RegVar); } } // Get .debug_loc entry for the instruction range starting at MI. 
static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
+ const MDNode *Expr = MI->getDebugExpression();
const MDNode *Var = MI->getDebugVariable();
- assert(MI->getNumOperands() == 3);
+ assert(MI->getNumOperands() == 4);
if (MI->getOperand(0).isReg()) {
MachineLocation MLoc;
// If the second operand is an immediate, this is a
@@ -1173,24 +804,138 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
MLoc.set(MI->getOperand(0).getReg());
else
MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
- return DebugLocEntry::Value(Var, MLoc);
+ return DebugLocEntry::Value(Var, Expr, MLoc);
}
if (MI->getOperand(0).isImm())
- return DebugLocEntry::Value(Var, MI->getOperand(0).getImm());
+ return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getImm());
if (MI->getOperand(0).isFPImm())
- return DebugLocEntry::Value(Var, MI->getOperand(0).getFPImm());
+ return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getFPImm());
if (MI->getOperand(0).isCImm())
- return DebugLocEntry::Value(Var, MI->getOperand(0).getCImm());
+ return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getCImm());

- llvm_unreachable("Unexpected 3 operand DBG_VALUE instruction!");
+ llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
}

-// Find variables for each lexical scope.
+/// Determine whether two variable pieces overlap.
+static bool piecesOverlap(DIExpression P1, DIExpression P2) {
+ if (!P1.isVariablePiece() || !P2.isVariablePiece())
+ return true;
+ unsigned l1 = P1.getPieceOffset();
+ unsigned l2 = P2.getPieceOffset();
+ unsigned r1 = l1 + P1.getPieceSize();
+ unsigned r2 = l2 + P2.getPieceSize();
+ // True where [l1,r1[ and [l2,r2[ overlap.
+ return (l1 < r2) && (l2 < r1);
+}
+
+/// Build the location list for all DBG_VALUEs in the function that
+/// describe the same variable. If the ranges of several independent
+/// pieces of the same variable overlap partially, split them up and
+/// combine the ranges. The resulting DebugLocEntries will have
+/// strictly monotonically increasing begin addresses and will never
+/// overlap.
+//
+// Input:
+//
+// Ranges History [var, loc, piece ofs size]
+// 0 | [x, (reg0, piece 0, 32)]
+// 1 | | [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
+// 2 | | ...
+// 3 | [clobber reg0]
+// 4 [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of x.
+//
+// Output:
+//
+// [0-1] [x, (reg0, piece 0, 32)]
+// [1-3] [x, (reg0, piece 0, 32), (reg1, piece 32, 32)]
+// [3-4] [x, (reg1, piece 32, 32)]
+// [4- ] [x, (mem, piece 0, 64)]
void
-DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
- LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
- DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
+DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::InstrRanges &Ranges) {
+ SmallVector<DebugLocEntry::Value, 4> OpenRanges;
+
+ for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
+ const MachineInstr *Begin = I->first;
+ const MachineInstr *End = I->second;
+ assert(Begin->isDebugValue() && "Invalid History entry");
+
+ // Check if a variable is inaccessible in this range.
+ if (Begin->getNumOperands() > 1 &&
+ Begin->getOperand(0).isReg() && !Begin->getOperand(0).getReg()) {
+ OpenRanges.clear();
+ continue;
+ }
+
+ // If this piece overlaps with any open ranges, truncate them.
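// [Editorial aside] The half-open interval test in piecesOverlap above,
// restated as a self-contained sketch (hypothetical struct; offsets and
// sizes in the same abstract units): two half-open ranges [l1,r1[ and
// [l2,r2[ intersect exactly when l1 < r2 and l2 < r1, so pieces that merely
// abut do not count as overlapping.
#include <cassert>

struct PieceSketch { unsigned Offset, Size; };

static bool overlapSketch(PieceSketch A, PieceSketch B) {
  unsigned R1 = A.Offset + A.Size, R2 = B.Offset + B.Size;
  return A.Offset < R2 && B.Offset < R1; // half-open intervals intersect
}

static void overlapSketchTests() {
  assert(overlapSketch({0, 32}, {16, 32}));  // partial overlap
  assert(!overlapSketch({0, 32}, {32, 32})); // adjacent pieces are disjoint
}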
+ DIExpression DIExpr = Begin->getDebugExpression(); + auto Last = std::remove_if(OpenRanges.begin(), OpenRanges.end(), + [&](DebugLocEntry::Value R) { + return piecesOverlap(DIExpr, R.getExpression()); + }); + OpenRanges.erase(Last, OpenRanges.end()); + const MCSymbol *StartLabel = getLabelBeforeInsn(Begin); + assert(StartLabel && "Forgot label before DBG_VALUE starting a range!"); + + const MCSymbol *EndLabel; + if (End != nullptr) + EndLabel = getLabelAfterInsn(End); + else if (std::next(I) == Ranges.end()) + EndLabel = FunctionEndSym; + else + EndLabel = getLabelBeforeInsn(std::next(I)->first); + assert(EndLabel && "Forgot label after instruction ending a range!"); + + DEBUG(dbgs() << "DotDebugLoc: " << *Begin << "\n"); + + auto Value = getDebugLocValue(Begin); + DebugLocEntry Loc(StartLabel, EndLabel, Value); + bool couldMerge = false; + + // If this is a piece, it may belong to the current DebugLocEntry. + if (DIExpr.isVariablePiece()) { + // Add this value to the list of open ranges. + OpenRanges.push_back(Value); + + // Attempt to add the piece to the last entry. + if (!DebugLoc.empty()) + if (DebugLoc.back().MergeValues(Loc)) + couldMerge = true; + } + + if (!couldMerge) { + // Need to add a new DebugLocEntry. Add all values from still + // valid non-overlapping pieces. + if (OpenRanges.size()) + Loc.addValues(OpenRanges); + + DebugLoc.push_back(std::move(Loc)); + } + + // Attempt to coalesce the ranges of two otherwise identical + // DebugLocEntries. + auto CurEntry = DebugLoc.rbegin(); + auto PrevEntry = std::next(CurEntry); + if (PrevEntry != DebugLoc.rend() && PrevEntry->MergeRanges(*CurEntry)) + DebugLoc.pop_back(); + + DEBUG({ + dbgs() << CurEntry->getValues().size() << " Values:\n"; + for (auto Value : CurEntry->getValues()) { + Value.getVariable()->dump(); + Value.getExpression()->dump(); + } + dbgs() << "-----\n"; + }); + } +} + + +// Find variables for each lexical scope. +void +DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, DISubprogram SP, + SmallPtrSetImpl<const MDNode *> &Processed) { // Grab the variable info that was squirreled away in the MMI side-table. collectVariableInfoFromMMITable(Processed); @@ -1205,10 +950,7 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) { continue; LexicalScope *Scope = nullptr; - if (DV.getTag() == dwarf::DW_TAG_arg_variable && - DISubprogram(DV.getContext()).describes(CurFn->getFunction())) - Scope = LScopes.getCurrentFunctionScope(); - else if (MDNode *IA = DV.getInlinedAt()) { + if (MDNode *IA = DV.getInlinedAt()) { DebugLoc DL = DebugLoc::getFromDILocation(IA); Scope = LScopes.findInlinedScope(DebugLoc::get( DL.getLine(), DL.getCol(), DV.getContext(), IA)); @@ -1224,7 +966,7 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) { ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); ConcreteVariables.push_back(make_unique<DbgVariable>(MInsn, this)); DbgVariable *RegVar = ConcreteVariables.back().get(); - addScopeVariable(Scope, RegVar); + InfoHolder.addScopeVariable(Scope, RegVar); // Check if the first DBG_VALUE is valid for the rest of the function. 
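// [Editorial aside] The MergeValues/MergeRanges steps above perform a
// standard location-list compaction. A standalone sketch (hypothetical
// EntrySketch type, not LLVM's DebugLocEntry): when a new entry carries the
// same set of values as its predecessor and their ranges abut, extend the
// predecessor instead of appending a new entry.
#include <utility>
#include <vector>

struct EntrySketch {
  int Begin, End;          // label indices standing in for MCSymbols
  std::vector<int> Values; // stand-ins for DebugLocEntry::Value
};

static void pushCoalesced(std::vector<EntrySketch> &List, EntrySketch E) {
  if (!List.empty() && List.back().End == E.Begin &&
      List.back().Values == E.Values)
    List.back().End = E.End; // extend the previous entry, do not append
  else
    List.push_back(std::move(E));
}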
if (Ranges.size() == 1 && Ranges.front().second == nullptr) @@ -1235,53 +977,26 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) { DotDebugLocEntries.resize(DotDebugLocEntries.size() + 1); DebugLocList &LocList = DotDebugLocEntries.back(); + LocList.CU = &TheCU; LocList.Label = Asm->GetTempSymbol("debug_loc", DotDebugLocEntries.size() - 1); - SmallVector<DebugLocEntry, 4> &DebugLoc = LocList.List; - for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { - const MachineInstr *Begin = I->first; - const MachineInstr *End = I->second; - assert(Begin->isDebugValue() && "Invalid History entry"); - - // Check if a variable is unaccessible in this range. - if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg() && - !Begin->getOperand(0).getReg()) - continue; - DEBUG(dbgs() << "DotDebugLoc Pair:\n" << "\t" << *Begin); - if (End != nullptr) - DEBUG(dbgs() << "\t" << *End); - else - DEBUG(dbgs() << "\tNULL\n"); - const MCSymbol *StartLabel = getLabelBeforeInsn(Begin); - assert(StartLabel && "Forgot label before DBG_VALUE starting a range!"); - - const MCSymbol *EndLabel; - if (End != nullptr) - EndLabel = getLabelAfterInsn(End); - else if (std::next(I) == Ranges.end()) - EndLabel = FunctionEndSym; - else - EndLabel = getLabelBeforeInsn(std::next(I)->first); - assert(EndLabel && "Forgot label after instruction ending a range!"); - - DebugLocEntry Loc(StartLabel, EndLabel, getDebugLocValue(Begin), TheCU); - if (DebugLoc.empty() || !DebugLoc.back().Merge(Loc)) - DebugLoc.push_back(std::move(Loc)); - } + // Build the location list for this variable. + buildLocationList(LocList.List, Ranges); } // Collect info for variables that were optimized out. - DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables(); + DIArray Variables = SP.getVariables(); for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { DIVariable DV(Variables.getElement(i)); assert(DV.isVariable()); - if (!Processed.insert(DV)) + if (!Processed.insert(DV).second) continue; if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) { ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode()); - ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this)); - addScopeVariable(Scope, ConcreteVariables.back().get()); + DIExpression NoExpr; + ConcreteVariables.push_back(make_unique<DbgVariable>(DV, NoExpr, this)); + InfoHolder.addScopeVariable(Scope, ConcreteVariables.back().get()); } } } @@ -1311,8 +1026,10 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { if (DL == PrologEndLoc) { Flags |= DWARF2_FLAG_PROLOGUE_END; PrologEndLoc = DebugLoc(); + Flags |= DWARF2_FLAG_IS_STMT; } - if (PrologEndLoc.isUnknown()) + if (DL.getLine() != + Asm->OutStreamer.getContext().getCurrentDwarfLoc().getLine()) Flags |= DWARF2_FLAG_IS_STMT; if (!DL.isUnknown()) { @@ -1402,8 +1119,12 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { for (const auto &MBB : *MF) for (const auto &MI : MBB) if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && - !MI.getDebugLoc().isUnknown()) + !MI.getDebugLoc().isUnknown()) { + // Did the target forget to set the FrameSetup flag for CFI insns? 
+ assert(!MI.isCFIInstruction() && + "First non-frame-setup instruction is a CFI instruction."); return MI.getDebugLoc(); + } return DebugLoc(); } @@ -1416,6 +1137,10 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { if (!MMI->hasDebugInfo()) return; + auto DI = FunctionDIs.find(MF->getFunction()); + if (DI == FunctionDIs.end()) + return; + // Grab the lexical scopes for the function, if we don't have any of those // then we're not going to be able to do anything. LScopes.initialize(*MF); @@ -1431,6 +1156,14 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // belongs to so that we add to the correct per-cu line table in the // non-asm case. LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); + // FnScope->getScopeNode() and DI->second should represent the same function, + // though they may not be the same MDNode due to inline functions merged in + // LTO where the debug info metadata still differs (either due to distinct + // written differences - two versions of a linkonce_odr function + // written/copied into two separate files, or some sub-optimal metadata that + // isn't structurally identical (see: file path/name info from clang, which + // includes the directory of the cpp file being built, even when the file name + // is absolute (such as an <> lookup header))) DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); assert(TheCU && "Unable to find compile unit!"); if (Asm->OutStreamer.hasRawTextSupport()) @@ -1445,7 +1178,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { Asm->OutStreamer.EmitLabel(FunctionBeginSym); // Calculate history for local variables. - calculateDbgValueHistory(MF, Asm->TM.getRegisterInfo(), DbgValues); + calculateDbgValueHistory(MF, Asm->TM.getSubtargetImpl()->getRegisterInfo(), + DbgValues); // Request labels for the full history. for (const auto &I : DbgValues) { @@ -1455,10 +1189,24 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // The first mention of a function argument gets the FunctionBeginSym // label, so arguments are visible when breaking at function entry. - DIVariable DV(I.first); - if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable && - getDISubprogram(DV.getContext()).describes(MF->getFunction())) + DIVariable DIVar(Ranges.front().first->getDebugVariable()); + if (DIVar.isVariable() && DIVar.getTag() == dwarf::DW_TAG_arg_variable && + getDISubprogram(DIVar.getContext()).describes(MF->getFunction())) { LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym; + if (Ranges.front().first->getDebugExpression().isVariablePiece()) { + // Mark all non-overlapping initial pieces. + for (auto I = Ranges.begin(); I != Ranges.end(); ++I) { + DIExpression Piece = I->first->getDebugExpression(); + if (std::all_of(Ranges.begin(), I, + [&](DbgValueHistoryMap::InstrRange Pred) { + return !piecesOverlap(Piece, Pred.first->getDebugExpression()); + })) + LabelsBeforeInsn[I->first] = FunctionBeginSym; + else + break; + } + } + } for (const auto &Range : Ranges) { requestLabelBeforeInsn(Range.first); @@ -1484,55 +1232,16 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { } } -void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) { - if (addCurrentFnArgument(Var, LS)) - return; - SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS]; - DIVariable DV = Var->getVariable(); - // Variables with positive arg numbers are parameters. 
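// [Editorial aside] The std::all_of scan in beginFunction above gives the
// FunctionBeginSym label to a leading run of argument pieces, stopping at
// the first piece that overlaps an earlier one. The same prefix computation
// as a standalone sketch (hypothetical types; returns the length of the
// mutually disjoint prefix):
#include <cstddef>
#include <vector>

struct ArgPiece { unsigned Offset, Size; };

static bool argPiecesDisjoint(ArgPiece A, ArgPiece B) {
  return A.Offset + A.Size <= B.Offset || B.Offset + B.Size <= A.Offset;
}

static size_t countDisjointPrefix(const std::vector<ArgPiece> &Pieces) {
  for (size_t I = 0; I != Pieces.size(); ++I)
    for (size_t J = 0; J != I; ++J)
      if (!argPiecesDisjoint(Pieces[I], Pieces[J]))
        return I; // piece I overlaps an earlier one; the prefix ends here
  return Pieces.size();
}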
- if (unsigned ArgNum = DV.getArgNumber()) { - // Keep all parameters in order at the start of the variable list to ensure - // function types are correct (no out-of-order parameters) - // - // This could be improved by only doing it for optimized builds (unoptimized - // builds have the right order to begin with), searching from the back (this - // would catch the unoptimized case quickly), or doing a binary search - // rather than linear search. - SmallVectorImpl<DbgVariable *>::iterator I = Vars.begin(); - while (I != Vars.end()) { - unsigned CurNum = (*I)->getVariable().getArgNumber(); - // A local (non-parameter) variable has been found, insert immediately - // before it. - if (CurNum == 0) - break; - // A later indexed parameter has been found, insert immediately before it. - if (CurNum > ArgNum) - break; - ++I; - } - Vars.insert(I, Var); - return; - } - - Vars.push_back(Var); -} - // Gather and emit post-function debug information. void DwarfDebug::endFunction(const MachineFunction *MF) { - // Every beginFunction(MF) call should be followed by an endFunction(MF) call, - // though the beginFunction may not be called at all. - // We should handle both cases. - if (!CurFn) - CurFn = MF; - else - assert(CurFn == MF); - assert(CurFn != nullptr); + assert(CurFn == MF && + "endFunction should be called with the same function as beginFunction"); - if (!MMI->hasDebugInfo() || LScopes.empty()) { + if (!MMI->hasDebugInfo() || LScopes.empty() || + !FunctionDIs.count(MF->getFunction())) { // If we don't have a lexical scope for this function then there will // be a hole in the range information. Keep note of this by setting the // previously used section to nullptr. - PrevSection = nullptr; PrevCU = nullptr; CurFn = nullptr; return; @@ -1546,12 +1255,35 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Set DwarfDwarfCompileUnitID in MCContext to default value. Asm->OutStreamer.getContext().setDwarfCompileUnitID(0); + LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); + DISubprogram SP(FnScope->getScopeNode()); + DwarfCompileUnit &TheCU = *SPMap.lookup(SP); + SmallPtrSet<const MDNode *, 16> ProcessedVars; - collectVariableInfo(ProcessedVars); + collectVariableInfo(TheCU, SP, ProcessedVars); - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - DwarfCompileUnit &TheCU = *SPMap.lookup(FnScope->getScopeNode()); + // Add the range of this function to the list of ranges for the CU. + TheCU.addRange(RangeSpan(FunctionBeginSym, FunctionEndSym)); + + // Under -gmlt, skip building the subprogram if there are no inlined + // subroutines inside it. + if (TheCU.getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly && + LScopes.getAbstractScopesList().empty() && !IsDarwin) { + assert(InfoHolder.getScopeVariables().empty()); + assert(DbgValues.empty()); + // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed + // by a -gmlt CU. Add a test and remove this assertion. + assert(AbstractVariables.empty()); + LabelsBeforeInsn.clear(); + LabelsAfterInsn.clear(); + PrevLabel = nullptr; + CurFn = nullptr; + return; + } +#ifndef NDEBUG + size_t NumAbstractScopes = LScopes.getAbstractScopesList().size(); +#endif // Construct abstract scopes. 
for (LexicalScope *AScope : LScopes.getAbstractScopesList()) { DISubprogram SP(AScope->getScopeNode()); @@ -1561,29 +1293,25 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { DIVariable DV(Variables.getElement(i)); assert(DV && DV.isVariable()); - if (!ProcessedVars.insert(DV)) + if (!ProcessedVars.insert(DV).second) continue; ensureAbstractVariableIsCreated(DV, DV.getContext()); + assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes + && "ensureAbstractVariableIsCreated inserted abstract scopes"); } - constructAbstractSubprogramScopeDIE(TheCU, AScope); + constructAbstractSubprogramScopeDIE(AScope); } - DIE &CurFnDIE = constructSubprogramScopeDIE(TheCU, FnScope); - if (!CurFn->getTarget().Options.DisableFramePointerElim(*CurFn)) - TheCU.addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr); - - // Add the range of this function to the list of ranges for the CU. - RangeSpan Span(FunctionBeginSym, FunctionEndSym); - TheCU.addRange(std::move(Span)); - PrevSection = Asm->getCurrentSection(); - PrevCU = &TheCU; + TheCU.constructSubprogramScopeDIE(FnScope); + if (auto *SkelCU = TheCU.getSkeleton()) + if (!LScopes.getAbstractScopesList().empty()) + SkelCU->constructSubprogramScopeDIE(FnScope); // Clear debug info // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the // DbgVariables except those that are also in AbstractVariables (since they // can be used cross-function) - ScopeVariables.clear(); - CurrentFnArguments.clear(); + InfoHolder.getScopeVariables().clear(); DbgValues.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); @@ -1603,8 +1331,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S, assert(Scope.isScope()); Fn = Scope.getFilename(); Dir = Scope.getDirectory(); - if (Scope.isLexicalBlock()) - Discriminator = DILexicalBlock(S).getDiscriminator(); + if (Scope.isLexicalBlockFile()) + Discriminator = DILexicalBlockFile(S).getDiscriminator(); unsigned CUID = Asm->OutStreamer.getContext().getDwarfCompileUnitID(); Src = static_cast<DwarfCompileUnit &>(*InfoHolder.getUnits()[CUID]) @@ -1625,9 +1353,12 @@ void DwarfDebug::emitSectionLabels() { // Dwarf sections base addresses. DwarfInfoSectionSym = emitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info"); - if (useSplitDwarf()) + if (useSplitDwarf()) { DwarfInfoDWOSectionSym = emitSectionSym(Asm, TLOF.getDwarfInfoDWOSection(), "section_info_dwo"); + DwarfTypesDWOSectionSym = + emitSectionSym(Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo"); + } DwarfAbbrevSectionSym = emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev"); if (useSplitDwarf()) @@ -1711,7 +1442,7 @@ void DwarfDebug::emitDIE(DIE &Die) { void DwarfDebug::emitDebugInfo() { DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; - Holder.emitUnits(this, DwarfAbbrevSectionSym); + Holder.emitUnits(DwarfAbbrevSectionSym); } // Emit the abbreviation section. @@ -1745,54 +1476,41 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) { Asm->EmitInt8(1); } -// Emit visible names into a hashed accelerator table section. 
-void DwarfDebug::emitAccelNames() { - AccelNames.FinalizeTable(Asm, "Names"); - Asm->OutStreamer.SwitchSection( - Asm->getObjFileLowering().getDwarfAccelNamesSection()); - MCSymbol *SectionBegin = Asm->GetTempSymbol("names_begin"); +void DwarfDebug::emitAccel(DwarfAccelTable &Accel, const MCSection *Section, + StringRef TableName, StringRef SymName) { + Accel.FinalizeTable(Asm, TableName); + Asm->OutStreamer.SwitchSection(Section); + auto *SectionBegin = Asm->GetTempSymbol(SymName); Asm->OutStreamer.EmitLabel(SectionBegin); // Emit the full data. - AccelNames.Emit(Asm, SectionBegin, &InfoHolder); + Accel.Emit(Asm, SectionBegin, this, DwarfStrSectionSym); +} + +// Emit visible names into a hashed accelerator table section. +void DwarfDebug::emitAccelNames() { + emitAccel(AccelNames, Asm->getObjFileLowering().getDwarfAccelNamesSection(), + "Names", "names_begin"); } // Emit objective C classes and categories into a hashed accelerator table // section. void DwarfDebug::emitAccelObjC() { - AccelObjC.FinalizeTable(Asm, "ObjC"); - Asm->OutStreamer.SwitchSection( - Asm->getObjFileLowering().getDwarfAccelObjCSection()); - MCSymbol *SectionBegin = Asm->GetTempSymbol("objc_begin"); - Asm->OutStreamer.EmitLabel(SectionBegin); - - // Emit the full data. - AccelObjC.Emit(Asm, SectionBegin, &InfoHolder); + emitAccel(AccelObjC, Asm->getObjFileLowering().getDwarfAccelObjCSection(), + "ObjC", "objc_begin"); } // Emit namespace dies into a hashed accelerator table. void DwarfDebug::emitAccelNamespaces() { - AccelNamespace.FinalizeTable(Asm, "namespac"); - Asm->OutStreamer.SwitchSection( - Asm->getObjFileLowering().getDwarfAccelNamespaceSection()); - MCSymbol *SectionBegin = Asm->GetTempSymbol("namespac_begin"); - Asm->OutStreamer.EmitLabel(SectionBegin); - - // Emit the full data. - AccelNamespace.Emit(Asm, SectionBegin, &InfoHolder); + emitAccel(AccelNamespace, + Asm->getObjFileLowering().getDwarfAccelNamespaceSection(), + "namespac", "namespac_begin"); } // Emit type dies into a hashed accelerator table. void DwarfDebug::emitAccelTypes() { - - AccelTypes.FinalizeTable(Asm, "types"); - Asm->OutStreamer.SwitchSection( - Asm->getObjFileLowering().getDwarfAccelTypesSection()); - MCSymbol *SectionBegin = Asm->GetTempSymbol("types_begin"); - Asm->OutStreamer.EmitLabel(SectionBegin); - - // Emit the full data. - AccelTypes.Emit(Asm, SectionBegin, &InfoHolder); + emitAccel(AccelTypes, Asm->getObjFileLowering().getDwarfAccelTypesSection(), + "types", "types_begin"); } // Public name handling. @@ -1859,12 +1577,13 @@ void DwarfDebug::emitDebugPubNames(bool GnuStyle) { GnuStyle ? 
Asm->getObjFileLowering().getDwarfGnuPubNamesSection() : Asm->getObjFileLowering().getDwarfPubNamesSection(); - emitDebugPubSection(GnuStyle, PSec, "Names", &DwarfUnit::getGlobalNames); + emitDebugPubSection(GnuStyle, PSec, "Names", + &DwarfCompileUnit::getGlobalNames); } void DwarfDebug::emitDebugPubSection( bool GnuStyle, const MCSection *PSec, StringRef Name, - const StringMap<const DIE *> &(DwarfUnit::*Accessor)() const) { + const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const) { for (const auto &NU : CUMap) { DwarfCompileUnit *TheU = NU.second; @@ -1873,7 +1592,7 @@ void DwarfDebug::emitDebugPubSection( if (Globals.empty()) continue; - if (auto Skeleton = static_cast<DwarfCompileUnit *>(TheU->getSkeleton())) + if (auto *Skeleton = TheU->getSkeleton()) TheU = Skeleton; unsigned ID = TheU->getUniqueID(); @@ -1895,7 +1614,7 @@ void DwarfDebug::emitDebugPubSection( Asm->EmitSectionOffset(TheU->getLabelBegin(), TheU->getSectionSym()); Asm->OutStreamer.AddComment("Compilation Unit Length"); - Asm->EmitLabelDifference(TheU->getLabelEnd(), TheU->getLabelBegin(), 4); + Asm->EmitInt32(TheU->getLength()); // Emit the pubnames for this compilation unit. for (const auto &GI : Globals) { @@ -1928,7 +1647,8 @@ void DwarfDebug::emitDebugPubTypes(bool GnuStyle) { GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection() : Asm->getObjFileLowering().getDwarfPubTypesSection(); - emitDebugPubSection(GnuStyle, PSec, "Types", &DwarfUnit::getGlobalTypes); + emitDebugPubSection(GnuStyle, PSec, "Types", + &DwarfCompileUnit::getGlobalTypes); } // Emit visible names into a debug str section. @@ -1937,61 +1657,86 @@ void DwarfDebug::emitDebugStr() { Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection()); } +/// Emits an optimal (=sorted) sequence of DW_OP_pieces. +void DwarfDebug::emitLocPieces(ByteStreamer &Streamer, + const DITypeIdentifierMap &Map, + ArrayRef<DebugLocEntry::Value> Values) { + assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value P) { + return P.isVariablePiece(); + }) && "all values are expected to be pieces"); + assert(std::is_sorted(Values.begin(), Values.end()) && + "pieces are expected to be sorted"); + + unsigned Offset = 0; + for (auto Piece : Values) { + const unsigned SizeOfByte = 8; + DIExpression Expr = Piece.getExpression(); + unsigned PieceOffset = Expr.getPieceOffset(); + unsigned PieceSize = Expr.getPieceSize(); + assert(Offset <= PieceOffset && "overlapping or duplicate pieces"); + if (Offset < PieceOffset) { + // The DWARF spec seriously mandates pieces with no locations for gaps. + Asm->EmitDwarfOpPiece(Streamer, (PieceOffset-Offset)*SizeOfByte); + Offset += PieceOffset-Offset; + } + Offset += PieceSize; + +#ifndef NDEBUG + DIVariable Var = Piece.getVariable(); + assert(!Var.isIndirect() && "indirect address for piece"); + unsigned VarSize = Var.getSizeInBits(Map); + assert(PieceSize+PieceOffset <= VarSize/SizeOfByte + && "piece is larger than or outside of variable"); + assert(PieceSize*SizeOfByte != VarSize + && "piece covers entire variable"); +#endif + + emitDebugLocValue(Streamer, Piece, PieceOffset*SizeOfByte); + } +} + + void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry) { - assert(Entry.getValues().size() == 1 && - "multi-value entries are not supported yet."); const DebugLocEntry::Value Value = Entry.getValues()[0]; - DIVariable DV(Value.getVariable()); + if (Value.isVariablePiece()) + // Emit all pieces that belong to the same variable and range. 
+ return emitLocPieces(Streamer, TypeIdentifierMap, Entry.getValues()); + + assert(Entry.getValues().size() == 1 && "only pieces may have >1 value"); + emitDebugLocValue(Streamer, Value); +} + +void DwarfDebug::emitDebugLocValue(ByteStreamer &Streamer, + const DebugLocEntry::Value &Value, + unsigned PieceOffsetInBits) { + DIVariable DV = Value.getVariable(); + DebugLocDwarfExpression DwarfExpr(*Asm, Streamer); + + // Regular entry. if (Value.isInt()) { DIBasicType BTy(resolve(DV.getType())); if (BTy.Verify() && (BTy.getEncoding() == dwarf::DW_ATE_signed || - BTy.getEncoding() == dwarf::DW_ATE_signed_char)) { - Streamer.EmitInt8(dwarf::DW_OP_consts, "DW_OP_consts"); - Streamer.EmitSLEB128(Value.getInt()); - } else { - Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu"); - Streamer.EmitULEB128(Value.getInt()); - } + BTy.getEncoding() == dwarf::DW_ATE_signed_char)) + DwarfExpr.AddSignedConstant(Value.getInt()); + else + DwarfExpr.AddUnsignedConstant(Value.getInt()); } else if (Value.isLocation()) { MachineLocation Loc = Value.getLoc(); - if (!DV.hasComplexAddress()) + DIExpression Expr = Value.getExpression(); + if (!Expr || (Expr.getNumElements() == 0)) // Regular entry. Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect()); else { // Complex address entry. - unsigned N = DV.getNumAddrElements(); - unsigned i = 0; - if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) { - if (Loc.getOffset()) { - i = 2; - Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect()); - Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref"); - Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst"); - Streamer.EmitSLEB128(DV.getAddrElement(1)); - } else { - // If first address element is OpPlus then emit - // DW_OP_breg + Offset instead of DW_OP_reg + Offset. - MachineLocation TLoc(Loc.getReg(), DV.getAddrElement(1)); - Asm->EmitDwarfRegOp(Streamer, TLoc, DV.isIndirect()); - i = 2; - } - } else { - Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect()); - } - - // Emit remaining complex address elements. - for (; i < N; ++i) { - uint64_t Element = DV.getAddrElement(i); - if (Element == DIBuilder::OpPlus) { - Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst"); - Streamer.EmitULEB128(DV.getAddrElement(++i)); - } else if (Element == DIBuilder::OpDeref) { - if (!Loc.isReg()) - Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref"); - } else - llvm_unreachable("unknown Opcode found in complex address"); - } + if (Loc.getOffset()) { + DwarfExpr.AddMachineRegIndirect(Loc.getReg(), Loc.getOffset()); + DwarfExpr.AddExpression(Expr, PieceOffsetInBits); + } else + DwarfExpr.AddMachineRegExpression(Expr, Loc.getReg(), + PieceOffsetInBits); + if (DV.isIndirect()) + DwarfExpr.EmitOp(dwarf::DW_OP_deref); } } // else ... ignore constant fp. There is not any good way to @@ -2020,14 +1765,12 @@ void DwarfDebug::emitDebugLoc() { unsigned char Size = Asm->getDataLayout().getPointerSize(); for (const auto &DebugLoc : DotDebugLocEntries) { Asm->OutStreamer.EmitLabel(DebugLoc.Label); + const DwarfCompileUnit *CU = DebugLoc.CU; for (const auto &Entry : DebugLoc.List) { // Set up the range. This range is relative to the entry point of the // compile unit. This is a hard coded 0 for low_pc when we're emitting // ranges, or the DW_AT_low_pc on the compile unit otherwise. - const DwarfCompileUnit *CU = Entry.getCU(); - if (CU->getRanges().size() == 1) { - // Grab the begin symbol from the first range as our base. 
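// [Editorial aside] emitLocPieces above walks the sorted pieces of one
// variable and, where the next piece starts past the current offset, emits
// an empty DW_OP_piece for the hole, since DWARF describes gaps in a
// composite location with pieces that have no location. A standalone
// sketch of that walk (hypothetical types; offsets and sizes in bytes):
#include <cstdio>
#include <vector>

struct LocPiece { unsigned OffsetBytes, SizeBytes; };

static void walkPieces(const std::vector<LocPiece> &Sorted) {
  unsigned Offset = 0;
  for (const LocPiece &P : Sorted) {
    if (Offset < P.OffsetBytes) // hole before this piece: empty piece
      std::printf("DW_OP_piece %u (gap)\n", P.OffsetBytes - Offset);
    std::printf("<location> DW_OP_piece %u\n", P.SizeBytes);
    Offset = P.OffsetBytes + P.SizeBytes;
  }
}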
- const MCSymbol *Base = CU->getRanges()[0].getStart(); + if (auto *Base = CU->getBaseAddress()) { Asm->EmitLabelDifference(Entry.getBeginSym(), Base, Size); Asm->EmitLabelDifference(Entry.getEndSym(), Base, Size); } else { @@ -2157,6 +1900,10 @@ void DwarfDebug::emitDebugARanges() { for (DwarfCompileUnit *CU : CUs) { std::vector<ArangeSpan> &List = Spans[CU]; + // Describe the skeleton CU's offset and length, not the dwo file's. + if (auto *Skel = CU->getSkeleton()) + CU = Skel; + // Emit size of content not including length itself. unsigned ContentSize = sizeof(int16_t) + // DWARF ARange version number @@ -2179,7 +1926,7 @@ void DwarfDebug::emitDebugARanges() { Asm->OutStreamer.AddComment("DWARF Arange version number"); Asm->EmitInt16(dwarf::DW_ARANGES_VERSION); Asm->OutStreamer.AddComment("Offset Into Debug Info Section"); - Asm->EmitSectionOffset(CU->getLocalLabelBegin(), CU->getLocalSectionSym()); + Asm->EmitSectionOffset(CU->getLabelBegin(), CU->getSectionSym()); Asm->OutStreamer.AddComment("Address Size (in bytes)"); Asm->EmitInt8(PtrSize); Asm->OutStreamer.AddComment("Segment Size (in bytes)"); @@ -2223,6 +1970,9 @@ void DwarfDebug::emitDebugRanges() { for (const auto &I : CUMap) { DwarfCompileUnit *TheCU = I.second; + if (auto *Skel = TheCU->getSkeleton()) + TheCU = Skel; + // Iterate over the misc ranges for the compile units in the module. for (const RangeSpanList &List : TheCU->getRangeLists()) { // Emit our symbol so we can find the beginning of the range. @@ -2233,9 +1983,7 @@ void DwarfDebug::emitDebugRanges() { const MCSymbol *End = Range.getEnd(); assert(Begin && "Range without a begin symbol?"); assert(End && "Range without an end symbol?"); - if (TheCU->getRanges().size() == 1) { - // Grab the begin symbol from the first range as our base. - const MCSymbol *Base = TheCU->getRanges()[0].getStart(); + if (auto *Base = TheCU->getBaseAddress()) { Asm->EmitLabelDifference(Begin, Base, Size); Asm->EmitLabelDifference(End, Base, Size); } else { @@ -2248,23 +1996,6 @@ void DwarfDebug::emitDebugRanges() { Asm->OutStreamer.EmitIntValue(0, Size); Asm->OutStreamer.EmitIntValue(0, Size); } - - // Now emit a range for the CU itself. - if (TheCU->getRanges().size() > 1) { - Asm->OutStreamer.EmitLabel( - Asm->GetTempSymbol("cu_ranges", TheCU->getUniqueID())); - for (const RangeSpan &Range : TheCU->getRanges()) { - const MCSymbol *Begin = Range.getStart(); - const MCSymbol *End = Range.getEnd(); - assert(Begin && "Range without a begin symbol?"); - assert(End && "Range without an end symbol?"); - Asm->OutStreamer.EmitSymbolValue(Begin, Size); - Asm->OutStreamer.EmitSymbolValue(End, Size); - } - // And terminate the list with two 0 values. - Asm->OutStreamer.EmitIntValue(0, Size); - Asm->OutStreamer.EmitIntValue(0, Size); - } } } @@ -2272,11 +2003,11 @@ void DwarfDebug::emitDebugRanges() { void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, std::unique_ptr<DwarfUnit> NewU) { - NewU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name, - U.getCUNode().getSplitDebugFilename()); + NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name, + U.getCUNode().getSplitDebugFilename()); if (!CompilationDir.empty()) - NewU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir); + NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); addGnuPubAttributes(*NewU, Die); @@ -2301,31 +2032,13 @@ DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) { return NewCU; } -// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_dwo_name, -// DW_AT_addr_base. 
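// [Editorial aside] emitDebugLoc and emitDebugRanges above choose between
// two encodings per range: deltas from the CU base address when
// finalizeModuleInfo recorded one (EmitLabelDifference), otherwise absolute
// symbol values. A standalone sketch with raw addresses in place of
// MCSymbols:
#include <cstdint>
#include <cstdio>

static void emitRangeEntry(const uint64_t *Base, uint64_t Begin,
                           uint64_t End) {
  if (Base) // base-relative: two offsets from the CU's base address
    std::printf("offset %llu, offset %llu\n",
                (unsigned long long)(Begin - *Base),
                (unsigned long long)(End - *Base));
  else // absolute: two relocated addresses
    std::printf("addr 0x%llx, addr 0x%llx\n", (unsigned long long)Begin,
                (unsigned long long)End);
}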
-DwarfTypeUnit &DwarfDebug::constructSkeletonTU(DwarfTypeUnit &TU) { - DwarfCompileUnit &CU = static_cast<DwarfCompileUnit &>( - *SkeletonHolder.getUnits()[TU.getCU().getUniqueID()]); - - auto OwnedUnit = make_unique<DwarfTypeUnit>(TU.getUniqueID(), CU, Asm, this, - &SkeletonHolder); - DwarfTypeUnit &NewTU = *OwnedUnit; - NewTU.setTypeSignature(TU.getTypeSignature()); - NewTU.setType(nullptr); - NewTU.initSection( - Asm->getObjFileLowering().getDwarfTypesSection(TU.getTypeSignature())); - - initSkeletonUnit(TU, NewTU.getUnitDie(), std::move(OwnedUnit)); - return NewTU; -} - // Emit the .debug_info.dwo section for separated dwarf. This contains the // compile units that would normally be in debug_info. void DwarfDebug::emitDebugInfoDWO() { assert(useSplitDwarf() && "No split dwarf debug info?"); // Don't pass an abbrev symbol, using a constant zero instead so as not to // emit relocations into the dwo file. - InfoHolder.emitUnits(this, /* AbbrevSymbol */ nullptr); + InfoHolder.emitUnits(/* AbbrevSymbol */ nullptr); } // Emit the .debug_abbrev.dwo section for separated dwarf. This contains the @@ -2349,9 +2062,8 @@ void DwarfDebug::emitDebugStrDWO() { assert(useSplitDwarf() && "No split dwarf?"); const MCSection *OffSec = Asm->getObjFileLowering().getDwarfStrOffDWOSection(); - const MCSymbol *StrSym = DwarfStrSectionSym; InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(), - OffSec, StrSym); + OffSec); } MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) { @@ -2391,9 +2103,9 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, bool TopLevelType = TypeUnitsUnderConstruction.empty(); AddrPool.resetUsedFlag(); - auto OwnedUnit = - make_unique<DwarfTypeUnit>(InfoHolder.getUnits().size(), CU, Asm, this, - &InfoHolder, getDwoLineTable(CU)); + auto OwnedUnit = make_unique<DwarfTypeUnit>( + InfoHolder.getUnits().size() + TypeUnitsUnderConstruction.size(), CU, Asm, + this, &InfoHolder, getDwoLineTable(CU)); DwarfTypeUnit &NewTU = *OwnedUnit; DIE &UnitDie = NewTU.getUnitDie(); TU = &NewTU; @@ -2406,15 +2118,13 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, uint64_t Signature = makeTypeSignature(Identifier); NewTU.setTypeSignature(Signature); - if (!useSplitDwarf()) + if (useSplitDwarf()) + NewTU.initSection(Asm->getObjFileLowering().getDwarfTypesDWOSection()); + else { CU.applyStmtList(UnitDie); - - // FIXME: Skip using COMDAT groups for type units in the .dwo file once tools - // such as DWP ( http://gcc.gnu.org/wiki/DebugFissionDWP ) can cope with it. - NewTU.initSection( - useSplitDwarf() - ? Asm->getObjFileLowering().getDwarfTypesDWOSection(Signature) - : Asm->getObjFileLowering().getDwarfTypesSection(Signature)); + NewTU.initSection( + Asm->getObjFileLowering().getDwarfTypesSection(Signature)); + } NewTU.setType(NewTU.createTypeDIE(CTy)); @@ -2442,29 +2152,12 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, // If the type wasn't dependent on fission addresses, finish adding the type // and all its dependent types. 
- for (auto &TU : TypeUnitsToAdd) { - if (useSplitDwarf()) - TU.first->setSkeleton(constructSkeletonTU(*TU.first)); + for (auto &TU : TypeUnitsToAdd) InfoHolder.addUnit(std::move(TU.first)); - } } CU.addDIETypeSignature(RefDie, NewTU); } -void DwarfDebug::attachLowHighPC(DwarfCompileUnit &Unit, DIE &D, - MCSymbol *Begin, MCSymbol *End) { - assert(Begin && "Begin label should not be null!"); - assert(End && "End label should not be null!"); - assert(Begin->isDefined() && "Invalid starting label"); - assert(End->isDefined() && "Invalid end label"); - - Unit.addLabelAddress(D, dwarf::DW_AT_low_pc, Begin); - if (DwarfVersion < 4) - Unit.addLabelAddress(D, dwarf::DW_AT_high_pc, End); - else - Unit.addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin); -} - // Accelerator table mutators - add each name along with its companion // DIE to the proper table while ensuring that the name that we're going // to reference is in the string table. We do this since the names we diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index f2aa80845a05..a1a94264dddd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -11,29 +11,28 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__ -#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H -#include "DwarfFile.h" #include "AsmPrinterHandler.h" -#include "DIE.h" #include "DbgValueHistoryCalculator.h" #include "DebugLocEntry.h" #include "DebugLocList.h" #include "DwarfAccelTable.h" +#include "DwarfFile.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/FoldingSet.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MachineLocation.h" #include "llvm/Support/Allocator.h" - #include <memory> namespace llvm { @@ -70,6 +69,7 @@ public: /// \brief This class is used to track local variable information. class DbgVariable { DIVariable Var; // Variable Descriptor. + DIExpression Expr; // Complex address location expression. DIE *TheDIE; // Variable DIE. unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries. const MachineInstr *MInsn; // DBG_VALUE instruction of the variable. @@ -78,18 +78,22 @@ class DbgVariable { public: /// Construct a DbgVariable from a DIVariable. - DbgVariable(DIVariable V, DwarfDebug *DD) - : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(nullptr), - FrameIndex(~0), DD(DD) {} + DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD) + : Var(V), Expr(E), TheDIE(nullptr), DotDebugLocOffset(~0U), + MInsn(nullptr), FrameIndex(~0), DD(DD) { + assert(Var.Verify() && Expr.Verify()); + } /// Construct a DbgVariable from a DEBUG_VALUE. /// AbstractVar may be NULL. DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD) - : Var(DbgValue->getDebugVariable()), TheDIE(nullptr), - DotDebugLocOffset(~0U), MInsn(DbgValue), FrameIndex(~0), DD(DD) {} + : Var(DbgValue->getDebugVariable()), Expr(DbgValue->getDebugExpression()), + TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(DbgValue), + FrameIndex(~0), DD(DD) {} // Accessors. 
DIVariable getVariable() const { return Var; } + DIExpression getExpression() const { return Expr; } void setDIE(DIE &D) { TheDIE = &D; } DIE *getDIE() const { return TheDIE; } void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; } @@ -124,14 +128,14 @@ public: bool variableHasComplexAddress() const { assert(Var.isVariable() && "Invalid complex DbgVariable!"); - return Var.hasComplexAddress(); + return Expr.getNumElements() > 0; } bool isBlockByrefVariable() const; unsigned getNumAddrElements() const { assert(Var.isVariable() && "Invalid complex DbgVariable!"); - return Var.getNumAddrElements(); + return Expr.getNumElements(); } - uint64_t getAddrElement(unsigned i) const { return Var.getAddrElement(i); } + uint64_t getAddrElement(unsigned i) const { return Expr.getElement(i); } DIType getType() const; private: @@ -159,26 +163,15 @@ class DwarfDebug : public AsmPrinterHandler { // All DIEValues are allocated through this allocator. BumpPtrAllocator DIEValueAllocator; - // Handle to the compile unit used for the inline extension handling, - // this is just so that the DIEValue allocator has a place to store - // the particular elements. - // FIXME: Store these off of DwarfDebug instead? - DwarfCompileUnit *FirstCU; - // Maps MDNode with its corresponding DwarfCompileUnit. MapVector<const MDNode *, DwarfCompileUnit *> CUMap; // Maps subprogram MDNode with its corresponding DwarfCompileUnit. - DenseMap<const MDNode *, DwarfCompileUnit *> SPMap; + MapVector<const MDNode *, DwarfCompileUnit *> SPMap; // Maps a CU DIE with its corresponding DwarfCompileUnit. DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap; - /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can - /// be shared across CUs, that is why we keep the map here instead - /// of in DwarfCompileUnit. - DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap; - // List of all labels used in aranges generation. std::vector<SymbolCU> ArangeLabels; @@ -189,19 +182,8 @@ class DwarfDebug : public AsmPrinterHandler { typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType; SectionMapType SectionMap; - // List of arguments for current function. - SmallVector<DbgVariable *, 8> CurrentFnArguments; - LexicalScopes LScopes; - // Collection of abstract subprogram DIEs. - DenseMap<const MDNode *, DIE *> AbstractSPDies; - - // Collection of dbg variables of a scope. - typedef DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > - ScopeVariablesMap; - ScopeVariablesMap ScopeVariables; - // Collection of abstract variables. DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables; SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables; @@ -210,10 +192,6 @@ class DwarfDebug : public AsmPrinterHandler { // can refer to them in spite of insertions into this list. SmallVector<DebugLocList, 4> DotDebugLocEntries; - // Collection of subprogram DIEs that are marked (at the end of the module) - // as DW_AT_inline. - SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs; - // This is a collection of subprogram MDNodes that are processed to // create DIEs. SmallPtrSet<const MDNode *, 16> ProcessedSPNodes; @@ -243,10 +221,6 @@ class DwarfDebug : public AsmPrinterHandler { // If nonnull, stores the current machine instruction we're processing. const MachineInstr *CurMI; - // If nonnull, stores the section that the previous function was allocated to - // emitting. - const MCSection *PrevSection; - // If nonnull, stores the CU in which the previous subprogram was contained. 
const DwarfCompileUnit *PrevCU; @@ -258,6 +232,7 @@ class DwarfDebug : public AsmPrinterHandler { MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym; MCSymbol *FunctionBeginSym, *FunctionEndSym; MCSymbol *DwarfInfoDWOSectionSym, *DwarfAbbrevDWOSectionSym; + MCSymbol *DwarfTypesDWOSectionSym; MCSymbol *DwarfStrDWOSectionSym; MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym; @@ -322,6 +297,7 @@ class DwarfDebug : public AsmPrinterHandler { // True iff there are multiple CUs in this module. bool SingleCU; + bool IsDarwin; AddressPool AddrPool; @@ -330,9 +306,9 @@ class DwarfDebug : public AsmPrinterHandler { DwarfAccelTable AccelNamespace; DwarfAccelTable AccelTypes; - MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &); + DenseMap<const Function *, DISubprogram> FunctionDIs; - void addScopeVariable(LexicalScope *LS, DbgVariable *Var); + MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &); const SmallVectorImpl<std::unique_ptr<DwarfUnit>> &getUnits() { return InfoHolder.getUnits(); @@ -348,45 +324,8 @@ class DwarfDebug : public AsmPrinterHandler { void ensureAbstractVariableIsCreatedIfScoped(const DIVariable &Var, const MDNode *Scope); - /// \brief Find DIE for the given subprogram and attach appropriate - /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global - /// variables in this scope then create and insert DIEs for these - /// variables. - DIE &updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, DISubprogram SP); - - /// \brief A helper function to check whether the DIE for a given Scope is - /// going to be null. - bool isLexicalScopeDIENull(LexicalScope *Scope); - - /// \brief A helper function to construct a RangeSpanList for a given - /// lexical scope. - void addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE, - const SmallVectorImpl<InsnRange> &Range); - - /// \brief Construct new DW_TAG_lexical_block for this scope and - /// attach DW_AT_low_pc/DW_AT_high_pc labels. - std::unique_ptr<DIE> constructLexicalScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope); - - /// \brief This scope represents inlined body of a function. Construct - /// DIE to represent this concrete inlined copy of the function. - std::unique_ptr<DIE> constructInlinedScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope); - - /// \brief Construct a DIE for this scope. - std::unique_ptr<DIE> constructScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope); - void createAndAddScopeChildren(DwarfCompileUnit &TheCU, LexicalScope *Scope, - DIE &ScopeDIE); /// \brief Construct a DIE for this abstract scope. - void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope); - /// \brief Construct a DIE for this subprogram scope. - DIE &constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope); - /// A helper function to create children of a Scope DIE. - DIE *createScopeChildrenDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope, - SmallVectorImpl<std::unique_ptr<DIE>> &Children); + void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); /// \brief Emit initial Dwarf sections with a label at the start of each one. void emitSectionLabels(); @@ -422,6 +361,10 @@ class DwarfDebug : public AsmPrinterHandler { /// the line matrix. void emitEndOfLineMatrix(unsigned SectionEnd); + /// \brief Emit a specified accelerator table. 
+ void emitAccel(DwarfAccelTable &Accel, const MCSection *Section, + StringRef TableName, StringRef SymName); + /// \brief Emit visible names into a hashed accelerator table section. void emitAccelNames(); @@ -447,10 +390,9 @@ class DwarfDebug : public AsmPrinterHandler { /// index. void emitDebugPubTypes(bool GnuStyle = false); - void - emitDebugPubSection(bool GnuStyle, const MCSection *PSec, StringRef Name, - const StringMap<const DIE *> &(DwarfUnit::*Accessor)() - const); + void emitDebugPubSection( + bool GnuStyle, const MCSection *PSec, StringRef Name, + const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const); /// \brief Emit visible names into a debug str section. void emitDebugStr(); @@ -505,15 +447,8 @@ class DwarfDebug : public AsmPrinterHandler { DwarfCompileUnit &constructDwarfCompileUnit(DICompileUnit DIUnit); /// \brief Construct imported_module or imported_declaration DIE. - void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N); - - /// \brief Construct import_module DIE. - void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N, - DIE &Context); - - /// \brief Construct import_module DIE. - void constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const DIImportedEntity &Module, DIE &Context); + void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, + const MDNode *N); /// \brief Register a source line with debug info. Returns the unique /// label that was emitted and which provides correspondence to the @@ -525,38 +460,29 @@ class DwarfDebug : public AsmPrinterHandler { /// ending of a scope. void identifyScopeMarkers(); - /// \brief If Var is an current function argument that add it in - /// CurrentFnArguments list. - bool addCurrentFnArgument(DbgVariable *Var, LexicalScope *Scope); - /// \brief Populate LexicalScope entries with variables' info. - void collectVariableInfo(SmallPtrSet<const MDNode *, 16> &ProcessedVars); + void collectVariableInfo(DwarfCompileUnit &TheCU, DISubprogram SP, + SmallPtrSetImpl<const MDNode *> &ProcessedVars); + + /// \brief Build the location list for all DBG_VALUEs in the + /// function that describe the same variable. + void buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc, + const DbgValueHistoryMap::InstrRanges &Ranges); /// \brief Collect variable information from the side table maintained /// by MMI. - void collectVariableInfoFromMMITable(SmallPtrSet<const MDNode *, 16> &P); + void collectVariableInfoFromMMITable(SmallPtrSetImpl<const MDNode *> &P); /// \brief Ensure that a label will be emitted before MI. void requestLabelBeforeInsn(const MachineInstr *MI) { LabelsBeforeInsn.insert(std::make_pair(MI, nullptr)); } - /// \brief Return Label preceding the instruction. - MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); - /// \brief Ensure that a label will be emitted after MI. void requestLabelAfterInsn(const MachineInstr *MI) { LabelsAfterInsn.insert(std::make_pair(MI, nullptr)); } - /// \brief Return Label immediately following the instruction. - MCSymbol *getLabelAfterInsn(const MachineInstr *MI); - - void attachRangesOrLowHighPC(DwarfCompileUnit &Unit, DIE &D, - const SmallVectorImpl<InsnRange> &Ranges); - void attachLowHighPC(DwarfCompileUnit &Unit, DIE &D, MCSymbol *Begin, - MCSymbol *End); - public: //===--------------------------------------------------------------------===// // Main entry points. 
@@ -565,13 +491,6 @@ public: ~DwarfDebug() override; - void insertDIE(const MDNode *TypeMD, DIE *Die) { - MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); - } - DIE *getDIE(const MDNode *TypeMD) { - return MDTypeNodeToDieMap.lookup(TypeMD); - } - /// \brief Emit all Dwarf sections that should come prior to the /// content. void beginModule(); @@ -624,11 +543,15 @@ public: /// Returns the section symbol for the .debug_loc section. MCSymbol *getDebugLocSym() const { return DwarfDebugLocSectionSym; } - /// Returns the previous section that was emitted into. - const MCSection *getPrevSection() const { return PrevSection; } + /// Returns the section symbol for the .debug_str section. + MCSymbol *getDebugStrSym() const { return DwarfStrSectionSym; } + + /// Returns the section symbol for the .debug_ranges section. + MCSymbol *getRangeSectionSym() const { return DwarfDebugRangeSectionSym; } /// Returns the previous CU that was being updated const DwarfCompileUnit *getPrevCU() const { return PrevCU; } + void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; } /// Returns the entries for the .debug_loc section. const SmallVectorImpl<DebugLocList> & @@ -639,6 +562,14 @@ public: /// \brief Emit an entry for the debug loc section. This can be used to /// handle an entry that's going to be emitted into the debug loc section. void emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry); + /// \brief emit a single value for the debug loc section. + void emitDebugLocValue(ByteStreamer &Streamer, + const DebugLocEntry::Value &Value, + unsigned PieceOffsetInBits = 0); + /// Emits an optimal (=sorted) sequence of DW_OP_pieces. + void emitLocPieces(ByteStreamer &Streamer, + const DITypeIdentifierMap &Map, + ArrayRef<DebugLocEntry::Value> Values); /// Emit the location for a debug loc entry, including the size header. void emitDebugLocEntryLocation(const DebugLocEntry &Entry); @@ -672,6 +603,40 @@ public: void addAccelNamespace(StringRef Name, const DIE &Die); void addAccelType(StringRef Name, const DIE &Die, char Flags); + + const MachineFunction *getCurrentFunction() const { return CurFn; } + const MCSymbol *getFunctionBeginSym() const { return FunctionBeginSym; } + const MCSymbol *getFunctionEndSym() const { return FunctionEndSym; } + + iterator_range<ImportedEntityMap::const_iterator> + findImportedEntitiesForScope(const MDNode *Scope) const { + return make_range(std::equal_range( + ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(), + std::pair<const MDNode *, const MDNode *>(Scope, nullptr), + less_first())); + } + + /// \brief A helper function to check whether the DIE for a given Scope is + /// going to be null. + bool isLexicalScopeDIENull(LexicalScope *Scope); + + /// \brief Return Label preceding the instruction. + MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); + + /// \brief Return Label immediately following the instruction. + MCSymbol *getLabelAfterInsn(const MachineInstr *MI); + + // FIXME: Consider rolling ranges up into DwarfDebug since we use a single + // range_base anyway, so there's no need to keep them as separate per-CU range + // lists. (though one day we might end up with a range.dwo section, in which + // case it'd go to DwarfFile) + unsigned getNextRangeNumber() { return GlobalRangeCount++; } + + // FIXME: Sink these functions down into DwarfFile/Dwarf*Unit. 
+ + SmallPtrSet<const MDNode *, 16> &getProcessedSPNodes() { + return ProcessedSPNodes; + } }; } // End of namespace llvm diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h index 0440fce506d0..e8867c0a323c 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/lib/CodeGen/AsmPrinter/DwarfException.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H -#define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H #include "EHStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -81,39 +81,6 @@ public: /// endFunction - Gather and emit post-function exception information. void endFunction(const MachineFunction *) override; }; - -class Win64Exception : public EHStreamer { - /// shouldEmitPersonality - Per-function flag to indicate if personality - /// info should be emitted. - bool shouldEmitPersonality; - - /// shouldEmitLSDA - Per-function flag to indicate if the LSDA - /// should be emitted. - bool shouldEmitLSDA; - - /// shouldEmitMoves - Per-function flag to indicate if frame moves info - /// should be emitted. - bool shouldEmitMoves; - -public: - //===--------------------------------------------------------------------===// - // Main entry points. - // - Win64Exception(AsmPrinter *A); - virtual ~Win64Exception(); - - /// endModule - Emit all exception information that should come after the - /// content. - void endModule() override; - - /// beginFunction - Gather pre-function exception information. Assumes being - /// emitted immediately after the function entry point. - void beginFunction(const MachineFunction *MF) override; - - /// endFunction - Gather and emit post-function exception information. - void endFunction(const MachineFunction *) override; -}; - } // End of namespace llvm #endif diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp new file mode 100644 index 000000000000..8e85effa965e --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -0,0 +1,260 @@ +//===-- llvm/CodeGen/DwarfExpression.cpp - Dwarf Debug Framework ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for writing dwarf debug info into asm files. 
+// +//===----------------------------------------------------------------------===// + +#include "DwarfExpression.h" +#include "DwarfDebug.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +const TargetRegisterInfo *DwarfExpression::getTRI() const { + return AP.TM.getSubtargetImpl()->getRegisterInfo(); +} + +unsigned DwarfExpression::getDwarfVersion() const { + return AP.getDwarfDebug()->getDwarfVersion(); +} + +void DwarfExpression::AddReg(int DwarfReg, const char *Comment) { + assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + if (DwarfReg < 32) { + EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment); + } else { + EmitOp(dwarf::DW_OP_regx, Comment); + EmitUnsigned(DwarfReg); + } +} + +void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) { + assert(DwarfReg >= 0 && "invalid negative dwarf register number"); + if (DwarfReg < 32) { + EmitOp(dwarf::DW_OP_breg0 + DwarfReg); + } else { + EmitOp(dwarf::DW_OP_bregx); + EmitUnsigned(DwarfReg); + } + EmitSigned(Offset); + if (Deref) + EmitOp(dwarf::DW_OP_deref); +} + +void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) { + assert(SizeInBits > 0 && "piece has size zero"); + const unsigned SizeOfByte = 8; + if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { + EmitOp(dwarf::DW_OP_bit_piece); + EmitUnsigned(SizeInBits); + EmitUnsigned(OffsetInBits); + } else { + EmitOp(dwarf::DW_OP_piece); + unsigned ByteSize = SizeInBits / SizeOfByte; + EmitUnsigned(ByteSize); + } +} + +void DwarfExpression::AddShr(unsigned ShiftBy) { + EmitOp(dwarf::DW_OP_constu); + EmitUnsigned(ShiftBy); + EmitOp(dwarf::DW_OP_shr); +} + +bool DwarfExpression::AddMachineRegIndirect(unsigned MachineReg, int Offset) { + int DwarfReg = getTRI()->getDwarfRegNum(MachineReg, false); + if (DwarfReg < 0) + return false; + + if (isFrameRegister(MachineReg)) { + // If variable offset is based in frame register then use fbreg. + EmitOp(dwarf::DW_OP_fbreg); + EmitSigned(Offset); + } else { + AddRegIndirect(DwarfReg, Offset); + } + return true; +} + +bool DwarfExpression::AddMachineRegPiece(unsigned MachineReg, + unsigned PieceSizeInBits, + unsigned PieceOffsetInBits) { + const TargetRegisterInfo *TRI = getTRI(); + int Reg = TRI->getDwarfRegNum(MachineReg, false); + + // If this is a valid register number, emit it. + if (Reg >= 0) { + AddReg(Reg); + if (PieceSizeInBits) + AddOpPiece(PieceSizeInBits, PieceOffsetInBits); + return true; + } + + // Walk up the super-register chain until we find a valid number. + // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0. + for (MCSuperRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) { + Reg = TRI->getDwarfRegNum(*SR, false); + if (Reg >= 0) { + unsigned Idx = TRI->getSubRegIndex(*SR, MachineReg); + unsigned Size = TRI->getSubRegIdxSize(Idx); + unsigned RegOffset = TRI->getSubRegIdxOffset(Idx); + AddReg(Reg, "super-register"); + if (PieceOffsetInBits == RegOffset) { + AddOpPiece(Size, RegOffset); + } else { + // If this is part of a variable in a sub-register at a + // non-zero offset, we need to manually shift the value into + // place, since the DW_OP_piece describes the part of the + // variable, not the position of the subregister. 
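+        // Illustration with made-up numbers: for a 32-bit variable kept in
+        // the upper half of a 64-bit super-register (RegOffset == 32,
+        // Size == 32, PieceOffsetInBits == 0), the code below emits
+        //   DW_OP_regx <super>, DW_OP_constu 32, DW_OP_shr, DW_OP_piece 4
+        // i.e. the value is shifted down first and only then described as
+        // a 4-byte piece of the variable.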
+        if (RegOffset)
+          AddShr(RegOffset);
+        AddOpPiece(Size, PieceOffsetInBits);
+      }
+      return true;
+    }
+  }
+
+  // Otherwise, attempt to find a covering set of sub-register numbers.
+  // For example, Q0 on ARM is a composition of D0+D1.
+  //
+  // Keep track of the current position so we can emit the more
+  // efficient DW_OP_piece.
+  unsigned CurPos = PieceOffsetInBits;
+  // The size of the register in bits, assuming 8 bits per byte.
+  unsigned RegSize = TRI->getMinimalPhysRegClass(MachineReg)->getSize() * 8;
+  // Keep track of the bits in the register we already emitted, so we
+  // can avoid emitting redundant aliasing subregs.
+  SmallBitVector Coverage(RegSize, false);
+  for (MCSubRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) {
+    unsigned Idx = TRI->getSubRegIndex(MachineReg, *SR);
+    unsigned Size = TRI->getSubRegIdxSize(Idx);
+    unsigned Offset = TRI->getSubRegIdxOffset(Idx);
+    Reg = TRI->getDwarfRegNum(*SR, false);
+
+    // Intersection between the bits we already emitted and the bits
+    // covered by this subregister.
+    SmallBitVector Intersection(RegSize, false);
+    Intersection.set(Offset, Offset + Size);
+    Intersection ^= Coverage;
+
+    // If this sub-register has a DWARF number and we haven't covered
+    // its range, emit a DWARF piece for it.
+    if (Reg >= 0 && Intersection.any()) {
+      AddReg(Reg, "sub-register");
+      AddOpPiece(Size, Offset == CurPos ? 0 : Offset);
+      CurPos = Offset + Size;
+
+      // Mark it as emitted.
+      Coverage.set(Offset, Offset + Size);
+    }
+  }
+
+  return CurPos > PieceOffsetInBits;
+}
+
+void DwarfExpression::AddSignedConstant(int Value) {
+  EmitOp(dwarf::DW_OP_consts);
+  EmitSigned(Value);
+  // The proper way to describe a constant value is
+  // DW_OP_constu <const>, DW_OP_stack_value.
+  // Unfortunately, DW_OP_stack_value was not available until DWARF-4,
+  // so we will continue to generate DW_OP_constu <const> for DWARF-2
+  // and DWARF-3. Technically, this is incorrect since DW_OP_const <const>
+  // actually describes a value at a constant address, not a constant value.
+  // However, in the past there was no better way to describe a constant
+  // value, so the producers and consumers started to rely on heuristics
+  // to disambiguate the value vs. location status of the expression.
+  // See PR21176 for more details.
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+void DwarfExpression::AddUnsignedConstant(unsigned Value) {
+  EmitOp(dwarf::DW_OP_constu);
+  EmitUnsigned(Value);
+  // cf. comment in DwarfExpression::AddSignedConstant().
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+static unsigned getOffsetOrZero(unsigned OffsetInBits,
+                                unsigned PieceOffsetInBits) {
+  if (OffsetInBits == PieceOffsetInBits)
+    return 0;
+  assert(OffsetInBits >= PieceOffsetInBits && "overlapping pieces");
+  return OffsetInBits;
+}
+
+bool DwarfExpression::AddMachineRegExpression(DIExpression Expr,
+                                              unsigned MachineReg,
+                                              unsigned PieceOffsetInBits) {
+  unsigned N = Expr.getNumElements();
+  unsigned I = 0;
+  bool ValidReg = false;
+  // Pattern-match combinations for which more efficient representations exist
+  // first.
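+  // For illustration, the three shapes recognized below (expression
+  // elements on the left, emitted location description on the right,
+  // <reg> being the DWARF number of MachineReg):
+  //   [DW_OP_piece, offset, size]       --> piece of <reg>
+  //   [DW_OP_plus, offset, DW_OP_deref] --> DW_OP_breg<reg> offset
+  //   [DW_OP_deref]                     --> DW_OP_breg<reg> 0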
+  if (N >= 3 && Expr.getElement(0) == dwarf::DW_OP_piece) {
+    unsigned SizeOfByte = 8;
+    unsigned OffsetInBits = Expr.getElement(1) * SizeOfByte;
+    unsigned SizeInBits = Expr.getElement(2) * SizeOfByte;
+    ValidReg =
+        AddMachineRegPiece(MachineReg, SizeInBits,
+                           getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+    I = 3;
+  } else if (N >= 3 && Expr.getElement(0) == dwarf::DW_OP_plus &&
+             Expr.getElement(2) == dwarf::DW_OP_deref) {
+    // [DW_OP_reg,Offset,DW_OP_plus,DW_OP_deref] --> [DW_OP_breg,Offset].
+    unsigned Offset = Expr.getElement(1);
+    ValidReg = AddMachineRegIndirect(MachineReg, Offset);
+    I = 3;
+  } else if (N >= 1 && Expr.getElement(0) == dwarf::DW_OP_deref) {
+    // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
+    ValidReg = AddMachineRegIndirect(MachineReg);
+    I = 1;
+  } else
+    ValidReg = AddMachineRegPiece(MachineReg);
+
+  if (!ValidReg)
+    return false;
+
+  // Emit remaining elements of the expression.
+  AddExpression(Expr, I);
+  return true;
+}
+
+void DwarfExpression::AddExpression(DIExpression Expr, unsigned I,
+                                    unsigned PieceOffsetInBits) {
+  unsigned N = Expr.getNumElements();
+  for (; I < N; ++I) {
+    switch (Expr.getElement(I)) {
+    case dwarf::DW_OP_piece: {
+      unsigned SizeOfByte = 8;
+      unsigned OffsetInBits = Expr.getElement(++I) * SizeOfByte;
+      unsigned SizeInBits = Expr.getElement(++I) * SizeOfByte;
+      AddOpPiece(SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+      break;
+    }
+    case dwarf::DW_OP_plus:
+      EmitOp(dwarf::DW_OP_plus_uconst);
+      EmitUnsigned(Expr.getElement(++I));
+      break;
+    case dwarf::DW_OP_deref:
+      EmitOp(dwarf::DW_OP_deref);
+      break;
+    default:
+      llvm_unreachable("unhandled opcode found in DIExpression");
+    }
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
new file mode 100644
index 000000000000..92e4d5d8a290
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -0,0 +1,133 @@
+//===-- llvm/CodeGen/DwarfExpression.h - Dwarf Expressions ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class TargetRegisterInfo;
+class DwarfUnit;
+class DIELoc;
+
+/// Base class containing the logic for constructing DWARF expressions
+/// independently of whether they are emitted into a DIE or into a .debug_loc
+/// entry.
+class DwarfExpression {
+protected:
+  const AsmPrinter &AP;
+  // Various convenience accessors that extract things out of AsmPrinter.
+  const TargetRegisterInfo *getTRI() const;
+  unsigned getDwarfVersion() const;
+
+public:
+  DwarfExpression(const AsmPrinter &AP) : AP(AP) {}
+  virtual ~DwarfExpression() {}
+
+  /// Output a dwarf operand and an optional assembler comment.
+  virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+  /// Emit a raw signed value.
+  virtual void EmitSigned(int Value) = 0;
+  /// Emit a raw unsigned value.
+  virtual void EmitUnsigned(unsigned Value) = 0;
+  /// Return whether the given machine register is the frame register in the
+  /// current function.
+  virtual bool isFrameRegister(unsigned MachineReg) = 0;
+
+  /// Emit a dwarf register operation.
+  void AddReg(int DwarfReg, const char *Comment = nullptr);
+  /// Emit an (optionally double-)indirect dwarf register operation.
+  void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+
+  /// Emit a dwarf register operation for describing
+  /// - a small value occupying only part of a register or
+  /// - a register representing only part of a value.
+  void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+  /// Emit a shift-right dwarf expression.
+  void AddShr(unsigned ShiftBy);
+
+  /// Emit an indirect dwarf register operation for the given machine register.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegIndirect(unsigned MachineReg, int Offset = 0);
+
+  /// \brief Emit a partial DWARF register operation.
+  /// \param MachineReg        the register
+  /// \param PieceSizeInBits   size and
+  /// \param PieceOffsetInBits offset of the piece in bits, if this is one
+  ///                          piece of an aggregate value.
+  ///
+  /// If size and offset are zero, an operation for the entire
+  /// register is emitted. Some targets do not provide a DWARF
+  /// register number for every register. If this is the case, this
+  /// function will attempt to emit a DWARF register by emitting a
+  /// piece of a super-register or by piecing together multiple
+  /// subregisters that alias the register.
+  ///
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegPiece(unsigned MachineReg, unsigned PieceSizeInBits = 0,
+                          unsigned PieceOffsetInBits = 0);
+
+  /// Emit a signed constant.
+  void AddSignedConstant(int Value);
+  /// Emit an unsigned constant.
+  void AddUnsignedConstant(unsigned Value);
+
+  /// Emit an entire DIExpression on top of a machine register location.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegExpression(DIExpression Expr, unsigned MachineReg,
+                               unsigned PieceOffsetInBits = 0);
+  /// Emit the operations in a DIExpression, starting from element I.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  void AddExpression(DIExpression Expr, unsigned I = 0,
+                     unsigned PieceOffsetInBits = 0);
+};
+
+/// DwarfExpression implementation for .debug_loc entries.
+class DebugLocDwarfExpression : public DwarfExpression {
+  ByteStreamer &BS;
+
+public:
+  DebugLocDwarfExpression(const AsmPrinter &AP, ByteStreamer &BS)
+      : DwarfExpression(AP), BS(BS) {}
+
+  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void EmitSigned(int Value) override;
+  void EmitUnsigned(unsigned Value) override;
+  bool isFrameRegister(unsigned MachineReg) override;
+};
+
+/// DwarfExpression implementation for singular DW_AT_location.
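+///
+/// A minimal usage sketch (it mirrors DwarfUnit::addRegisterOpPiece from
+/// this patch; the caller attaches the DIELoc to a DIE afterwards):
+///   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+///   DIEDwarfExpression Expr(*Asm, *this, *Loc);
+///   Expr.AddMachineRegPiece(Reg);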
+class DIEDwarfExpression : public DwarfExpression { + DwarfUnit &DU; + DIELoc &DIE; + +public: + DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE) + : DwarfExpression(AP), DU(DU), DIE(DIE) {} + + void EmitOp(uint8_t Op, const char *Comment = nullptr) override; + void EmitSigned(int Value) override; + void EmitUnsigned(unsigned Value) override; + bool isFrameRegister(unsigned MachineReg) override; +}; +} + +#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 737ee54f94b5..549abf88324b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -8,18 +8,18 @@ //===----------------------------------------------------------------------===// #include "DwarfFile.h" - #include "DwarfDebug.h" #include "DwarfUnit.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/DataLayout.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/LEB128.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { -DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA) - : Asm(AP), StrPool(DA, *Asm, Pref) {} +DwarfFile::DwarfFile(AsmPrinter *AP, DwarfDebug &DD, StringRef Pref, + BumpPtrAllocator &DA) + : Asm(AP), DD(DD), StrPool(DA, *Asm, Pref) {} DwarfFile::~DwarfFile() {} @@ -48,25 +48,18 @@ void DwarfFile::addUnit(std::unique_ptr<DwarfUnit> U) { // Emit the various dwarf units to the unit section USection with // the abbreviations going into ASection. -void DwarfFile::emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym) { +void DwarfFile::emitUnits(const MCSymbol *ASectionSym) { for (const auto &TheU : CUs) { DIE &Die = TheU->getUnitDie(); const MCSection *USection = TheU->getSection(); Asm->OutStreamer.SwitchSection(USection); - // Emit the compile units header. - Asm->OutStreamer.EmitLabel(TheU->getLabelBegin()); - - // Emit size of content not including length itself - Asm->OutStreamer.AddComment("Length of Unit"); - Asm->EmitInt32(TheU->getHeaderSize() + Die.getSize()); - TheU->emitHeader(ASectionSym); - DD->emitDIE(Die); - Asm->OutStreamer.EmitLabel(TheU->getLabelEnd()); + DD.emitDIE(Die); } } + // Compute the size and offset for each DIE. void DwarfFile::computeSizeAndOffsets() { // Offset from the first CU in the debug info section is 0 initially. @@ -149,8 +142,44 @@ void DwarfFile::emitAbbrevs(const MCSection *Section) { // Emit strings into a string section. void DwarfFile::emitStrings(const MCSection *StrSection, - const MCSection *OffsetSection, - const MCSymbol *StrSecSym) { - StrPool.emit(*Asm, StrSection, OffsetSection, StrSecSym); + const MCSection *OffsetSection) { + StrPool.emit(*Asm, StrSection, OffsetSection); +} + +void DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) { + SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS]; + DIVariable DV = Var->getVariable(); + // Variables with positive arg numbers are parameters. + if (unsigned ArgNum = DV.getArgNumber()) { + // Keep all parameters in order at the start of the variable list to ensure + // function types are correct (no out-of-order parameters) + // + // This could be improved by only doing it for optimized builds (unoptimized + // builds have the right order to begin with), searching from the back (this + // would catch the unoptimized case quickly), or doing a binary search + // rather than linear search. 
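+    // For example, with Vars == [arg1, arg3, local] and ArgNum == 2, the
+    // scan below stops at arg3 and yields [arg1, arg2, arg3, local].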
+    auto I = Vars.begin();
+    while (I != Vars.end()) {
+      unsigned CurNum = (*I)->getVariable().getArgNumber();
+      // A local (non-parameter) variable has been found, insert immediately
+      // before it.
+      if (CurNum == 0)
+        break;
+      // A later indexed parameter has been found, insert immediately before it.
+      if (CurNum > ArgNum)
+        break;
+      // FIXME: There are still some cases where two inlined functions are
+      // conflated together (two calls to the same function at the same
+      // location (eg: via a macro, or without column info, etc)) and then
+      // their arguments are conflated as well.
+      assert((LS->getParent() || CurNum != ArgNum) &&
+             "Duplicate argument for top level (non-inlined) function");
+      ++I;
+    }
+    Vars.insert(I, Var);
+    return;
+  }
+
+  Vars.push_back(Var);
+}
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 3985eb28cd6d..f14d673bf6ac 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -7,27 +7,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DWARFFILE_H__
-#define CODEGEN_ASMPRINTER_DWARFFILE_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 
+#include "AddressPool.h"
+#include "DwarfStringPool.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Allocator.h"
-#include "AddressPool.h"
-#include "DwarfStringPool.h"
-
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>
 
 namespace llvm {
 class AsmPrinter;
+class DbgVariable;
 class DwarfUnit;
 class DIEAbbrev;
 class MCSymbol;
 class DIE;
+class DISubprogram;
+class LexicalScope;
 class StringRef;
 class DwarfDebug;
 class MCSection;
@@ -35,6 +37,8 @@ class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
   AsmPrinter *Asm;
 
+  DwarfDebug &DD;
+
   // Used to uniquely define abbreviations.
   FoldingSet<DIEAbbrev> AbbreviationsSet;
@@ -46,8 +50,20 @@ class DwarfFile {
 
   DwarfStringPool StrPool;
 
+  // Collection of dbg variables of a scope.
+  DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> ScopeVariables;
+
+  // Collection of abstract subprogram DIEs.
+  DenseMap<const MDNode *, DIE *> AbstractSPDies;
+
+  /// Maps MDNodes of the type system to the corresponding DIEs. These DIEs
+  /// can be shared across CUs, that is why we keep the map here instead
+  /// of in DwarfCompileUnit.
+  DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
+
 public:
-  DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
+  DwarfFile(AsmPrinter *AP, DwarfDebug &DD, StringRef Pref,
+            BumpPtrAllocator &DA);
 
   ~DwarfFile();
@@ -67,18 +83,34 @@ public:
 
   /// \brief Emit all of the units to the section listed with the given
   /// abbreviation section.
-  void emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym);
+  void emitUnits(const MCSymbol *ASectionSym);
 
   /// \brief Emit a set of abbreviations to the specific section.
   void emitAbbrevs(const MCSection *);
 
   /// \brief Emit all of the strings to the section given.
   void emitStrings(const MCSection *StrSection,
-                   const MCSection *OffsetSection = nullptr,
-                   const MCSymbol *StrSecSym = nullptr);
+                   const MCSection *OffsetSection = nullptr);
 
   /// \brief Returns the string pool.
DwarfStringPool &getStringPool() { return StrPool; } + + void addScopeVariable(LexicalScope *LS, DbgVariable *Var); + + DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() { + return ScopeVariables; + } + + DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { + return AbstractSPDies; + } + + void insertDIE(const MDNode *TypeMD, DIE *Die) { + MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); + } + DIE *getDIE(const MDNode *TypeMD) { + return MDTypeNodeToDieMap.lookup(TypeMD); + } }; } #endif diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index 72cab60e2321..d76b66cac69f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp @@ -12,14 +12,11 @@ using namespace llvm; -MCSymbol *DwarfStringPool::getSectionSymbol() { return SectionSymbol; } - static std::pair<MCSymbol *, unsigned> & getEntry(AsmPrinter &Asm, StringMap<std::pair<MCSymbol *, unsigned>, BumpPtrAllocator &> &Pool, StringRef Prefix, StringRef Str) { - std::pair<MCSymbol *, unsigned> &Entry = - Pool.GetOrCreateValue(Str).getValue(); + std::pair<MCSymbol *, unsigned> &Entry = Pool[Str]; if (!Entry.first) { Entry.second = Pool.size() - 1; Entry.first = Asm.GetTempSymbol(Prefix, Entry.second); @@ -36,8 +33,7 @@ unsigned DwarfStringPool::getIndex(AsmPrinter &Asm, StringRef Str) { } void DwarfStringPool::emit(AsmPrinter &Asm, const MCSection *StrSection, - const MCSection *OffsetSection, - const MCSymbol *StrSecSym) { + const MCSection *OffsetSection) { if (Pool.empty()) return; diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/lib/CodeGen/AsmPrinter/DwarfStringPool.h index c1615fb4a297..63e34123d858 100644 --- a/lib/CodeGen/AsmPrinter/DwarfStringPool.h +++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.h @@ -7,13 +7,12 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_STRINGPOOL_H__ -#define CODEGEN_ASMPRINTER_STRINGPOOL_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/Support/Allocator.h" - #include <utility> namespace llvm { @@ -28,18 +27,13 @@ class StringRef; class DwarfStringPool { StringMap<std::pair<MCSymbol *, unsigned>, BumpPtrAllocator &> Pool; StringRef Prefix; - MCSymbol *SectionSymbol; public: DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix) - : Pool(A), Prefix(Prefix), SectionSymbol(Asm.GetTempSymbol(Prefix)) {} + : Pool(A), Prefix(Prefix) {} void emit(AsmPrinter &Asm, const MCSection *StrSection, - const MCSection *OffsetSection = nullptr, - const MCSymbol *StrSecSym = nullptr); - - /// \brief Returns the entry into the start of the pool. - MCSymbol *getSectionSymbol(); + const MCSection *OffsetSection = nullptr); /// \brief Returns an entry into the string pool with the given /// string text. 
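With the per-pool section symbol removed above, a unit that wants an indexed
string goes through the pool directly. A minimal sketch, assuming the
DwarfFile/DwarfStringPool interfaces introduced in this patch (the attribute,
form, and string shown are illustrative, not taken from a caller in this
diff):

  // Request (or create) the pool entry and remember its index.
  unsigned Idx = DU->getStringPool().getIndex(*Asm, "example");
  // Wrap the index in a DIEInteger and attach it as an indexed string.
  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Idx);
  DIEValue *Str = new (DIEValueAllocator) DIEString(Value, "example");
  Die.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_GNU_str_index, Str);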
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 9538bee7bca8..3d3da2ac1d31 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -13,7 +13,9 @@ #include "DwarfUnit.h" #include "DwarfAccelTable.h" +#include "DwarfCompileUnit.h" #include "DwarfDebug.h" +#include "DwarfExpression.h" #include "llvm/ADT/APFloat.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" @@ -30,6 +32,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -40,24 +43,30 @@ GenerateDwarfTypeUnits("generate-type-units", cl::Hidden, cl::desc("Generate DWARF4 type units."), cl::init(false)); +void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) { + DU.addUInt(DIE, dwarf::DW_FORM_data1, Op); +} +void DIEDwarfExpression::EmitSigned(int Value) { + DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value); +} +void DIEDwarfExpression::EmitUnsigned(unsigned Value) { + DU.addUInt(DIE, dwarf::DW_FORM_udata, Value); +} +bool DIEDwarfExpression::isFrameRegister(unsigned MachineReg) { + return MachineReg == getTRI()->getFrameRegister(*AP.MF); +} + + /// Unit - Unit constructor. DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) : UniqueID(UID), CUNode(Node), UnitDie(UnitTag), DebugInfoOffset(0), Asm(A), - DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr), - Skeleton(nullptr) { + DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr) { assert(UnitTag == dwarf::DW_TAG_compile_unit || UnitTag == dwarf::DW_TAG_type_unit); DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); } -DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node, - AsmPrinter *A, DwarfDebug *DW, - DwarfFile *DWU) - : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU) { - insertDIE(Node, &getUnitDie()); -} - DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable) @@ -146,7 +155,7 @@ static bool isShareableAcrossCUs(DIDescriptor D) { /// will be kept in DwarfDebug for shareable DIEs. DIE *DwarfUnit::getDIE(DIDescriptor D) const { if (isShareableAcrossCUs(D)) - return DD->getDIE(D); + return DU->getDIE(D); return MDNodeToDieMap.lookup(D); } @@ -155,7 +164,7 @@ DIE *DwarfUnit::getDIE(DIDescriptor D) const { /// will be kept in DwarfDebug for shareable DIEs. void DwarfUnit::insertDIE(DIDescriptor Desc, DIE *D) { if (isShareableAcrossCUs(Desc)) { - DD->insertDIE(Desc, D); + DU->insertDIE(Desc, D); return; } MDNodeToDieMap.insert(std::make_pair(Desc, D)); @@ -206,10 +215,14 @@ void DwarfUnit::addSInt(DIELoc &Die, Optional<dwarf::Form> Form, /// table. 
void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute, StringRef String) { - - if (!DD->useSplitDwarf()) + if (!isDwoUnit()) return addLocalString(Die, Attribute, String); + addIndexedString(Die, Attribute, String); +} + +void DwarfUnit::addIndexedString(DIE &Die, dwarf::Attribute Attribute, + StringRef String) { unsigned idx = DU->getStringPool().getIndex(*Asm, String); DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx); DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String); @@ -224,31 +237,12 @@ void DwarfUnit::addLocalString(DIE &Die, dwarf::Attribute Attribute, DIEValue *Value; if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) Value = new (DIEValueAllocator) DIELabel(Symb); - else { - MCSymbol *StringPool = DU->getStringPool().getSectionSymbol(); - Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool); - } + else + Value = new (DIEValueAllocator) DIEDelta(Symb, DD->getDebugStrSym()); DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String); Die.addValue(Attribute, dwarf::DW_FORM_strp, Str); } -/// addExpr - Add a Dwarf expression attribute data and value. -/// -void DwarfUnit::addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr) { - DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr); - Die.addValue((dwarf::Attribute)0, Form, Value); -} - -/// addLocationList - Add a Dwarf loclistptr attribute data and value. -/// -void DwarfUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute, - unsigned Index) { - DIEValue *Value = new (DIEValueAllocator) DIELocList(Index); - dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4; - Die.addValue(Attribute, Form, Value); -} - /// addLabel - Add a Dwarf label attribute data and value. /// void DwarfUnit::addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, @@ -261,16 +255,6 @@ void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) { addLabel(Die, (dwarf::Attribute)0, Form, Label); } -/// addSectionLabel - Add a Dwarf section label attribute data and value. -/// -void DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label) { - if (DD->getDwarfVersion() >= 4) - addLabel(Die, Attribute, dwarf::DW_FORM_sec_offset, Label); - else - addLabel(Die, Attribute, dwarf::DW_FORM_data4, Label); -} - /// addSectionOffset - Add an offset into a section attribute data and value. /// void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute, @@ -281,45 +265,6 @@ void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute, addUInt(Die, Attribute, dwarf::DW_FORM_data4, Integer); } -/// addLabelAddress - Add a dwarf label attribute data and value using -/// DW_FORM_addr or DW_FORM_GNU_addr_index. -/// -void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label) { - - if (!DD->useSplitDwarf()) - return addLocalLabelAddress(Die, Attribute, Label); - - if (Label) - DD->addArangeLabel(SymbolCU(this, Label)); - - unsigned idx = DD->getAddressPool().getIndex(Label); - DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx); - Die.addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value); -} - -void DwarfCompileUnit::addLocalLabelAddress(DIE &Die, - dwarf::Attribute Attribute, - const MCSymbol *Label) { - if (Label) - DD->addArangeLabel(SymbolCU(this, Label)); - - Die.addValue(Attribute, dwarf::DW_FORM_addr, - Label ? 
(DIEValue *)new (DIEValueAllocator) DIELabel(Label) - : new (DIEValueAllocator) DIEInteger(0)); -} - -unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) { - // If we print assembly, we can't separate .file entries according to - // compile units. Thus all files will belong to the default compile unit. - - // FIXME: add a better feature test than hasRawTextSupport. Even better, - // extend .file to support this. - return Asm->OutStreamer.EmitDwarfFileDirective( - 0, DirName, FileName, - Asm->OutStreamer.hasRawTextSupport() ? 0 : getUniqueID()); -} - unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) { return SplitLineTable ? SplitLineTable->getFile(DirName, FileName) : getCU().getOrCreateSourceID(FileName, DirName); @@ -339,16 +284,6 @@ void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) { } } -/// addSectionDelta - Add a section label delta attribute data and value. -/// -void DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Hi, const MCSymbol *Lo) { - DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo); - Die.addValue(Attribute, DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset - : dwarf::DW_FORM_data4, - Value); -} - void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, const MCSymbol *Lo) { DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo); @@ -477,137 +412,19 @@ void DwarfUnit::addSourceLine(DIE &Die, DINameSpace NS) { addSourceLine(Die, NS.getLineNumber(), NS.getFilename(), NS.getDirectory()); } -/// addVariableAddress - Add DW_AT_location attribute for a -/// DbgVariable based on provided MachineLocation. -void DwarfUnit::addVariableAddress(const DbgVariable &DV, DIE &Die, - MachineLocation Location) { - if (DV.variableHasComplexAddress()) - addComplexAddress(DV, Die, dwarf::DW_AT_location, Location); - else if (DV.isBlockByrefVariable()) - addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location); - else - addAddress(Die, dwarf::DW_AT_location, Location, - DV.getVariable().isIndirect()); -} - /// addRegisterOp - Add register operand. -void DwarfUnit::addRegisterOp(DIELoc &TheDie, unsigned Reg) { - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - int DWReg = RI->getDwarfRegNum(Reg, false); - bool isSubRegister = DWReg < 0; - - unsigned Idx = 0; - - // Go up the super-register chain until we hit a valid dwarf register number. - for (MCSuperRegIterator SR(Reg, RI); SR.isValid() && DWReg < 0; ++SR) { - DWReg = RI->getDwarfRegNum(*SR, false); - if (DWReg >= 0) - Idx = RI->getSubRegIndex(*SR, Reg); - } - - if (DWReg < 0) { - DEBUG(dbgs() << "Invalid Dwarf register number.\n"); - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_nop); - return; - } - - // Emit register - if (DWReg < 32) - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg); - else { - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(TheDie, dwarf::DW_FORM_udata, DWReg); - } - - // Emit Mask - if (isSubRegister) { - unsigned Size = RI->getSubRegIdxSize(Idx); - unsigned Offset = RI->getSubRegIdxOffset(Idx); - if (Offset > 0) { - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece); - addUInt(TheDie, dwarf::DW_FORM_data1, Size); - addUInt(TheDie, dwarf::DW_FORM_data1, Offset); - } else { - unsigned ByteSize = Size / 8; // Assuming 8 bits per byte. 
- addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_piece); - addUInt(TheDie, dwarf::DW_FORM_data1, ByteSize); - } - } +bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg, + unsigned SizeInBits, unsigned OffsetInBits) { + DIEDwarfExpression Expr(*Asm, *this, TheDie); + Expr.AddMachineRegPiece(Reg, SizeInBits, OffsetInBits); + return true; } /// addRegisterOffset - Add register offset. -void DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg, +bool DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset) { - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned DWReg = RI->getDwarfRegNum(Reg, false); - const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo(); - if (Reg == TRI->getFrameRegister(*Asm->MF)) - // If variable offset is based in frame register then use fbreg. - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg); - else if (DWReg < 32) - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg); - else { - addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(TheDie, dwarf::DW_FORM_udata, DWReg); - } - addSInt(TheDie, dwarf::DW_FORM_sdata, Offset); -} - -/// addAddress - Add an address attribute to a die based on the location -/// provided. -void DwarfUnit::addAddress(DIE &Die, dwarf::Attribute Attribute, - const MachineLocation &Location, bool Indirect) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc(); - - if (Location.isReg() && !Indirect) - addRegisterOp(*Loc, Location.getReg()); - else { - addRegisterOffset(*Loc, Location.getReg(), Location.getOffset()); - if (Indirect && !Location.isReg()) { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - } - } - - // Now attach the location information to the DIE. - addBlock(Die, Attribute, Loc); -} - -/// addComplexAddress - Start with the address based on the location provided, -/// and generate the DWARF information necessary to find the actual variable -/// given the extra address information encoded in the DbgVariable, starting -/// from the starting location. Add the DWARF information to the die. -/// -void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE &Die, - dwarf::Attribute Attribute, - const MachineLocation &Location) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc(); - unsigned N = DV.getNumAddrElements(); - unsigned i = 0; - if (Location.isReg()) { - if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) { - // If first address element is OpPlus then emit - // DW_OP_breg + Offset instead of DW_OP_reg + Offset. - addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1)); - i = 2; - } else - addRegisterOp(*Loc, Location.getReg()); - } else - addRegisterOffset(*Loc, Location.getReg(), Location.getOffset()); - - for (; i < N; ++i) { - uint64_t Element = DV.getAddrElement(i); - if (Element == DIBuilder::OpPlus) { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i)); - } else if (Element == DIBuilder::OpDeref) { - if (!Location.isReg()) - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - } else - llvm_unreachable("unknown DIBuilder Opcode"); - } - - // Now attach the location information to the DIE. 
- addBlock(Die, Attribute, Loc); + DIEDwarfExpression Expr(*Asm, *this, TheDie); + return Expr.AddMachineRegIndirect(Reg, Offset); } /* Byref variables, in Blocks, are declared by the programmer as "SomeType @@ -690,7 +507,7 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Find the __forwarding field and the variable field in the __Block_byref // struct. - DIArray Fields = blockStruct.getTypeArray(); + DIArray Fields = blockStruct.getElements(); DIDerivedType varField; DIDerivedType forwardingField; @@ -711,10 +528,14 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // variable's location. DIELoc *Loc = new (DIEValueAllocator) DIELoc(); + bool validReg; if (Location.isReg()) - addRegisterOp(*Loc, Location.getReg()); + validReg = addRegisterOpPiece(*Loc, Location.getReg()); else - addRegisterOffset(*Loc, Location.getReg(), Location.getOffset()); + validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset()); + + if (!validReg) + return; // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). @@ -752,13 +573,18 @@ static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) { dwarf::Tag T = (dwarf::Tag)Ty.getTag(); // Encode pointer constants as unsigned bytes. This is used at least for // null pointer constant emission. + // (Pieces of) aggregate types that get hacked apart by SROA may also be + // represented by a constant. Encode them as unsigned bytes. // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed // here, but accept them for now due to a bug in SROA producing bogus // dbg.values. - if (T == dwarf::DW_TAG_pointer_type || + if (T == dwarf::DW_TAG_array_type || + T == dwarf::DW_TAG_class_type || + T == dwarf::DW_TAG_pointer_type || T == dwarf::DW_TAG_ptr_to_member_type || T == dwarf::DW_TAG_reference_type || - T == dwarf::DW_TAG_rvalue_reference_type) + T == dwarf::DW_TAG_rvalue_reference_type || + T == dwarf::DW_TAG_structure_type) return true; assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type || T == dwarf::DW_TAG_volatile_type || @@ -779,11 +605,14 @@ static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) { Encoding == dwarf::DW_ATE_unsigned_char || Encoding == dwarf::DW_ATE_signed || Encoding == dwarf::DW_ATE_signed_char || - Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean) && + Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean || + (Ty.getTag() == dwarf::DW_TAG_unspecified_type && + Ty.getName() == "decltype(nullptr)")) && "Unsupported encoding"); return (Encoding == dwarf::DW_ATE_unsigned || Encoding == dwarf::DW_ATE_unsigned_char || - Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean); + Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean || + Ty.getTag() == dwarf::DW_TAG_unspecified_type); } /// If this type is derived from a base type then return base type size. @@ -1002,11 +831,9 @@ void DwarfUnit::updateAcceleratorTables(DIScope Context, DIType Ty, unsigned Flags = IsImplementation ? 
dwarf::DW_FLAG_type_implementation : 0; DD->addAccelType(Ty.getName(), TyDIE, Flags); - if ((!Context || Context.isCompileUnit() || Context.isFile() || - Context.isNameSpace()) && - getCUNode().getEmissionKind() != DIBuilder::LineTablesOnly) - GlobalTypes[getParentContextString(Context) + Ty.getName().str()] = - &TyDIE; + if (!Context || Context.isCompileUnit() || Context.isFile() || + Context.isNameSpace()) + addGlobalType(Ty, TyDIE, Context); } } @@ -1031,14 +858,6 @@ void DwarfUnit::addType(DIE &Entity, DIType Ty, dwarf::Attribute Attribute) { addDIEEntry(Entity, Attribute, Entry); } -/// addGlobalName - Add a new global name to the compile unit. -void DwarfUnit::addGlobalName(StringRef Name, DIE &Die, DIScope Context) { - if (getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly) - return; - std::string FullName = getParentContextString(Context) + Name.str(); - GlobalNames[FullName] = &Die; -} - /// getParentContextString - Walks the metadata parent chain in a language /// specific manner (using the compile unit language) and returns /// it as a string. This is done at the metadata level because DIEs may @@ -1117,7 +936,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) { addString(Buffer, dwarf::DW_AT_name, Name); // Add size if non-zero (derived types might be zero-sized.) - if (Size && Tag != dwarf::DW_TAG_pointer_type) + if (Size && Tag != dwarf::DW_TAG_pointer_type + && Tag != dwarf::DW_TAG_ptr_to_member_type) addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size); if (Tag == dwarf::DW_TAG_ptr_to_member_type) @@ -1129,16 +949,16 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) { } /// constructSubprogramArguments - Construct function argument DIEs. -void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DIArray Args) { +void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeArray Args) { for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) { - DIDescriptor Ty = Args.getElement(i); - if (Ty.isUnspecifiedParameter()) { + DIType Ty = resolve(Args.getElement(i)); + if (!Ty) { assert(i == N-1 && "Unspecified parameter must be the last argument"); createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer); } else { DIE &Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer); - addType(Arg, DIType(Ty)); - if (DIType(Ty).isArtificial()) + addType(Arg, Ty); + if (Ty.isArtificial()) addFlag(Arg, dwarf::DW_AT_artificial); } } @@ -1161,14 +981,14 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { break; case dwarf::DW_TAG_subroutine_type: { // Add return type. A void return won't have a type. - DIArray Elements = CTy.getTypeArray(); - DIType RTy(Elements.getElement(0)); + DITypeArray Elements = DISubroutineType(CTy).getTypeArray(); + DIType RTy(resolve(Elements.getElement(0))); if (RTy) addType(Buffer, RTy); bool isPrototyped = true; if (Elements.getNumElements() == 2 && - Elements.getElement(1).isUnspecifiedParameter()) + !Elements.getElement(1)) isPrototyped = false; constructSubprogramArguments(Buffer, Elements); @@ -1191,7 +1011,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { case dwarf::DW_TAG_union_type: case dwarf::DW_TAG_class_type: { // Add elements to structure type. 
- DIArray Elements = CTy.getTypeArray(); + DIArray Elements = CTy.getElements(); for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { DIDescriptor Element = Elements.getElement(i); if (Element.isSubprogram()) @@ -1250,6 +1070,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (CTy.isAppleBlockExtension()) addFlag(Buffer, dwarf::DW_AT_APPLE_block); + // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type + // inside C++ composite types to point to the base class with the vtable. DICompositeType ContainingType(resolve(CTy.getContainingType())); if (ContainingType) addDIEEntry(Buffer, dwarf::DW_AT_containing_type, @@ -1327,10 +1149,10 @@ DwarfUnit::constructTemplateValueParameterDIE(DIE &Buffer, addType(ParamDIE, resolve(VP.getType())); if (!VP.getName().empty()) addString(ParamDIE, dwarf::DW_AT_name, VP.getName()); - if (Value *Val = VP.getValue()) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) + if (Metadata *Val = VP.getValue()) { + if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val)) addConstantValue(ParamDIE, CI, resolve(VP.getType())); - else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) { + else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) { // For declaration non-type template parameters (such as global values and // functions) DIELoc *Loc = new (DIEValueAllocator) DIELoc(); @@ -1373,20 +1195,23 @@ DIE *DwarfUnit::getOrCreateNameSpace(DINameSpace NS) { } /// getOrCreateSubprogramDIE - Create new DIE using SP. -DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) { +DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP, bool Minimal) { // Construct the context before querying for the existence of the DIE in case // such construction creates the DIE (as is the case for member function // declarations). - DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext())); + DIE *ContextDIE = + Minimal ? &getUnitDie() : getOrCreateContextDIE(resolve(SP.getContext())); if (DIE *SPDie = getDIE(SP)) return SPDie; if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { - // Add subprogram definitions to the CU die directly. - ContextDIE = &getUnitDie(); - // Build the decl now to ensure it precedes the definition. - getOrCreateSubprogramDIE(SPDecl); + if (!Minimal) { + // Add subprogram definitions to the CU die directly. + ContextDIE = &getUnitDie(); + // Build the decl now to ensure it precedes the definition. + getOrCreateSubprogramDIE(SPDecl); + } } // DW_TAG_inlined_subroutine may refer to this DIE. @@ -1401,14 +1226,8 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) { return &SPDie; } -void DwarfUnit::applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie) { - DISubprogram SPDecl = SP.getFunctionDeclaration(); - DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext()); - applySubprogramAttributes(SP, SPDie); - addGlobalName(SP.getName(), SPDie, Context); -} - -void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { +bool DwarfUnit::applySubprogramDefinitionAttributes(DISubprogram SP, + DIE &SPDie) { DIE *DeclDie = nullptr; StringRef DeclLinkageName; if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { @@ -1431,17 +1250,29 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, GlobalValue::getRealLinkageName(LinkageName)); - if (DeclDie) { - // Refer to the function declaration where all the other attributes will be - // found. 
- addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie); - return; - } + if (!DeclDie) + return false; + + // Refer to the function declaration where all the other attributes will be + // found. + addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie); + return true; +} + +void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie, + bool Minimal) { + if (!Minimal) + if (applySubprogramDefinitionAttributes(SP, SPDie)) + return; // Constructors and operators for anonymous aggregates do not have names. if (!SP.getName().empty()) addString(SPDie, dwarf::DW_AT_name, SP.getName()); + // Skip the rest of the attributes under -gmlt to save space. + if (Minimal) + return; + addSourceLine(SPDie, SP); // Add the prototype if we have a prototype and we have a C like @@ -1452,15 +1283,15 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { Language == dwarf::DW_LANG_ObjC)) addFlag(SPDie, dwarf::DW_AT_prototyped); - DICompositeType SPTy = SP.getType(); + DISubroutineType SPTy = SP.getType(); assert(SPTy.getTag() == dwarf::DW_TAG_subroutine_type && "the type of a subprogram should be a subroutine"); - DIArray Args = SPTy.getTypeArray(); + DITypeArray Args = SPTy.getTypeArray(); // Add a return type. If this is a type like a C/C++ void type we don't add a // return type. - if (Args.getElement(0)) - addType(SPDie, DIType(Args.getElement(0))); + if (resolve(Args.getElement(0))) + addType(SPDie, DIType(resolve(Args.getElement(0)))); unsigned VK = SP.getVirtuality(); if (VK) { @@ -1506,7 +1337,7 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { else if (SP.isPrivate()) addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_private); - else + else if (SP.isPublic()) addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_public); @@ -1514,184 +1345,6 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) { addFlag(SPDie, dwarf::DW_AT_explicit); } -void DwarfUnit::applyVariableAttributes(const DbgVariable &Var, - DIE &VariableDie) { - StringRef Name = Var.getName(); - if (!Name.empty()) - addString(VariableDie, dwarf::DW_AT_name, Name); - addSourceLine(VariableDie, Var.getVariable()); - addType(VariableDie, Var.getType()); - if (Var.isArtificial()) - addFlag(VariableDie, dwarf::DW_AT_artificial); -} - -// Return const expression if value is a GEP to access merged global -// constant. e.g. -// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0) -static const ConstantExpr *getMergedGlobalExpr(const Value *V) { - const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V); - if (!CE || CE->getNumOperands() != 3 || - CE->getOpcode() != Instruction::GetElementPtr) - return nullptr; - - // First operand points to a global struct. - Value *Ptr = CE->getOperand(0); - if (!isa<GlobalValue>(Ptr) || - !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType())) - return nullptr; - - // Second operand is zero. - const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1)); - if (!CI || !CI->isZero()) - return nullptr; - - // Third operand is offset. - if (!isa<ConstantInt>(CE->getOperand(2))) - return nullptr; - - return CE; -} - -/// createGlobalVariableDIE - create global variable DIE. -void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) { - // Check for pre-existence. 
- if (getDIE(GV)) - return; - - assert(GV.isGlobalVariable()); - - DIScope GVContext = GV.getContext(); - DIType GTy = DD->resolve(GV.getType()); - - // If this is a static data member definition, some attributes belong - // to the declaration DIE. - DIE *VariableDIE = nullptr; - bool IsStaticMember = false; - DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration(); - if (SDMDecl.Verify()) { - assert(SDMDecl.isStaticMember() && "Expected static member decl"); - // We need the declaration DIE that is in the static member's class. - VariableDIE = getOrCreateStaticMemberDIE(SDMDecl); - IsStaticMember = true; - } - - // If this is not a static data member definition, create the variable - // DIE and add the initial set of attributes to it. - if (!VariableDIE) { - // Construct the context before querying for the existence of the DIE in - // case such construction creates the DIE. - DIE *ContextDIE = getOrCreateContextDIE(GVContext); - - // Add to map. - VariableDIE = &createAndAddDIE(GV.getTag(), *ContextDIE, GV); - - // Add name and type. - addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName()); - addType(*VariableDIE, GTy); - - // Add scoping info. - if (!GV.isLocalToUnit()) - addFlag(*VariableDIE, dwarf::DW_AT_external); - - // Add line number info. - addSourceLine(*VariableDIE, GV); - } - - // Add location. - bool addToAccelTable = false; - DIE *VariableSpecDIE = nullptr; - bool isGlobalVariable = GV.getGlobal() != nullptr; - if (isGlobalVariable) { - addToAccelTable = true; - DIELoc *Loc = new (DIEValueAllocator) DIELoc(); - const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal()); - if (GV.getGlobal()->isThreadLocal()) { - // FIXME: Make this work with -gsplit-dwarf. - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); - } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); - } - // 3) followed by a custom OP to make the debugger do a TLS lookup. - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address); - } else { - DD->addArangeLabel(SymbolCU(this, Sym)); - addOpAddress(*Loc, Sym); - } - // Do not create specification DIE if context is either compile unit - // or a subprogram. - if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() && - !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) { - // Create specification DIE. - VariableSpecDIE = &createAndAddDIE(dwarf::DW_TAG_variable, UnitDie); - addDIEEntry(*VariableSpecDIE, dwarf::DW_AT_specification, *VariableDIE); - addBlock(*VariableSpecDIE, dwarf::DW_AT_location, Loc); - // A static member's declaration is already flagged as such. - if (!SDMDecl.Verify()) - addFlag(*VariableDIE, dwarf::DW_AT_declaration); - } else { - addBlock(*VariableDIE, dwarf::DW_AT_location, Loc); - } - // Add the linkage name. 
- StringRef LinkageName = GV.getLinkageName();
- if (!LinkageName.empty())
- // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
- // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
- // TAG_variable.
- addString(IsStaticMember && VariableSpecDIE ? *VariableSpecDIE
- : *VariableDIE,
- DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
- : dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
- } else if (const ConstantInt *CI =
- dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
- // AT_const_value was added when the static member was created. To avoid
- // emitting AT_const_value multiple times, we only add AT_const_value when
- // it is not a static member.
- if (!IsStaticMember)
- addConstantValue(*VariableDIE, CI, GTy);
- } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
- addToAccelTable = true;
- // GV is a merged global.
- DIELoc *Loc = new (DIEValueAllocator) DIELoc();
- Value *Ptr = CE->getOperand(0);
- MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
- DD->addArangeLabel(SymbolCU(this, Sym));
- addOpAddress(*Loc, Sym);
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
- addUInt(*Loc, dwarf::DW_FORM_udata,
- Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
- addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
- }
-
- if (addToAccelTable) {
- DIE &AddrDIE = VariableSpecDIE ? *VariableSpecDIE : *VariableDIE;
- DD->addAccelName(GV.getName(), AddrDIE);
-
- // If the linkage name is different than the name, go ahead and output
- // that as well into the name table.
- if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
- DD->addAccelName(GV.getLinkageName(), AddrDIE);
- }
-
- addGlobalName(GV.getName(), VariableSpecDIE ? *VariableSpecDIE : *VariableDIE,
- GV.getContext());
-}
-
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
 DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
@@ -1700,9 +1353,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
 // The LowerBound value defines the lower bound, which is typically zero for
 // C/C++. The Count value is the number of elements. Values are 64 bit. If
 // Count == -1 then the array is unbounded and we do not emit
- // DW_AT_lower_bound and DW_AT_upper_bound attributes. If LowerBound == 0 and
- // Count == 0, then the array has zero elements in which case we do not emit
- // an upper bound.
+ // DW_AT_lower_bound and DW_AT_count attributes.
 int64_t LowerBound = SR.getLo();
 int64_t DefaultLowerBound = getDefaultLowerBound();
 int64_t Count = SR.getCount();
@@ -1710,11 +1361,22 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
 if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
 addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
- if (Count != -1 && Count != 0)
+ if (Count != -1)
 // FIXME: An unbounded array should reference the expression that defines
 // the array.
- addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, None,
- LowerBound + Count - 1);
+ addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
+}
+
+DIE *DwarfUnit::getIndexTyDie() {
+ if (IndexTyDie)
+ return IndexTyDie;
+ // Construct an integer type to use for indexes.
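A self-contained illustration of the subrange change above (assumptions: C semantics with a default lower bound of 0; the names here are mine, not the patch's):

#include <cassert>
#include <cstdint>

// Mirrors the new rule in constructSubrangeDIE: emit DW_AT_count unless the
// array is unbounded. The old Count == 0 special case is gone, so zero-length
// arrays now get an explicit DW_AT_count of 0.
static bool emitsCount(int64_t Count, int64_t &CountAttr) {
  if (Count == -1) // unbounded: no bound attributes at all
    return false;
  CountAttr = Count;
  return true;
}

int main() {
  int64_t C;
  assert(emitsCount(10, C) && C == 10); // int A[10]: count 10, was bound 9
  assert(emitsCount(0, C) && C == 0);   // zero-length array: now emitted
  assert(!emitsCount(-1, C));           // int B[]: unbounded, nothing emitted
}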
+ IndexTyDie = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie); + addString(*IndexTyDie, dwarf::DW_AT_name, "sizetype"); + addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t)); + addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, + dwarf::DW_ATE_unsigned); + return IndexTyDie; } /// constructArrayTypeDIE - Construct array type DIE from DICompositeType. @@ -1729,18 +1391,9 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) { // FIXME: This type should be passed down from the front end // as different languages may have different sizes for indexes. DIE *IdxTy = getIndexTyDie(); - if (!IdxTy) { - // Construct an integer type to use for indexes. - IdxTy = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie); - addString(*IdxTy, dwarf::DW_AT_name, "sizetype"); - addUInt(*IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int64_t)); - addUInt(*IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, - dwarf::DW_ATE_unsigned); - setIndexTyDie(IdxTy); - } // Add subranges to array type. - DIArray Elements = CTy.getTypeArray(); + DIArray Elements = CTy.getElements(); for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { DIDescriptor Element = Elements.getElement(i); if (Element.getTag() == dwarf::DW_TAG_subrange_type) @@ -1750,7 +1403,7 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) { /// constructEnumTypeDIE - Construct an enum type DIE from DICompositeType. void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) { - DIArray Elements = CTy.getTypeArray(); + DIArray Elements = CTy.getElements(); // Add enumerators to enumeration type. for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) { @@ -1788,68 +1441,6 @@ void DwarfUnit::constructContainingTypeDIEs() { } } -/// constructVariableDIE - Construct a DIE for the given DbgVariable. -std::unique_ptr<DIE> DwarfUnit::constructVariableDIE(DbgVariable &DV, - bool Abstract) { - auto D = constructVariableDIEImpl(DV, Abstract); - DV.setDIE(*D); - return D; -} - -std::unique_ptr<DIE> DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV, - bool Abstract) { - // Define variable debug information entry. - auto VariableDie = make_unique<DIE>(DV.getTag()); - - if (Abstract) { - applyVariableAttributes(DV, *VariableDie); - return VariableDie; - } - - // Add variable address. - - unsigned Offset = DV.getDotDebugLocOffset(); - if (Offset != ~0U) { - addLocationList(*VariableDie, dwarf::DW_AT_location, Offset); - return VariableDie; - } - - // Check if variable is described by a DBG_VALUE instruction. - if (const MachineInstr *DVInsn = DV.getMInsn()) { - assert(DVInsn->getNumOperands() == 3); - if (DVInsn->getOperand(0).isReg()) { - const MachineOperand RegOp = DVInsn->getOperand(0); - // If the second operand is an immediate, this is an indirect value. - if (DVInsn->getOperand(1).isImm()) { - MachineLocation Location(RegOp.getReg(), - DVInsn->getOperand(1).getImm()); - addVariableAddress(DV, *VariableDie, Location); - } else if (RegOp.getReg()) - addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg())); - } else if (DVInsn->getOperand(0).isImm()) - addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); - else if (DVInsn->getOperand(0).isFPImm()) - addConstantFPValue(*VariableDie, DVInsn->getOperand(0)); - else if (DVInsn->getOperand(0).isCImm()) - addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(), - DV.getType()); - - return VariableDie; - } - - // .. else use frame index. 
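The lazy construction added above replaces the inline build deleted from constructArrayTypeDIE in the same chunk, so every array type in the unit now shares one cached index-type DIE. A stripped-down, runnable sketch of the pattern (stand-in types only):

struct DIE { int Tag; };

struct UnitSketch {
  DIE *IndexTyDie = nullptr;
  DIE Storage{};

  DIE *getIndexTyDie() {
    if (!IndexTyDie) {    // built once, on the first array type encountered
      Storage.Tag = 0x24; // DW_TAG_base_type ("sizetype" in the real code)
      IndexTyDie = &Storage;
    }
    return IndexTyDie;    // later callers get the cached DIE
  }
};

int main() {
  UnitSketch U;
  return U.getIndexTyDie() == U.getIndexTyDie() ? 0 : 1; // same DIE both times
}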
- int FI = DV.getFrameIndex(); - if (FI != ~0) { - unsigned FrameReg = 0; - const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); - int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - MachineLocation Location(FrameReg, Offset); - addVariableAddress(DV, *VariableDie, Location); - } - - return VariableDie; -} - /// constructMemberDIE - Construct member DIE from DIDerivedType. void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { DIE &MemberDie = createAndAddDIE(DT.getTag(), Buffer); @@ -1922,7 +1513,7 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_private); // Otherwise C++ member and base classes are considered public. - else + else if (DT.isPublic()) addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_public); if (DT.isVirtual()) @@ -1971,7 +1562,7 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) { else if (DT.isPrivate()) addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_private); - else + else if (DT.isPublic()) addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, dwarf::DW_ACCESS_public); @@ -1984,6 +1575,10 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) { } void DwarfUnit::emitHeader(const MCSymbol *ASectionSym) const { + // Emit size of content not including length itself + Asm->OutStreamer.AddComment("Length of Unit"); + Asm->EmitInt32(getHeaderSize() + UnitDie.getSize()); + Asm->OutStreamer.AddComment("DWARF version number"); Asm->EmitInt16(DD->getDwarfVersion()); Asm->OutStreamer.AddComment("Offset Into Abbrev. Section"); @@ -1999,50 +1594,9 @@ void DwarfUnit::emitHeader(const MCSymbol *ASectionSym) const { Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); } -void DwarfUnit::addRange(RangeSpan Range) { - // Only add a range for this unit if we're emitting full debug. - if (getCUNode().getEmissionKind() == DIBuilder::FullDebug) { - // If we have no current ranges just add the range and return, otherwise, - // check the current section and CU against the previous section and CU we - // emitted into and the subprogram was contained within. If these are the - // same then extend our current range, otherwise add this as a new range. - if (CURanges.size() == 0 || - this != DD->getPrevCU() || - Asm->getCurrentSection() != DD->getPrevSection()) { - CURanges.push_back(Range); - return; - } - - assert(&(CURanges.back().getEnd()->getSection()) == - &(Range.getEnd()->getSection()) && - "We can only append to a range in the same section!"); - CURanges.back().setEnd(Range.getEnd()); - } -} - -void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) { - // Define start line table label for each Compile Unit. - MCSymbol *LineTableStartSym = - Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID()); - - stmtListIndex = UnitDie.getValues().size(); - - // DW_AT_stmt_list is a offset of line number information for this - // compile unit in debug_line section. For split dwarf this is - // left in the skeleton CU and so not included. - // The line table entries are not always emitted in assembly, so it - // is not okay to use line_table_start here. 
- if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym); - else - addSectionDelta(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym, - DwarfLineSectionSym); -} - -void DwarfCompileUnit::applyStmtList(DIE &D) { - D.addValue(dwarf::DW_AT_stmt_list, - UnitDie.getAbbrev().getData()[stmtListIndex].getForm(), - UnitDie.getValues()[stmtListIndex]); +void DwarfUnit::initSection(const MCSection *Section) { + assert(!this->Section); + this->Section = Section; } void DwarfTypeUnit::emitHeader(const MCSymbol *ASectionSym) const { @@ -2055,16 +1609,8 @@ void DwarfTypeUnit::emitHeader(const MCSymbol *ASectionSym) const { sizeof(Ty->getOffset())); } -void DwarfTypeUnit::initSection(const MCSection *Section) { - assert(!this->Section); - this->Section = Section; - // Since each type unit is contained in its own COMDAT section, the begin - // label and the section label are the same. Using the begin label emission in - // DwarfDebug to emit the section label as well is slightly subtle/sneaky, but - // the only other alternative of lazily constructing start-of-section labels - // and storing a mapping in DwarfDebug (or AsmPrinter). - this->SectionSym = this->LabelBegin = - Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); - this->LabelEnd = - Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); +bool DwarfTypeUnit::isDwoUnit() const { + // Since there are no skeleton type units, all type units are dwo type units + // when split DWARF is being used. + return DD->useSplitDwarf(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index b7b83b282d82..7a5e47ddf9dd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -11,20 +11,20 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H -#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H -#include "DIE.h" #include "DwarfDebug.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DIE.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCDwarf.h" namespace llvm { @@ -55,7 +55,8 @@ private: SmallVector<RangeSpan, 2> Ranges; public: - RangeSpanList(MCSymbol *Sym) : RangeSym(Sym) {} + RangeSpanList(MCSymbol *Sym, SmallVector<RangeSpan, 2> Ranges) + : RangeSym(Sym), Ranges(std::move(Ranges)) {} MCSymbol *getSym() const { return RangeSym; } const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; } void addRange(RangeSpan Range) { Ranges.push_back(Range); } @@ -96,12 +97,6 @@ protected: /// descriptors to debug information entries using a DIEEntry proxy. DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap; - /// GlobalNames - A map of globally visible named entities for this unit. - StringMap<const DIE *> GlobalNames; - - /// GlobalTypes - A map of globally visible types for this unit. - StringMap<const DIE *> GlobalTypes; - /// DIEBlocks - A list of all the DIEBlocks in use. std::vector<DIEBlock *> DIEBlocks; @@ -113,13 +108,6 @@ protected: /// corresponds to the MDNode mapped with the subprogram DIE. 
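A usage sketch for the two-argument RangeSpanList constructor introduced above. The label name, the index, and the begin/end symbols are illustrative assumptions, as is RangeSpan's two-symbol constructor:

SmallVector<RangeSpan, 2> Spans;
Spans.push_back(RangeSpan(FuncBeginSym, FuncEndSym));
// Ranges are built up front and moved into the list wholesale; the old
// pattern of default-constructing a RangeSpanList and appending through
// addRange() is no longer how lists are formed.
RangeSpanList List(Asm->GetTempSymbol("debug_ranges", RangeListIndex),
                   std::move(Spans));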
DenseMap<DIE *, const MDNode *> ContainingTypeMap; - // List of ranges for a given compile unit. - SmallVector<RangeSpan, 1> CURanges; - - // List of range lists for a given compile unit, separate from the ranges for - // the CU itself. - SmallVector<RangeSpanList, 1> CURangeLists; - // DIEValueAllocator - All DIEValues are allocated through this allocator. BumpPtrAllocator DIEValueAllocator; @@ -129,86 +117,32 @@ protected: /// The section this unit will be emitted in. const MCSection *Section; - /// A label at the start of the non-dwo section related to this unit. - MCSymbol *SectionSym; + DwarfUnit(unsigned UID, dwarf::Tag, DICompileUnit CU, AsmPrinter *A, + DwarfDebug *DW, DwarfFile *DWU); - /// The start of the unit within its section. - MCSymbol *LabelBegin; + void initSection(const MCSection *Section); - /// The end of the unit within its section. - MCSymbol *LabelEnd; + /// Add a string attribute data and value. + void addLocalString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - /// Skeleton unit associated with this unit. - DwarfUnit *Skeleton; + void addIndexedString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - DwarfUnit(unsigned UID, dwarf::Tag, DICompileUnit CU, AsmPrinter *A, - DwarfDebug *DW, DwarfFile *DWU); + bool applySubprogramDefinitionAttributes(DISubprogram SP, DIE &SPDie); public: virtual ~DwarfUnit(); - /// Set the skeleton unit associated with this unit. - void setSkeleton(DwarfUnit &Skel) { Skeleton = &Skel; } - - /// Get the skeleton unit associated with this unit. - DwarfUnit *getSkeleton() const { return Skeleton; } - - /// Pass in the SectionSym even though we could recreate it in every compile - /// unit (type units will have actually distinct symbols once they're in - /// comdat sections). - void initSection(const MCSection *Section, MCSymbol *SectionSym) { - assert(!this->Section); - this->Section = Section; - this->SectionSym = SectionSym; - this->LabelBegin = - Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); - this->LabelEnd = - Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); - } - const MCSection *getSection() const { assert(Section); return Section; } - /// If there's a skeleton then return the section symbol for the skeleton - /// unit, otherwise return the section symbol for this unit. - MCSymbol *getLocalSectionSym() const { - if (Skeleton) - return Skeleton->getSectionSym(); - return getSectionSym(); - } - - MCSymbol *getSectionSym() const { - assert(Section); - return SectionSym; - } - - /// If there's a skeleton then return the begin label for the skeleton unit, - /// otherwise return the local label for this unit. - MCSymbol *getLocalLabelBegin() const { - if (Skeleton) - return Skeleton->getLabelBegin(); - return getLabelBegin(); - } - - MCSymbol *getLabelBegin() const { - assert(Section); - return LabelBegin; - } - - MCSymbol *getLabelEnd() const { - assert(Section); - return LabelEnd; - } - // Accessors. 
+ AsmPrinter* getAsmPrinter() const { return Asm; } unsigned getUniqueID() const { return UniqueID; } uint16_t getLanguage() const { return CUNode.getLanguage(); } DICompileUnit getCUNode() const { return CUNode; } DIE &getUnitDie() { return UnitDie; } - const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; } - const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; } unsigned getDebugInfoOffset() const { return DebugInfoOffset; } void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; } @@ -216,29 +150,15 @@ public: /// hasContent - Return true if this compile unit has something to write out. bool hasContent() const { return !UnitDie.getChildren().empty(); } - /// addRange - Add an address range to the list of ranges for this unit. - void addRange(RangeSpan Range); - - /// getRanges - Get the list of ranges for this unit. - const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; } - SmallVectorImpl<RangeSpan> &getRanges() { return CURanges; } - - /// addRangeList - Add an address range list to the list of range lists. - void addRangeList(RangeSpanList Ranges) { CURangeLists.push_back(Ranges); } - - /// getRangeLists - Get the vector of range lists. - const SmallVectorImpl<RangeSpanList> &getRangeLists() const { - return CURangeLists; - } - SmallVectorImpl<RangeSpanList> &getRangeLists() { return CURangeLists; } - /// getParentContextString - Get a string containing the language specific /// context for a global name. std::string getParentContextString(DIScope Context) const; - /// addGlobalName - Add a new global entity to the compile unit. - /// - void addGlobalName(StringRef Name, DIE &Die, DIScope Context); + /// Add a new global name to the compile unit. + virtual void addGlobalName(StringRef Name, DIE &Die, DIScope Context) {} + + /// Add a new global type to the compile unit. + virtual void addGlobalType(DIType Ty, const DIE &Die, DIScope Context) {} /// addAccelNamespace - Add a new name to the namespace accelerator table. void addAccelNamespace(StringRef Name, const DIE &Die); @@ -275,14 +195,7 @@ public: void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer); /// addString - Add a string attribute data and value. - void addString(DIE &Die, dwarf::Attribute Attribute, const StringRef Str); - - /// addLocalString - Add a string attribute data and value. - void addLocalString(DIE &Die, dwarf::Attribute Attribute, - const StringRef Str); - - /// addExpr - Add a Dwarf expression attribute data and value. - void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr); + void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); /// addLabel - Add a Dwarf label attribute data and value. void addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, @@ -290,14 +203,6 @@ public: void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label); - /// addLocationList - Add a Dwarf loclistptr attribute data and value. - void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index); - - /// addSectionLabel - Add a Dwarf section label attribute data and value. - /// - void addSectionLabel(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label); - /// addSectionOffset - Add an offset into a section attribute data and value. /// void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer); @@ -306,10 +211,6 @@ public: /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index. 
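A sketch of how the new no-op virtuals above are meant to be overridden. This is an assumption on my part, consistent with the GlobalNames/GlobalTypes maps deleted from this class earlier in the diff: the compile unit takes those maps over, while type units inherit the empty defaults and contribute nothing to pubnames/pubtypes.

class DwarfCompileUnit : public DwarfUnit {
  StringMap<const DIE *> GlobalNames;
  StringMap<const DIE *> GlobalTypes;

public:
  // Only compile units feed the global name/type tables.
  void addGlobalName(StringRef Name, DIE &Die, DIScope Context) override {
    GlobalNames[getParentContextString(Context) + Name.str()] = &Die;
  }
  void addGlobalType(DIType Ty, const DIE &Die, DIScope Context) override {
    GlobalTypes[getParentContextString(Context) + Ty.getName().str()] = &Die;
  }
};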
void addOpAddress(DIELoc &Die, const MCSymbol *Label); - /// addSectionDelta - Add a label delta attribute data and value. - void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, - const MCSymbol *Lo); - /// addLabelDelta - Add a label delta attribute data and value. void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, const MCSymbol *Lo); @@ -339,11 +240,6 @@ public: void addSourceLine(DIE &Die, DINameSpace NS); void addSourceLine(DIE &Die, DIObjCProperty Ty); - /// addAddress - Add an address attribute to a die based on the location - /// provided. - void addAddress(DIE &Die, dwarf::Attribute Attribute, - const MachineLocation &Location, bool Indirect = false); - /// addConstantValue - Add constant value entry in variable DIE. void addConstantValue(DIE &Die, const MachineOperand &MO, DIType Ty); void addConstantValue(DIE &Die, const ConstantInt *CI, DIType Ty); @@ -358,19 +254,16 @@ public: /// addTemplateParams - Add template parameters in buffer. void addTemplateParams(DIE &Buffer, DIArray TParams); - /// addRegisterOp - Add register operand. - void addRegisterOp(DIELoc &TheDie, unsigned Reg); + /// \brief Add register operand. + /// \returns false if the register does not exist, e.g., because it was never + /// materialized. + bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg, + unsigned SizeInBits = 0, unsigned OffsetInBits = 0); - /// addRegisterOffset - Add register offset. - void addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset); - - /// addComplexAddress - Start with the address based on the location provided, - /// and generate the DWARF information necessary to find the actual variable - /// (navigating the extra location information encoded in the type) based on - /// the starting location. Add the DWARF information to the die. - void addComplexAddress(const DbgVariable &DV, DIE &Die, - dwarf::Attribute Attribute, - const MachineLocation &Location); + /// \brief Add register offset. + /// \returns false if the register does not exist, e.g., because it was never + /// materialized. + bool addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset); // FIXME: Should be reformulated in terms of addComplexAddress. /// addBlockByrefAddress - Start with the address based on the location @@ -382,11 +275,6 @@ public: dwarf::Attribute Attribute, const MachineLocation &Location); - /// addVariableAddress - Add DW_AT_location attribute for a - /// DbgVariable based on provided MachineLocation. - void addVariableAddress(const DbgVariable &DV, DIE &Die, - MachineLocation Location); - /// addType - Add a new type attribute to the specified entity. This takes /// and attribute parameter because DW_AT_friend attributes are also /// type references. @@ -397,11 +285,10 @@ public: DIE *getOrCreateNameSpace(DINameSpace NS); /// getOrCreateSubprogramDIE - Create new DIE using SP. - DIE *getOrCreateSubprogramDIE(DISubprogram SP); + DIE *getOrCreateSubprogramDIE(DISubprogram SP, bool Minimal = false); - void applySubprogramAttributes(DISubprogram SP, DIE &SPDie); - void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie); - void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie); + void applySubprogramAttributes(DISubprogram SP, DIE &SPDie, + bool Minimal = false); /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the /// given DIType. @@ -417,12 +304,8 @@ public: /// vtables. void constructContainingTypeDIEs(); - /// constructVariableDIE - Construct a DIE for the given DbgVariable. 
- std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV, - bool Abstract = false); - /// constructSubprogramArguments - Construct function argument DIEs. - void constructSubprogramArguments(DIE &Buffer, DIArray Args); + void constructSubprogramArguments(DIE &Buffer, DITypeArray Args); /// Create a DIE with the given Tag, add the DIE to its parent, and /// call insertDIE if MD is not null. @@ -453,12 +336,13 @@ protected: /// none currently exists, create a new ID and insert it in the line table. virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0; -private: - /// \brief Construct a DIE for the given DbgVariable without initializing the - /// DbgVariable's DIE reference. - std::unique_ptr<DIE> constructVariableDIEImpl(const DbgVariable &DV, - bool Abstract); + /// resolve - Look in the DwarfDebug map for the MDNode that + /// corresponds to the reference. + template <typename T> T resolve(DIRef<T> Ref) const { + return DD->resolve(Ref); + } +private: /// constructTypeDIE - Construct basic type die from DIBasicType. void constructTypeDIE(DIE &Buffer, DIBasicType BTy); @@ -503,7 +387,7 @@ private: } // getIndexTyDie - Get an anonymous type for index type. - DIE *getIndexTyDie() { return IndexTyDie; } + DIE *getIndexTyDie(); // setIndexTyDie - Set D as anonymous type for index which can be reused // later. @@ -513,56 +397,22 @@ private: /// information entry. DIEEntry *createDIEEntry(DIE &Entry); - /// resolve - Look in the DwarfDebug map for the MDNode that - /// corresponds to the reference. - template <typename T> T resolve(DIRef<T> Ref) const { - return DD->resolve(Ref); - } - /// If this is a named finished type then include it in the list of types for /// the accelerator tables. void updateAcceleratorTables(DIScope Context, DIType Ty, const DIE &TyDIE); -}; - -class DwarfCompileUnit : public DwarfUnit { - /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding - /// the need to search for it in applyStmtList. - unsigned stmtListIndex; - -public: - DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, - DwarfDebug *DW, DwarfFile *DWU); - - void initStmtList(MCSymbol *DwarfLineSectionSym); - - /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. - void applyStmtList(DIE &D); - - /// createGlobalVariableDIE - create global variable DIE. - void createGlobalVariableDIE(DIGlobalVariable GV); - - /// addLabelAddress - Add a dwarf label attribute data and value using - /// either DW_FORM_addr or DW_FORM_GNU_addr_index. - void addLabelAddress(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label); - /// addLocalLabelAddress - Add a dwarf label attribute data and value using - /// DW_FORM_addr only. 
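A usage note for the resolve<>() helper relocated in the hunk above: a DIRef<T> may hold either the node itself or a string type identifier (used for ODR-uniqued types), so it must be resolved through the map DwarfDebug owns before the node can be inspected. For example (illustrative):

DITypeRef Ref = DT.getTypeDerivedFrom(); // may be an MDString identifier
DIType Ty = resolve(Ref);                // concrete DIType, safe to inspect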
- void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, - const MCSymbol *Label); - - DwarfCompileUnit &getCU() override { return *this; } - - unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; + virtual bool isDwoUnit() const = 0; }; class DwarfTypeUnit : public DwarfUnit { -private: uint64_t TypeSignature; const DIE *Ty; DwarfCompileUnit &CU; MCDwarfDwoLineTable *SplitLineTable; + unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override; + bool isDwoUnit() const override; + public: DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, @@ -578,11 +428,8 @@ public: return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature sizeof(uint32_t); // Type DIE Offset } - void initSection(const MCSection *Section); + using DwarfUnit::initSection; DwarfCompileUnit &getCU() override { return CU; } - -protected: - unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override; }; } // end llvm namespace #endif diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 73f62bfd804e..1bc86f6c222a 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -121,7 +121,8 @@ computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads, for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) { int TypeID = TypeIds[J]; assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!"); - int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID; + int ValueForTypeID = + isFilterEHSelector(TypeID) ? FilterOffsets[-1 - TypeID] : TypeID; unsigned SizeTypeID = getSLEB128Size(ValueForTypeID); int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0; @@ -195,9 +196,22 @@ bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) { /// table. Entries must be ordered by try-range address. void EHStreamer:: computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites, - const RangeMapType &PadMap, const SmallVectorImpl<const LandingPadInfo *> &LandingPads, const SmallVectorImpl<unsigned> &FirstActions) { + // Invokes and nounwind calls have entries in PadMap (due to being bracketed + // by try-range labels when lowered). Ordinary calls do not, so appropriate + // try-ranges for them need be deduced so we can put them in the LSDA. + RangeMapType PadMap; + for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) { + const LandingPadInfo *LandingPad = LandingPads[i]; + for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) { + MCSymbol *BeginLabel = LandingPad->BeginLabels[j]; + assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!"); + PadRange P = { i, j }; + PadMap[BeginLabel] = P; + } + } + // The end label of the previous invoke or nounwind try-range. MCSymbol *LastLabel = nullptr; @@ -208,6 +222,8 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites, // Whether the last CallSite entry was for an invoke. bool PreviousIsInvoke = false; + bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj; + // Visit all instructions in order of address. for (const auto &MBB : *Asm->MF) { for (const auto &MI : MBB) { @@ -237,7 +253,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites, // instruction between the previous try-range and this one may throw, // create a call-site entry with no landing pad for the region between the // try-ranges. 
- if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+ if (SawPotentiallyThrowing && !IsSJLJ) {
 CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 };
 CallSites.push_back(Site);
 PreviousIsInvoke = false;
@@ -254,14 +270,14 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 CallSiteEntry Site = {
 BeginLabel,
 LastLabel,
- LandingPad->LandingPadLabel,
+ LandingPad,
 FirstActions[P.PadIndex]
 };
 // Try to merge with the previous call-site. SJLJ doesn't do this.
- if (PreviousIsInvoke && Asm->MAI->isExceptionHandlingDwarf()) {
+ if (PreviousIsInvoke && !IsSJLJ) {
 CallSiteEntry &Prev = CallSites.back();
- if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+ if (Site.LPad == Prev.LPad && Site.Action == Prev.Action) {
 // Extend the range of the previous entry.
 Prev.EndLabel = Site.EndLabel;
 continue;
@@ -269,7 +285,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 }
 // Otherwise, create a new call-site.
- if (Asm->MAI->isExceptionHandlingDwarf())
+ if (!IsSJLJ)
 CallSites.push_back(Site);
 else {
 // SjLj EH must maintain the call sites in the order assigned
@@ -287,7 +303,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 // If some instruction between the previous try-range and the end of the
 // function may throw, create a call-site entry with no landing pad for the
 // region following the try-range.
- if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+ if (SawPotentiallyThrowing && !IsSJLJ) {
 CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
 CallSites.push_back(Site);
 }
@@ -314,7 +330,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 /// 3. Type ID table contains references to all the C++ typeinfo for all
 /// catches in the function. This table is reverse indexed base 1.
 void EHStreamer::emitExceptionTable() {
- const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
 const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
 const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
@@ -338,23 +354,9 @@ void EHStreamer::emitExceptionTable() {
 unsigned SizeActions =
 computeActionsTable(LandingPads, Actions, FirstActions);
- // Invokes and nounwind calls have entries in PadMap (due to being bracketed
- // by try-range labels when lowered). Ordinary calls do not, so appropriate
- // try-ranges for them need be deduced when using DWARF exception handling.
- RangeMapType PadMap;
- for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
- const LandingPadInfo *LandingPad = LandingPads[i];
- for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
- MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
- assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
- PadRange P = { i, j };
- PadMap[BeginLabel] = P;
- }
- }
-
 // Compute the call-site table.
 SmallVector<CallSiteEntry, 64> CallSites;
- computeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
+ computeCallSiteTable(CallSites, LandingPads, FirstActions);
 // Final tallies.
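To make the merge rule in computeCallSiteTable concrete (it applies to Itanium-style tables only; SjLj must keep the call-site numbering assigned during lowering):

// Adjacent try-ranges that share a landing pad and first action collapse:
//   { Begin0, End0, LPadA, Action3 }
//   { End0,   End1, LPadA, Action3 }  // starts where the previous one ended
// become a single record:
//   { Begin0, End1, LPadA, Action3 }
// A different pad or action starts a fresh record. Note the comparison now
// uses the LandingPadInfo pointer (Site.LPad) rather than the pad's label.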
@@ -519,8 +521,7 @@ void EHStreamer::emitExceptionTable() { Asm->EmitULEB128(S.Action); } } else { - // DWARF Exception handling - assert(Asm->MAI->isExceptionHandlingDwarf()); + // Itanium LSDA exception handling // The call-site table is a list of all call sites that may throw an // exception (including C++ 'throw' statements) in the procedure @@ -576,15 +577,15 @@ void EHStreamer::emitExceptionTable() { // Offset of the landing pad, counted in 16-byte bundles relative to the // @LPStart address. - if (!S.PadLabel) { + if (!S.LPad) { if (VerboseAsm) Asm->OutStreamer.AddComment(" has no landing pad"); Asm->OutStreamer.EmitIntValue(0, 4/*size*/); } else { if (VerboseAsm) Asm->OutStreamer.AddComment(Twine(" jumps to ") + - S.PadLabel->getName()); - Asm->EmitLabelDifference(S.PadLabel, EHFuncBeginSym, 4); + S.LPad->LandingPadLabel->getName()); + Asm->EmitLabelDifference(S.LPad->LandingPadLabel, EHFuncBeginSym, 4); } // Offset of the first associated action record, relative to the start of @@ -649,7 +650,7 @@ void EHStreamer::emitExceptionTable() { } void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { - const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos(); + const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos(); const std::vector<unsigned> &FilterIds = MMI->getFilterIds(); bool VerboseAsm = Asm->OutStreamer.isVerboseAsm(); @@ -662,9 +663,9 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { Entry = TypeInfos.size(); } - for (std::vector<const GlobalVariable *>::const_reverse_iterator + for (std::vector<const GlobalValue *>::const_reverse_iterator I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) { - const GlobalVariable *GV = *I; + const GlobalValue *GV = *I; if (VerboseAsm) Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--)); Asm->EmitTTypeReference(GV, TTypeEncoding); @@ -681,7 +682,7 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { unsigned TypeID = *I; if (VerboseAsm) { --Entry; - if (TypeID != 0) + if (isFilterEHSelector(TypeID)) Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry)); } diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h index 2b6ba788e6fa..9b316ff00e9b 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.h +++ b/lib/CodeGen/AsmPrinter/EHStreamer.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H -#define LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H #include "AsmPrinterHandler.h" #include "llvm/ADT/DenseMap.h" @@ -23,6 +23,8 @@ class MachineModuleInfo; class MachineInstr; class MachineFunction; class AsmPrinter; +class MCSymbol; +class MCSymbolRefExpr; template <typename T> class SmallVectorImpl; @@ -60,11 +62,11 @@ protected: /// Structure describing an entry in the call-site table. struct CallSiteEntry { // The 'try-range' is BeginLabel .. EndLabel. - MCSymbol *BeginLabel; // zero indicates the start of the function. - MCSymbol *EndLabel; // zero indicates the end of the function. + MCSymbol *BeginLabel; // Null indicates the start of the function. + MCSymbol *EndLabel; // Null indicates the end of the function. - // The landing pad starts at PadLabel. - MCSymbol *PadLabel; // zero indicates that there is no landing pad. + // LPad contains the landing pad start labels. + const LandingPadInfo *LPad; // Null indicates that there is no landing pad. 
unsigned Action;
};
@@ -86,7 +88,6 @@ protected:
 /// form gaps in the table. Entries must be ordered by try-range address.
 void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
- const RangeMapType &PadMap,
 const SmallVectorImpl<const LandingPadInfo *> &LPs,
 const SmallVectorImpl<unsigned> &FirstActions);
@@ -113,6 +114,13 @@ protected:
 virtual void emitTypeInfos(unsigned TTypeEncoding);
+ // Helpers for identifying what kind of clause an EH typeid or selector
+ // corresponds to. Negative selectors are for filter clauses, the zero
+ // selector is for cleanups, and positive selectors are for catch clauses.
+ static bool isFilterEHSelector(int Selector) { return Selector < 0; }
+ static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
+ static bool isCatchEHSelector(int Selector) { return Selector > 0; }
+
 public:
 EHStreamer(AsmPrinter *A);
 virtual ~EHStreamer();
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index bfcbe6b94e7f..e293acd43a88 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
@@ -35,8 +36,8 @@ namespace {
 class ErlangGCPrinter : public GCMetadataPrinter {
 public:
- void beginAssembly(AsmPrinter &AP) override;
- void finishAssembly(AsmPrinter &AP) override;
+ void finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) override;
 };
}
@@ -46,11 +47,11 @@
X("erlang", "erlang-compatible garbage collector");
void llvm::linkErlangGCPrinter() { }
-void ErlangGCPrinter::beginAssembly(AsmPrinter &AP) { }
-
-void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
+void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
 MCStreamer &OS = AP.OutStreamer;
- unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+ unsigned IntPtrSize =
+ AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
 // Put this in a custom .note section.
 AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getContext()
@@ -58,9 +59,13 @@
 SectionKind::getDataRel()));
 // For each function...
- for (iterator FI = begin(), FE = end(); FI != FE; ++FI) {
+ for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(),
+ IE = Info.funcinfo_end();
+ FI != IE; ++FI) {
 GCFunctionInfo &MD = **FI;
-
+ if (MD.getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
 /** A compact GC layout.
Emit this data structure: * * struct { diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 5a9ecd794a8d..ddb14a057363 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <cctype> using namespace llvm; @@ -33,8 +34,10 @@ namespace { class OcamlGCMetadataPrinter : public GCMetadataPrinter { public: - void beginAssembly(AsmPrinter &AP) override; - void finishAssembly(AsmPrinter &AP) override; + void beginAssembly(Module &M, GCModuleInfo &Info, + AsmPrinter &AP) override; + void finishAssembly(Module &M, GCModuleInfo &Info, + AsmPrinter &AP) override; }; } @@ -66,12 +69,13 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) { AP.OutStreamer.EmitLabel(Sym); } -void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) { +void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info, + AsmPrinter &AP) { AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection()); - EmitCamlGlobal(getModule(), AP, "code_begin"); + EmitCamlGlobal(M, AP, "code_begin"); AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection()); - EmitCamlGlobal(getModule(), AP, "data_begin"); + EmitCamlGlobal(M, AP, "data_begin"); } /// emitAssembly - Print the frametable. The ocaml frametable format is thus: @@ -90,24 +94,30 @@ void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) { /// (FrameSize and LiveOffsets would overflow). FrameTablePrinter will abort if /// either condition is detected in a function which uses the GC. /// -void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) { - unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize(); +void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, + AsmPrinter &AP) { + unsigned IntPtrSize = + AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize(); AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection()); - EmitCamlGlobal(getModule(), AP, "code_end"); + EmitCamlGlobal(M, AP, "code_end"); AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection()); - EmitCamlGlobal(getModule(), AP, "data_end"); + EmitCamlGlobal(M, AP, "data_end"); // FIXME: Why does ocaml emit this?? AP.OutStreamer.EmitIntValue(0, IntPtrSize); AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection()); - EmitCamlGlobal(getModule(), AP, "frametable"); + EmitCamlGlobal(M, AP, "frametable"); int NumDescriptors = 0; - for (iterator I = begin(), IE = end(); I != IE; ++I) { + for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), + IE = Info.funcinfo_end(); I != IE; ++I) { GCFunctionInfo &FI = **I; + if (FI.getStrategy().getName() != getStrategy().getName()) + // this function is managed by some other GC + continue; for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) { NumDescriptors++; } @@ -120,8 +130,12 @@ void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) { AP.EmitInt16(NumDescriptors); AP.EmitAlignment(IntPtrSize == 4 ? 
2 : 3); - for (iterator I = begin(), IE = end(); I != IE; ++I) { + for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), + IE = Info.funcinfo_end(); I != IE; ++I) { GCFunctionInfo &FI = **I; + if (FI.getStrategy().getName() != getStrategy().getName()) + // this function is managed by some other GC + continue; uint64_t FrameSize = FI.getFrameSize(); if (FrameSize >= 1<<16) { diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp index 81285d55d636..2138cb9514f8 100644 --- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp +++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "DwarfException.h" +#include "Win64Exception.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" @@ -99,9 +99,156 @@ void Win64Exception::endFunction(const MachineFunction *) { if (shouldEmitPersonality) { Asm->OutStreamer.PushSection(); + + // Emit an UNWIND_INFO struct describing the prologue. Asm->OutStreamer.EmitWinEHHandlerData(); - emitExceptionTable(); + + // Emit either MSVC-compatible tables or the usual Itanium-style LSDA after + // the UNWIND_INFO struct. + if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::MSVC) { + const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()]; + if (Per->getName() == "__C_specific_handler") + emitCSpecificHandlerTable(); + else + report_fatal_error(Twine("unexpected personality function: ") + + Per->getName()); + } else { + emitExceptionTable(); + } + Asm->OutStreamer.PopSection(); } Asm->OutStreamer.EmitWinCFIEndProc(); } + +const MCSymbolRefExpr *Win64Exception::createImageRel32(const MCSymbol *Value) { + return MCSymbolRefExpr::Create(Value, MCSymbolRefExpr::VK_COFF_IMGREL32, + Asm->OutContext); +} + +/// Emit the language-specific data that __C_specific_handler expects. This +/// handler lives in the x64 Microsoft C runtime and allows catching or cleaning +/// up after faults with __try, __except, and __finally. The typeinfo values +/// are not really RTTI data, but pointers to filter functions that return an +/// integer (1, 0, or -1) indicating how to handle the exception. For __finally +/// blocks and other cleanups, the landing pad label is zero, and the filter +/// function is actually a cleanup handler with the same prototype. A catch-all +/// entry is modeled with a null filter function field and a non-zero landing +/// pad label. +/// +/// Possible filter function return values: +/// EXCEPTION_EXECUTE_HANDLER (1): +/// Jump to the landing pad label after cleanups. +/// EXCEPTION_CONTINUE_SEARCH (0): +/// Continue searching this table or continue unwinding. +/// EXCEPTION_CONTINUE_EXECUTION (-1): +/// Resume execution at the trapping PC. +/// +/// Inferred table structure: +/// struct Table { +/// int NumEntries; +/// struct Entry { +/// imagerel32 LabelStart; +/// imagerel32 LabelEnd; +/// imagerel32 FilterOrFinally; // Zero means catch-all. +/// imagerel32 LabelLPad; // Zero means __finally. +/// } Entries[NumEntries]; +/// }; +void Win64Exception::emitCSpecificHandlerTable() { + const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); + + // Simplifying assumptions for first implementation: + // - Cleanups are not implemented. + // - Filters are not implemented. + + // The Itanium LSDA table sorts similar landing pads together to simplify the + // actions table, but we don't need that. 
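A concrete rendering of one table entry from the structure documented above, for a hypothetical single guarded region (assembly syntax approximate; the names are mine):

// C source:  __try { MayFault(); } __except (MyFilter(...)) { Recover(); }
//
// Emitted table (one entry):
//   .long 1                  // NumEntries
//   .long .Lbegin@IMGREL     // LabelStart
//   .long .Lend@IMGREL + 1   // LabelEnd: ranges are half-open, hence the +1
//   .long MyFilter@IMGREL    // FilterOrFinally; 0 would mean catch-all
//   .long .Llpad@IMGREL      // LabelLPad; 0 would mean __finally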
+ SmallVector<const LandingPadInfo *, 64> LandingPads; + LandingPads.reserve(PadInfos.size()); + for (const auto &LP : PadInfos) + LandingPads.push_back(&LP); + + // Compute label ranges for call sites as we would for the Itanium LSDA, but + // use an all zero action table because we aren't using these actions. + SmallVector<unsigned, 64> FirstActions; + FirstActions.resize(LandingPads.size()); + SmallVector<CallSiteEntry, 64> CallSites; + computeCallSiteTable(CallSites, LandingPads, FirstActions); + + MCSymbol *EHFuncBeginSym = + Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber()); + MCSymbol *EHFuncEndSym = + Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber()); + + // Emit the number of table entries. + unsigned NumEntries = 0; + for (const CallSiteEntry &CSE : CallSites) { + if (!CSE.LPad) + continue; // Ignore gaps. + for (int Selector : CSE.LPad->TypeIds) { + // Ignore C++ filter clauses in SEH. + // FIXME: Implement cleanup clauses. + if (isCatchEHSelector(Selector)) + ++NumEntries; + } + } + Asm->OutStreamer.EmitIntValue(NumEntries, 4); + + // Emit the four-label records for each call site entry. The table has to be + // sorted in layout order, and the call sites should already be sorted. + for (const CallSiteEntry &CSE : CallSites) { + // Ignore gaps. Unlike the Itanium model, unwinding through a frame without + // an EH table entry will propagate the exception rather than terminating + // the program. + if (!CSE.LPad) + continue; + const LandingPadInfo *LPad = CSE.LPad; + + // Compute the label range. We may reuse the function begin and end labels + // rather than forming new ones. + const MCExpr *Begin = + createImageRel32(CSE.BeginLabel ? CSE.BeginLabel : EHFuncBeginSym); + const MCExpr *End; + if (CSE.EndLabel) { + // The interval is half-open, so we have to add one to include the return + // address of the last invoke in the range. + End = MCBinaryExpr::CreateAdd(createImageRel32(CSE.EndLabel), + MCConstantExpr::Create(1, Asm->OutContext), + Asm->OutContext); + } else { + End = createImageRel32(EHFuncEndSym); + } + + // These aren't really type info globals, they are actually pointers to + // filter functions ordered by selector. The zero selector is used for + // cleanups, so slot zero corresponds to selector 1. + const std::vector<const GlobalValue *> &SelectorToFilter = MMI->getTypeInfos(); + + // Do a parallel iteration across typeids and clause labels, skipping filter + // clauses. + assert(LPad->TypeIds.size() == LPad->ClauseLabels.size()); + for (size_t I = 0, E = LPad->TypeIds.size(); I < E; ++I) { + // AddLandingPadInfo stores the clauses in reverse, but there is a FIXME + // to change that. + int Selector = LPad->TypeIds[E - I - 1]; + MCSymbol *ClauseLabel = LPad->ClauseLabels[I]; + + // Ignore C++ filter clauses in SEH. + // FIXME: Implement cleanup clauses. + if (!isCatchEHSelector(Selector)) + continue; + + Asm->OutStreamer.EmitValue(Begin, 4); + Asm->OutStreamer.EmitValue(End, 4); + if (isCatchEHSelector(Selector)) { + assert(unsigned(Selector - 1) < SelectorToFilter.size()); + const GlobalValue *TI = SelectorToFilter[Selector - 1]; + if (TI) // Emit the filter function pointer. + Asm->OutStreamer.EmitValue(createImageRel32(Asm->getSymbol(TI)), 4); + else // Otherwise, this is a "catch i8* null", or catch all. 
+ Asm->OutStreamer.EmitIntValue(0, 4); + } + Asm->OutStreamer.EmitValue(createImageRel32(ClauseLabel), 4); + } + } +} diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.h b/lib/CodeGen/AsmPrinter/Win64Exception.h new file mode 100644 index 000000000000..b2d5d1bce563 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/Win64Exception.h @@ -0,0 +1,56 @@ +//===-- Win64Exception.h - Windows Exception Handling ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for writing windows exception info into asm files. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H + +#include "EHStreamer.h" + +namespace llvm { +class MachineFunction; + +class Win64Exception : public EHStreamer { + /// Per-function flag to indicate if personality info should be emitted. + bool shouldEmitPersonality; + + /// Per-function flag to indicate if the LSDA should be emitted. + bool shouldEmitLSDA; + + /// Per-function flag to indicate if frame moves info should be emitted. + bool shouldEmitMoves; + + void emitCSpecificHandlerTable(); + + const MCSymbolRefExpr *createImageRel32(const MCSymbol *Value); + +public: + //===--------------------------------------------------------------------===// + // Main entry points. + // + Win64Exception(AsmPrinter *A); + virtual ~Win64Exception(); + + /// Emit all exception information that should come after the content. + void endModule() override; + + /// Gather pre-function exception information. Assumes being emitted + /// immediately after the function entry point. + void beginFunction(const MachineFunction *MF) override; + + /// Gather and emit post-function exception information. + void endFunction(const MachineFunction *) override; +}; +} + +#endif + diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 6a5c431d4fd1..b5e0929efd91 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -116,15 +116,67 @@ WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP) Asm = AP; } +void WinCodeViewLineTables::endModule() { + if (FnDebugInfo.empty()) + return; + + assert(Asm != nullptr); + Asm->OutStreamer.SwitchSection( + Asm->getObjFileLowering().getCOFFDebugSymbolsSection()); + Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC); + + // The COFF .debug$S section consists of several subsections, each starting + // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length + // of the payload followed by the payload itself. The subsections are 4-byte + // aligned. + + // Emit per-function debug information. This code is extracted into a + // separate function for readability. + for (size_t I = 0, E = VisitedFunctions.size(); I != E; ++I) + emitDebugInfoForFunction(VisitedFunctions[I]); + + // This subsection holds a file index to offset in string table table. 
+ Asm->OutStreamer.AddComment("File index to string table offset subsection"); + Asm->EmitInt32(COFF::DEBUG_INDEX_SUBSECTION); + size_t NumFilenames = FileNameRegistry.Infos.size(); + Asm->EmitInt32(8 * NumFilenames); + for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) { + StringRef Filename = FileNameRegistry.Filenames[I]; + // For each unique filename, just write its offset in the string table. + Asm->EmitInt32(FileNameRegistry.Infos[Filename].StartOffset); + // The function name offset is not followed by any additional data. + Asm->EmitInt32(0); + } + + // This subsection holds the string table. + Asm->OutStreamer.AddComment("String table"); + Asm->EmitInt32(COFF::DEBUG_STRING_TABLE_SUBSECTION); + Asm->EmitInt32(FileNameRegistry.LastOffset); + // The payload starts with a null character. + Asm->EmitInt8(0); + + for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) { + // Just emit unique filenames one by one, separated by a null character. + Asm->OutStreamer.EmitBytes(FileNameRegistry.Filenames[I]); + Asm->EmitInt8(0); + } + + // No more subsections. Fill with zeros to align the end of the section by 4. + Asm->OutStreamer.EmitFill((-FileNameRegistry.LastOffset) % 4, 0); + + clear(); +} + static void EmitLabelDiff(MCStreamer &Streamer, - const MCSymbol *From, const MCSymbol *To) { + const MCSymbol *From, const MCSymbol *To, + unsigned int Size = 4) { MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; MCContext &Context = Streamer.getContext(); const MCExpr *FromRef = MCSymbolRefExpr::Create(From, Variant, Context), *ToRef = MCSymbolRefExpr::Create(To, Variant, Context); const MCExpr *AddrDelta = MCBinaryExpr::Create(MCBinaryExpr::Sub, ToRef, FromRef, Context); - Streamer.EmitValue(AddrDelta, 4); + Streamer.EmitValue(AddrDelta, Size); } void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { @@ -138,6 +190,51 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { return; assert(FI.End && "Don't know where the function ends?"); + StringRef FuncName = getDISubprogram(GV).getDisplayName(), + GVName = GV->getName(); + // FIXME Clang currently sets DisplayName to "bar" for a C++ + // "namespace_foo::bar" function, see PR21528. Luckily, dbghelp.dll is trying + // to demangle display names anyways, so let's just put a mangled name into + // the symbols subsection until Clang gives us what we need. + if (GVName.startswith("\01?")) + FuncName = GVName.substr(1); + // Emit a symbol subsection, required by VS2012+ to find function boundaries. + MCSymbol *SymbolsBegin = Asm->MMI->getContext().CreateTempSymbol(), + *SymbolsEnd = Asm->MMI->getContext().CreateTempSymbol(); + Asm->OutStreamer.AddComment("Symbol subsection for " + Twine(FuncName)); + Asm->EmitInt32(COFF::DEBUG_SYMBOL_SUBSECTION); + EmitLabelDiff(Asm->OutStreamer, SymbolsBegin, SymbolsEnd); + Asm->OutStreamer.EmitLabel(SymbolsBegin); + { + MCSymbol *ProcSegmentBegin = Asm->MMI->getContext().CreateTempSymbol(), + *ProcSegmentEnd = Asm->MMI->getContext().CreateTempSymbol(); + EmitLabelDiff(Asm->OutStreamer, ProcSegmentBegin, ProcSegmentEnd, 2); + Asm->OutStreamer.EmitLabel(ProcSegmentBegin); + + Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_START); + // Some bytes of this segment don't seem to be required for basic debugging, + // so just fill them with zeroes. 
+ Asm->OutStreamer.EmitFill(12, 0);
+ // This is the important bit that tells the debugger where the function
+ // code is located and what its size is:
+ EmitLabelDiff(Asm->OutStreamer, Fn, FI.End);
+ Asm->OutStreamer.EmitFill(12, 0);
+ Asm->OutStreamer.EmitCOFFSecRel32(Fn);
+ Asm->OutStreamer.EmitCOFFSectionIndex(Fn);
+ Asm->EmitInt8(0);
+ // Emit the function display name as a null-terminated string.
+ Asm->OutStreamer.EmitBytes(FuncName);
+ Asm->EmitInt8(0);
+ Asm->OutStreamer.EmitLabel(ProcSegmentEnd);
+
+ // We're done with this function.
+ Asm->EmitInt16(0x0002);
+ Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_END);
+ }
+ Asm->OutStreamer.EmitLabel(SymbolsEnd);
+ // Every subsection must be aligned to a 4-byte boundary.
+ Asm->OutStreamer.EmitFill((-FuncName.size()) % 4, 0);
+
 // PCs/Instructions are grouped into segments sharing the same filename.
 // Pre-calculate the lengths (in instructions) of these segments and store
 // them in a map for convenience. Each index in the map is the sequential
@@ -154,18 +251,19 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
 }
 FilenameSegmentLengths[LastSegmentEnd] = FI.Instrs.size() - LastSegmentEnd;
- // Emit the control code of the subsection followed by the payload size.
- Asm->OutStreamer.AddComment(
- "Linetable subsection for " + Twine(Fn->getName()));
+ // Emit a line table subsection, required to do PC-to-file:line lookup.
+ Asm->OutStreamer.AddComment("Line table subsection for " + Twine(FuncName));
 Asm->EmitInt32(COFF::DEBUG_LINE_TABLE_SUBSECTION);
- MCSymbol *SubsectionBegin = Asm->MMI->getContext().CreateTempSymbol(),
- *SubsectionEnd = Asm->MMI->getContext().CreateTempSymbol();
- EmitLabelDiff(Asm->OutStreamer, SubsectionBegin, SubsectionEnd);
- Asm->OutStreamer.EmitLabel(SubsectionBegin);
+ MCSymbol *LineTableBegin = Asm->MMI->getContext().CreateTempSymbol(),
+ *LineTableEnd = Asm->MMI->getContext().CreateTempSymbol();
+ EmitLabelDiff(Asm->OutStreamer, LineTableBegin, LineTableEnd);
+ Asm->OutStreamer.EmitLabel(LineTableBegin);
 // Identify the function this subsection is for.
 Asm->OutStreamer.EmitCOFFSecRel32(Fn);
 Asm->OutStreamer.EmitCOFFSectionIndex(Fn);
+ // Insert padding after a 16-bit section index.
+ Asm->EmitInt16(0);
 // Length of the function's code, in bytes.
 EmitLabelDiff(Asm->OutStreamer, Fn, FI.End);
@@ -209,56 +307,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
 if (FileSegmentEnd)
 Asm->OutStreamer.EmitLabel(FileSegmentEnd);
- Asm->OutStreamer.EmitLabel(SubsectionEnd);
-}
-
-void WinCodeViewLineTables::endModule() {
- if (FnDebugInfo.empty())
- return;
-
- assert(Asm != nullptr);
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
- Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC);
-
- // The COFF .debug$S section consists of several subsections, each starting
- // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
- // of the payload followed by the payload itself. The subsections are 4-byte
- // aligned.
-
- for (size_t I = 0, E = VisitedFunctions.size(); I != E; ++I)
- emitDebugInfoForFunction(VisitedFunctions[I]);
-
- // This subsection holds a file index to offset in string table table.
- Asm->OutStreamer.AddComment("File index to string table offset subsection"); - Asm->EmitInt32(COFF::DEBUG_INDEX_SUBSECTION); - size_t NumFilenames = FileNameRegistry.Infos.size(); - Asm->EmitInt32(8 * NumFilenames); - for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) { - StringRef Filename = FileNameRegistry.Filenames[I]; - // For each unique filename, just write it's offset in the string table. - Asm->EmitInt32(FileNameRegistry.Infos[Filename].StartOffset); - // The function name offset is not followed by any additional data. - Asm->EmitInt32(0); - } - - // This subsection holds the string table. - Asm->OutStreamer.AddComment("String table"); - Asm->EmitInt32(COFF::DEBUG_STRING_TABLE_SUBSECTION); - Asm->EmitInt32(FileNameRegistry.LastOffset); - // The payload starts with a null character. - Asm->EmitInt8(0); - - for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) { - // Just emit unique filenames one by one, separated by a null character. - Asm->OutStreamer.EmitBytes(FileNameRegistry.Filenames[I]); - Asm->EmitInt8(0); - } - - // No more subsections. Fill with zeros to align the end of the section by 4. - Asm->OutStreamer.EmitFill((-FileNameRegistry.LastOffset) % 4, 0); - - clear(); + Asm->OutStreamer.EmitLabel(LineTableEnd); } void WinCodeViewLineTables::beginFunction(const MachineFunction *MF) { diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index 0734d97ba714..8492eacb7ff7 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H__ -#define CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H__ +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H #include "AsmPrinterHandler.h" #include "llvm/ADT/DenseMap.h" diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp deleted file mode 100644 index 421946ded40b..000000000000 --- a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp +++ /dev/null @@ -1,380 +0,0 @@ -//===-- AtomicExpandLoadLinkedPass.cpp - Expand atomic instructions -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions with -// appropriate (intrinsic-based) ldrex/strex loops. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm-atomic-expand" - -namespace { - class AtomicExpandLoadLinked : public FunctionPass { - const TargetMachine *TM; - public: - static char ID; // Pass identification, replacement for typeid - explicit AtomicExpandLoadLinked(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM) { - initializeAtomicExpandLoadLinkedPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool expandAtomicLoad(LoadInst *LI); - bool expandAtomicStore(StoreInst *LI); - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); - - AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - }; -} - -char AtomicExpandLoadLinked::ID = 0; -char &llvm::AtomicExpandLoadLinkedID = AtomicExpandLoadLinked::ID; -INITIALIZE_TM_PASS(AtomicExpandLoadLinked, "atomic-ll-sc", - "Expand Atomic calls in terms of load-linked & store-conditional", - false, false) - -FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) { - return new AtomicExpandLoadLinked(TM); -} - -bool AtomicExpandLoadLinked::runOnFunction(Function &F) { - if (!TM || !TM->getSubtargetImpl()->enableAtomicExpandLoadLinked()) - return false; - - SmallVector<Instruction *, 1> AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) || - (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) || - (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!TM->getTargetLowering()->shouldExpandAtomicInIR(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - MadeChange |= expandAtomicRMW(AI); - else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst)) - MadeChange |= expandAtomicCmpXchg(CI); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - MadeChange |= expandAtomicLoad(LI); - else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - MadeChange |= expandAtomicStore(SI); - else - llvm_unreachable("Unknown atomic instruction"); - } - - return MadeChange; -} - -bool AtomicExpandLoadLinked::expandAtomicLoad(LoadInst *LI) { - // Load instructions don't actually need a leading fence, even in the - // SequentiallyConsistent case. - AtomicOrdering MemOpOrder = - TM->getTargetLowering()->getInsertFencesForAtomic() ? Monotonic - : LI->getOrdering(); - - // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is - // an ldrexd (A3.5.3). 
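The same single-copy-atomicity constraint is what a C++ toolchain faces: on 32-bit ARM a 64-bit std::atomic load cannot be a plain load pair and is typically emitted as an ldrexd-based sequence, much like the expansion this deleted pass performed. A small illustrative sketch (nothing here is from the patch):

#include <atomic>

// On 32-bit ARM this load generally lowers to an ldrexd-based sequence,
// since an ordinary 64-bit load pair is not single-copy atomic.
long long load64(const std::atomic<long long> &A) {
  return A.load(std::memory_order_acquire);
}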
- IRBuilder<> Builder(LI); - Value *Val = TM->getTargetLowering()->emitLoadLinked( - Builder, LI->getPointerOperand(), MemOpOrder); - - insertTrailingFence(Builder, LI->getOrdering()); - - LI->replaceAllUsesWith(Val); - LI->eraseFromParent(); - - return true; -} - -bool AtomicExpandLoadLinked::expandAtomicStore(StoreInst *SI) { - // The only atomic 64-bit store on ARM is an strexd that succeeds, which means - // we need a loop and the entire instruction is essentially an "atomicrmw - // xchg" that ignores the value loaded. - IRBuilder<> Builder(SI); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), SI->getOrdering()); - SI->eraseFromParent(); - - // Now we have an appropriate swap instruction, lower it as usual. - return expandAtomicRMW(AI); -} - -bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // fence? - // atomicrmw.start: - // %loaded = @load.linked(%addr) - // %new = some_op iN %loaded, %incr - // %stored = @store_conditional(%new, %addr) - // %try_again = icmp i32 ne %stored, 0 - // br i1 %try_again, label %loop, label %atomicrmw.end - // atomicrmw.end: - // fence? - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we might want a fence too. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. 
- Builder.SetInsertPoint(LoopBB); - Value *Loaded = - TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder); - - Value *NewVal; - switch (AI->getOperation()) { - case AtomicRMWInst::Xchg: - NewVal = AI->getValOperand(); - break; - case AtomicRMWInst::Add: - NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Sub: - NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::And: - NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Nand: - NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()), - "new"); - break; - case AtomicRMWInst::Or: - NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Xor: - NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - default: - llvm_unreachable("Unknown atomic op"); - } - - Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional( - Builder, NewVal, Addr, MemOpOrder); - Value *TryAgain = Builder.CreateICmpNE( - StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); - - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - insertTrailingFence(Builder, Order); - - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); - - return true; -} - -bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { - AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); - AtomicOrdering FailureOrder = CI->getFailureOrdering(); - Value *Addr = CI->getPointerOperand(); - BasicBlock *BB = CI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord - // - // The full expansion we produce is: - // [...] - // fence? - // cmpxchg.start: - // %loaded = @load.linked(%addr) - // %should_store = icmp eq %loaded, %desired - // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.failure - // cmpxchg.trystore: - // %stored = @store_conditional(%new, %addr) - // %success = icmp eq i32 %stored, 0 - // br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure - // cmpxchg.success: - // fence? - // br label %cmpxchg.end - // cmpxchg.failure: - // fence? - // br label %cmpxchg.end - // cmpxchg.end: - // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure] - // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 - // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 - // [...] 
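The diagram above is the IR analogue of building a strong compare-and-swap out of a primitive that may fail spuriously. The same shape in portable C++, as a hedged sketch (strongCAS is a hypothetical helper; compare_exchange_weak stands in for the store-conditional):

#include <atomic>

// Illustrative only: a strong CAS built from a weak one.
template <typename T>
bool strongCAS(std::atomic<T> &A, T &Expected, T Desired) {
  for (;;) {                                    // cmpxchg.start
    T Loaded = Expected;
    if (A.compare_exchange_weak(Loaded, Desired))
      return true;                              // cmpxchg.success
    if (Loaded != Expected) {                   // the value really differed
      Expected = Loaded;
      return false;                             // cmpxchg.failure
    }
    // Spurious store-conditional failure: loop again, matching the strong
    // cmpxchg branch from the trystore block back to the loop header.
  }
}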
- BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); - auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); - auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB); - auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB); - auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); - - // This grabs the DebugLoc from CI - IRBuilder<> Builder(CI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we might want a fence too. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - Value *Loaded = - TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *ShouldStore = - Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store"); - - // If the the cmpxchg doesn't actually need any ordering when it fails, we can - // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); - - Builder.SetInsertPoint(TryStoreBB); - Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional( - Builder, CI->getNewValOperand(), Addr, MemOpOrder); - StoreSuccess = Builder.CreateICmpEQ( - StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); - Builder.CreateCondBr(StoreSuccess, SuccessBB, - CI->isWeak() ? FailureBB : LoopBB); - - // Make sure later instructions don't get reordered with a fence if necessary. - Builder.SetInsertPoint(SuccessBB); - insertTrailingFence(Builder, SuccessOrder); - Builder.CreateBr(ExitBB); - - Builder.SetInsertPoint(FailureBB); - insertTrailingFence(Builder, FailureOrder); - Builder.CreateBr(ExitBB); - - // Finally, we have control-flow based knowledge of whether the cmpxchg - // succeeded or not. We expose this to later passes by converting any - // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI. - - // Setup the builder so we can create any PHIs we need. - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2); - Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB); - Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB); - - // Look for any users of the cmpxchg that are just comparing the loaded value - // against the desired one, and replace them with the CFG-derived version. - SmallVector<ExtractValueInst *, 2> PrunedInsts; - for (auto User : CI->users()) { - ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User); - if (!EV) - continue; - - assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 && - "weird extraction from { iN, i1 }"); - - if (EV->getIndices()[0] == 0) - EV->replaceAllUsesWith(Loaded); - else - EV->replaceAllUsesWith(Success); - - PrunedInsts.push_back(EV); - } - - // We can remove the instructions now we're no longer iterating through them. - for (auto EV : PrunedInsts) - EV->eraseFromParent(); - - if (!CI->use_empty()) { - // Some use of the full struct return that we don't understand has happened, - // so we've got to reconstruct it properly. 
- Value *Res; - Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); - Res = Builder.CreateInsertValue(Res, Success, 1); - - CI->replaceAllUsesWith(Res); - } - - CI->eraseFromParent(); - return true; -} - -AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getTargetLowering()->getInsertFencesForAtomic()) - return Ord; - - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - Builder.CreateFence(Release); - - // The exclusive operations don't need any barrier if we're adding separate - // fences. - return Monotonic; -} - -void AtomicExpandLoadLinked::insertTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TM->getTargetLowering()->getInsertFencesForAtomic()) - return; - - if (Ord == Acquire || Ord == AcquireRelease) - Builder.CreateFence(Acquire); - else if (Ord == SequentiallyConsistent) - Builder.CreateFence(SequentiallyConsistent); -} diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp new file mode 100644 index 000000000000..12f6bd77d334 --- /dev/null +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -0,0 +1,563 @@ +//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass (at IR level) to replace atomic instructions with +// either (intrinsic-based) load-linked/store-conditional loops or AtomicCmpXchg. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "atomic-expand" + +namespace { + class AtomicExpand: public FunctionPass { + const TargetMachine *TM; + public: + static char ID; // Pass identification, replacement for typeid + explicit AtomicExpand(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + private: + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, + bool IsStore, bool IsLoad); + bool expandAtomicLoad(LoadInst *LI); + bool expandAtomicLoadToLL(LoadInst *LI); + bool expandAtomicLoadToCmpXchg(LoadInst *LI); + bool expandAtomicStore(StoreInst *SI); + bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI); + bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); + bool isIdempotentRMW(AtomicRMWInst *AI); + bool simplifyIdempotentRMW(AtomicRMWInst *AI); + }; +} + +char AtomicExpand::ID = 0; +char &llvm::AtomicExpandID = AtomicExpand::ID; +INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", + "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg", + false, false) + +FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) { + return new AtomicExpand(TM); +} + +bool AtomicExpand::runOnFunction(Function &F) { + if (!TM || 
!TM->getSubtargetImpl()->enableAtomicExpand()) + return false; + auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); + + SmallVector<Instruction *, 1> AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { + if (I->isAtomic()) + AtomicInsts.push_back(&*I); + } + + bool MadeChange = false; + for (auto I : AtomicInsts) { + auto LI = dyn_cast<LoadInst>(I); + auto SI = dyn_cast<StoreInst>(I); + auto RMWI = dyn_cast<AtomicRMWInst>(I); + auto CASI = dyn_cast<AtomicCmpXchgInst>(I); + assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) && + "Unknown atomic instruction"); + + auto FenceOrdering = Monotonic; + bool IsStore, IsLoad; + if (TargetLowering->getInsertFencesForAtomic()) { + if (LI && isAtLeastAcquire(LI->getOrdering())) { + FenceOrdering = LI->getOrdering(); + LI->setOrdering(Monotonic); + IsStore = false; + IsLoad = true; + } else if (SI && isAtLeastRelease(SI->getOrdering())) { + FenceOrdering = SI->getOrdering(); + SI->setOrdering(Monotonic); + IsStore = true; + IsLoad = false; + } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) || + isAtLeastAcquire(RMWI->getOrdering()))) { + FenceOrdering = RMWI->getOrdering(); + RMWI->setOrdering(Monotonic); + IsStore = IsLoad = true; + } else if (CASI && !TargetLowering->hasLoadLinkedStoreConditional() && + (isAtLeastRelease(CASI->getSuccessOrdering()) || + isAtLeastAcquire(CASI->getSuccessOrdering()))) { + // If a compare and swap is lowered to LL/SC, we can do smarter fence + // insertion, with a stronger one on the success path than on the + // failure path. As a result, fence insertion is directly done by + // expandAtomicCmpXchg in that case. + FenceOrdering = CASI->getSuccessOrdering(); + CASI->setSuccessOrdering(Monotonic); + CASI->setFailureOrdering(Monotonic); + IsStore = IsLoad = true; + } + + if (FenceOrdering != Monotonic) { + MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + } + } + + if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) { + MadeChange |= expandAtomicLoad(LI); + } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) { + MadeChange |= expandAtomicStore(SI); + } else if (RMWI) { + // There are two different ways of expanding RMW instructions: + // - into a load if it is idempotent + // - into a Cmpxchg/LL-SC loop otherwise + // we try them in that order. + MadeChange |= (isIdempotentRMW(RMWI) && + simplifyIdempotentRMW(RMWI)) || + (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) && + expandAtomicRMW(RMWI)); + } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) { + MadeChange |= expandAtomicCmpXchg(CASI); + } + } + return MadeChange; +} + +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, + bool IsStore, bool IsLoad) { + IRBuilder<> Builder(I); + + auto LeadingFence = + TM->getSubtargetImpl()->getTargetLowering()->emitLeadingFence( + Builder, Order, IsStore, IsLoad); + + auto TrailingFence = + TM->getSubtargetImpl()->getTargetLowering()->emitTrailingFence( + Builder, Order, IsStore, IsLoad); + // The trailing fence is emitted before the instruction instead of after + // because there is no easy way of setting Builder insertion point after + // an instruction. So we must erase it from the BB, and insert it back + // in the right place. + // We have a guard here because not every atomic operation generates a + // trailing fence. 
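At the source level, this bracketing corresponds to dropping the access to relaxed ("monotonic") ordering and recovering the semantics with explicit fences; loads, for instance, get only a trailing fence, which is one reason not every operation produces both. A standalone sketch under that acquire/release assumption (helper names invented):

#include <atomic>

// Leading fence + relaxed store approximates a release store.
void storeReleaseViaFences(std::atomic<int> &A, int V) {
  std::atomic_thread_fence(std::memory_order_release); // leading fence
  A.store(V, std::memory_order_relaxed);               // ordering dropped
}

// Relaxed load + trailing fence approximates an acquire load; no leading
// fence is needed for a load.
int loadAcquireViaFences(const std::atomic<int> &A) {
  int V = A.load(std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_acquire); // trailing fence
  return V;
}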
+ if (TrailingFence) { + TrailingFence->removeFromParent(); + TrailingFence->insertAfter(I); + } + + return (LeadingFence || TrailingFence); +} + +bool AtomicExpand::expandAtomicLoad(LoadInst *LI) { + if (TM->getSubtargetImpl() + ->getTargetLowering() + ->hasLoadLinkedStoreConditional()) + return expandAtomicLoadToLL(LI); + else + return expandAtomicLoadToCmpXchg(LI); +} + +bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + IRBuilder<> Builder(LI); + + // On some architectures, load-linked instructions are atomic for larger + // sizes than normal loads. For example, the only 64-bit load guaranteed + // to be single-copy atomic by ARM is an ldrexd (A3.5.3). + Value *Val = + TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); + + LI->replaceAllUsesWith(Val); + LI->eraseFromParent(); + + return true; +} + +bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { + IRBuilder<> Builder(LI); + AtomicOrdering Order = LI->getOrdering(); + Value *Addr = LI->getPointerOperand(); + Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); + Constant *DummyVal = Constant::getNullValue(Ty); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, DummyVal, DummyVal, Order, + AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); + Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded"); + + LI->replaceAllUsesWith(Loaded); + LI->eraseFromParent(); + + return true; +} + +bool AtomicExpand::expandAtomicStore(StoreInst *SI) { + // This function is only called on atomic stores that are too large to be + // atomic if implemented as a native store. So we replace them by an + // atomic swap, that can be implemented for example as a ldrex/strex on ARM + // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. + // It is the responsibility of the target to only return true in + // shouldExpandAtomicRMW in cases where this is required and possible. + IRBuilder<> Builder(SI); + AtomicRMWInst *AI = + Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), + SI->getValueOperand(), SI->getOrdering()); + SI->eraseFromParent(); + + // Now we have an appropriate swap instruction, lower it as usual. + return expandAtomicRMW(AI); +} + +bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) { + if (TM->getSubtargetImpl() + ->getTargetLowering() + ->hasLoadLinkedStoreConditional()) + return expandAtomicRMWToLLSC(AI); + else + return expandAtomicRMWToCmpXchg(AI); +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. 
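The helper defined next maps each RMW opcode onto plain IR arithmetic; the min/max variants, which have no single IR instruction, become a compare feeding a select. The equivalent source-level shape, as an illustrative standalone sketch (names are made up):

#include <cstdint>

// Signed max: icmp sgt feeding a select.
int64_t maxOp(int64_t Loaded, int64_t Inc) {
  bool Cmp = Loaded > Inc;   // icmp sgt
  return Cmp ? Loaded : Inc; // select
}

// Unsigned min: icmp ule feeding a select.
uint64_t uminOp(uint64_t Loaded, uint64_t Inc) {
  bool Cmp = Loaded <= Inc;  // icmp ule
  return Cmp ? Loaded : Inc; // select
}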
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + llvm_unreachable("Unknown atomic op"); + } +} + +bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering MemOpOrder = AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // fence? + // atomicrmw.start: + // %loaded = @load.linked(%addr) + // %new = some_op iN %loaded, %incr + // %stored = @store_conditional(%new, %addr) + // %try_again = icmp i32 ne %stored, 0 + // br i1 %try_again, label %loop, label %atomicrmw.end + // atomicrmw.end: + // fence? + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we might want a fence too. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *StoreSuccess = + TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); + Value *TryAgain = Builder.CreateICmpNE( + StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); + Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + + return true; +} + +bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { + AtomicOrdering MemOpOrder = + AI->getOrdering() == Unordered ? 
Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + // Atomics require at least natural alignment. + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} + +bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); + AtomicOrdering FailureOrder = CI->getFailureOrdering(); + Value *Addr = CI->getPointerOperand(); + BasicBlock *BB = CI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + // If getInsertFencesForAtomic() returns true, then the target does not want + // to deal with memory orders, and emitLeading/TrailingFence should take care + // of everything. Otherwise, emitLeading/TrailingFence are no-op and we + // should preserve the ordering. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder; + + // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord + // + // The full expansion we produce is: + // [...] + // fence? + // cmpxchg.start: + // %loaded = @load.linked(%addr) + // %should_store = icmp eq %loaded, %desired + // br i1 %should_store, label %cmpxchg.trystore, + // label %cmpxchg.failure + // cmpxchg.trystore: + // %stored = @store_conditional(%new, %addr) + // %success = icmp eq i32 %stored, 0 + // br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure + // cmpxchg.success: + // fence? 
+ // br label %cmpxchg.end
+ // cmpxchg.failure:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.end:
+ // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
+ // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
+ auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
+ auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB);
+ auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
+ auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
+
+ // This grabs the DebugLoc from CI
+ IRBuilder<> Builder(CI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we might want a fence too. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *ShouldStore =
+ Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
+
+ Builder.SetInsertPoint(TryStoreBB);
+ Value *StoreSuccess = TLI->emitStoreConditional(
+ Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+ StoreSuccess = Builder.CreateICmpEQ(
+ StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+ Builder.CreateCondBr(StoreSuccess, SuccessBB,
+ CI->isWeak() ? FailureBB : LoopBB);
+
+ // Make sure later instructions don't get reordered with a fence if necessary.
+ Builder.SetInsertPoint(SuccessBB);
+ TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(ExitBB);
+
+ Builder.SetInsertPoint(FailureBB);
+ TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(ExitBB);
+
+ // Finally, we have control-flow based knowledge of whether the cmpxchg
+ // succeeded or not. We expose this to later passes by converting any
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.
+
+ // Set up the builder so we can create any PHIs we need.
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+ Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+
+ // Look for any users of the cmpxchg that are just comparing the loaded value
+ // against the desired one, and replace them with the CFG-derived version.
+ SmallVector<ExtractValueInst *, 2> PrunedInsts;
+ for (auto User : CI->users()) {
+ ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
+ if (!EV)
+ continue;
+
+ assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
+ "weird extraction from { iN, i1 }");
+
+ if (EV->getIndices()[0] == 0)
+ EV->replaceAllUsesWith(Loaded);
+ else
+ EV->replaceAllUsesWith(Success);
+
+ PrunedInsts.push_back(EV);
+ }
+
+ // We can remove the instructions now that we're no longer iterating through them.
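The loop built by expandAtomicRMWToCmpXchg earlier in this file has the same shape as implementing a fetch-style operation with a CAS retry loop in portable C++. A hedged sketch (fetchOr is a hypothetical helper; on failure, compare_exchange_weak refreshes Loaded just as the PHI picks up %new_loaded):

#include <atomic>

// Illustrative only: atomic fetch-or via a compare-exchange retry loop.
unsigned fetchOr(std::atomic<unsigned> &A, unsigned Mask) {
  unsigned Loaded = A.load(std::memory_order_relaxed); // %init_loaded
  while (!A.compare_exchange_weak(Loaded, Loaded | Mask)) {
    // Another thread intervened; Loaded now holds the fresh value, and the
    // desired value is recomputed on the next iteration.
  }
  return Loaded; // the value observed before our update, like %loaded
}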
+ for (auto EV : PrunedInsts) + EV->eraseFromParent(); + + if (!CI->use_empty()) { + // Some use of the full struct return that we don't understand has happened, + // so we've got to reconstruct it properly. + Value *Res; + Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0); + Res = Builder.CreateInsertValue(Res, Success, 1); + + CI->replaceAllUsesWith(Res); + } + + CI->eraseFromParent(); + return true; +} + +bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { + auto C = dyn_cast<ConstantInt>(RMWI->getValOperand()); + if(!C) + return false; + + AtomicRMWInst::BinOp Op = RMWI->getOperation(); + switch(Op) { + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + return C->isZero(); + case AtomicRMWInst::And: + return C->isMinusOne(); + // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... + default: + return false; + } +} + +bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { + auto TLI = TM->getSubtargetImpl()->getTargetLowering(); + + if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { + if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad)) + expandAtomicLoad(ResultingLoad); + return true; + } + + return false; +} diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index b2737bf754f9..72da80646c5b 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -42,7 +42,9 @@ class BasicTTI final : public ImmutablePass, public TargetTransformInfo { /// Estimate the cost overhead of SK_Alternate shuffle. unsigned getAltShuffleOverhead(Type *Ty) const; - const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); } + const TargetLoweringBase *getTLI() const { + return TM->getSubtargetImpl()->getTargetLowering(); + } public: BasicTTI() : ImmutablePass(ID), TM(nullptr) { @@ -90,7 +92,7 @@ public: unsigned getJumpBufSize() const override; bool shouldBuildLookupTables() const override; bool haveFastSqrt(Type *Ty) const override; - void getUnrollingPreferences(Loop *L, + void getUnrollingPreferences(const Function *F, Loop *L, UnrollingPreferences &UP) const override; /// @} @@ -99,10 +101,11 @@ public: /// @{ unsigned getNumberOfRegisters(bool Vector) const override; - unsigned getMaximumUnrollFactor() const override; + unsigned getMaxInterleaveFactor() const override; unsigned getRegisterBitWidth(bool Vector) const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -186,9 +189,8 @@ unsigned BasicTTI::getJumpBufSize() const { bool BasicTTI::shouldBuildLookupTables() const { const TargetLoweringBase *TLI = getTLI(); - return TLI->supportJumpTables() && - (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || - TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other)); + return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || + TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); } bool BasicTTI::haveFastSqrt(Type *Ty) const { @@ -197,7 +199,7 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const { return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); } -void BasicTTI::getUnrollingPreferences(Loop *L, +void 
BasicTTI::getUnrollingPreferences(const Function *F, Loop *L, UnrollingPreferences &UP) const { // This unrolling functionality is target independent, but to provide some // motivation for its intended use, for x86: @@ -223,11 +225,11 @@ void BasicTTI::getUnrollingPreferences(Loop *L, // until someone finds a case where it matters in practice. unsigned MaxOps; - const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>(); + const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>(F); if (PartialUnrollingThreshold.getNumOccurrences() > 0) MaxOps = PartialUnrollingThreshold; - else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0) - MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize; + else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) + MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; else return; @@ -282,13 +284,14 @@ unsigned BasicTTI::getRegisterBitWidth(bool Vector) const { return 32; } -unsigned BasicTTI::getMaximumUnrollFactor() const { +unsigned BasicTTI::getMaxInterleaveFactor() const { return 1; } unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind, - OperandValueKind) const { + OperandValueKind, OperandValueKind, + OperandValueProperties, + OperandValueProperties) const { // Check if any of the operands are vector operands. const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -465,7 +468,8 @@ unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); - if (!TLI->isOperationExpand(ISD, LT.second)) { + if (!(ValTy->isVectorTy() && !LT.second.isVector()) && + !TLI->isOperationExpand(ISD, LT.second)) { // The operation is legal. Assume it costs 1. Multiply // by the type-legalization overhead. return LT.first * 1; @@ -515,7 +519,7 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src, if (Opcode == Instruction::Store) LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT()); else - LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT()); + LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); } if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { @@ -561,6 +565,8 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::log10: ISD = ISD::FLOG10; break; case Intrinsic::log2: ISD = ISD::FLOG2; break; case Intrinsic::fabs: ISD = ISD::FABS; break; + case Intrinsic::minnum: ISD = ISD::FMINNUM; break; + case Intrinsic::maxnum: ISD = ISD::FMAXNUM; break; case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break; case Intrinsic::floor: ISD = ISD::FFLOOR; break; case Intrinsic::ceil: ISD = ISD::FCEIL; break; @@ -572,6 +578,7 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::pow: ISD = ISD::FPOW; break; case Intrinsic::fma: ISD = ISD::FMA; break; case Intrinsic::fmuladd: ISD = ISD::FMA; break; + // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: return 0; @@ -582,7 +589,7 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { // The operation is legal. Assume it costs 1. - // If the type is split to multiple registers, assume that thre is some + // If the type is split to multiple registers, assume that there is some // overhead to this. // TODO: Once we have extract/insert subvector cost we need to use them. 
if (LT.first > 1) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 7503e5751994..2128da1b8aac 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -20,6 +20,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -32,8 +34,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -70,6 +72,8 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -91,22 +95,24 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { // HW that requires structurized CFG. bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && PassConfig->getEnableTailMerge(); - BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true); - return Folder.OptimizeFunction(MF, - MF.getTarget().getInstrInfo(), - MF.getTarget().getRegisterInfo(), + BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, + getAnalysis<MachineBlockFrequencyInfo>(), + getAnalysis<MachineBranchProbabilityInfo>()); + return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(), + MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); } - -BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) { +BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, + const MachineBlockFrequencyInfo &FreqInfo, + const MachineBranchProbabilityInfo &ProbInfo) + : EnableHoistCommonCode(CommonHoist), MBBFreqInfo(FreqInfo), + MBPI(ProbInfo) { switch (FlagEnableTailMerge) { case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break; case cl::BOU_TRUE: EnableTailMerge = true; break; case cl::BOU_FALSE: EnableTailMerge = false; break; } - - EnableHoistCommonCode = CommonHoist; } /// RemoveDeadBlock - Remove the specified dead machine basic block from the @@ -388,10 +394,8 @@ void BranchFolder::MaintainLiveIns(MachineBasicBlock *CurMBB, RS->enterBasicBlock(CurMBB); if (!CurMBB->empty()) RS->forward(std::prev(CurMBB->end())); - BitVector RegsLiveAtExit(TRI->getNumRegs()); - RS->getRegsUsed(RegsLiveAtExit, false); - for (unsigned int i = 0, e = TRI->getNumRegs(); i != e; i++) - if (RegsLiveAtExit[i]) + for (unsigned int i = 1, e = TRI->getNumRegs(); i != e; i++) + if (RS->isRegUsed(i, false)) NewMBB->addLiveIn(i); } } @@ -435,6 +439,9 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, // Splice the code over. NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end()); + // NewMBB inherits CurMBB's block frequency. + MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); + // For targets that use the register scavenger, we must maintain LiveIns. 
MaintainLiveIns(&CurMBB, NewMBB);
@@ -504,6 +511,21 @@ BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
#endif
}
+BlockFrequency
+BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
+ auto I = MergedBBFreq.find(MBB);
+
+ if (I != MergedBBFreq.end())
+ return I->second;
+
+ return MBFI.getBlockFreq(MBB);
+}
+
+void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
+ BlockFrequency F) {
+ MergedBBFreq[MBB] = F;
+}
+
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
@@ -806,6 +828,10 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
}
MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ // Recompute common tail MBB's edge weights and block frequency.
+ setCommonTailEdgeWeights(*MBB);
+
// MBB is common tail. Adjust all other BB's to jump to this one.
// Traversal must be forwards so erases work.
DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
@@ -890,7 +916,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
continue;
// Visit each predecessor only once.
- if (!UniquePreds.insert(PBB))
+ if (!UniquePreds.insert(PBB).second)
continue;
// Skip blocks which may jump to a landing pad. Can't tail merge these.
@@ -968,6 +994,44 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
return MadeChange;
}
+void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
+ SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
+ BlockFrequency AccumulatedMBBFreq;
+
+ // Aggregate edge frequency of successor edge j:
+ // edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
+ // where bb is a basic block that is in SameTails.
+ for (const auto &Src : SameTails) {
+ const MachineBasicBlock *SrcMBB = Src.getBlock();
+ BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
+ AccumulatedMBBFreq += BlockFreq;
+
+ // It is not necessary to recompute edge weights if TailBB has fewer than two
+ // successors.
+ if (TailMBB.succ_size() <= 1) + continue; + + auto EdgeFreq = EdgeFreqLs.begin(); + + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) + *EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI); + } + + MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq); + + if (TailMBB.succ_size() <= 1) + return; + + auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); + uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto EdgeFreq = EdgeFreqLs.begin(); + + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) + TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); +} + //===----------------------------------------------------------------------===// // Branch Optimization //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index 0d15ed7e792a..3653a2ccd623 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -7,14 +7,17 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP -#define LLVM_CODEGEN_BRANCHFOLDING_HPP +#ifndef LLVM_LIB_CODEGEN_BRANCHFOLDING_H +#define LLVM_LIB_CODEGEN_BRANCHFOLDING_H #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/Support/BlockFrequency.h" #include <vector> namespace llvm { + class MachineBlockFrequencyInfo; + class MachineBranchProbabilityInfo; class MachineFunction; class MachineModuleInfo; class RegScavenger; @@ -23,7 +26,9 @@ namespace llvm { class BranchFolder { public: - explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist); + explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist, + const MachineBlockFrequencyInfo &MBFI, + const MachineBranchProbabilityInfo &MBPI); bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii, @@ -92,9 +97,26 @@ namespace llvm { MachineModuleInfo *MMI; RegScavenger *RS; + /// \brief This class keeps track of branch frequencies of newly created + /// blocks and tail-merged blocks. 
+ class MBFIWrapper { + public: + MBFIWrapper(const MachineBlockFrequencyInfo &I) : MBFI(I) {} + BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const; + void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F); + + private: + const MachineBlockFrequencyInfo &MBFI; + DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq; + }; + + MBFIWrapper MBBFreqInfo; + const MachineBranchProbabilityInfo &MBPI; + bool TailMergeBlocks(MachineFunction &MF); bool TryTailMergeBlocks(MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); + void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB); void MaintainLiveIns(MachineBasicBlock *CurMBB, MachineBasicBlock *NewMBB); void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index b71b30cc1cca..48bf3545e733 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -2,7 +2,7 @@ add_llvm_library(LLVMCodeGen AggressiveAntiDepBreaker.cpp AllocationOrder.cpp Analysis.cpp - AtomicExpandLoadLinkedPass.cpp + AtomicExpandPass.cpp BasicTargetTransformInfo.cpp BranchFolding.cpp CalcSpillWeights.cpp @@ -19,6 +19,7 @@ add_llvm_library(LLVMCodeGen ExecutionDepsFix.cpp ExpandISelPseudos.cpp ExpandPostRAPseudos.cpp + ForwardControlFlowIntegrity.cpp GCMetadata.cpp GCMetadataPrinter.cpp GCStrategy.cpp @@ -27,7 +28,6 @@ add_llvm_library(LLVMCodeGen InlineSpiller.cpp InterferenceCache.cpp IntrinsicLowering.cpp - JITCodeEmitter.cpp JumpInstrTables.cpp LLVMTargetMachine.cpp LatencyPriorityQueue.cpp @@ -48,7 +48,7 @@ add_llvm_library(LLVMCodeGen MachineBlockPlacement.cpp MachineBranchProbabilityInfo.cpp MachineCSE.cpp - MachineCodeEmitter.cpp + MachineCombiner.cpp MachineCopyPropagation.cpp MachineDominators.cpp MachineDominanceFrontier.cpp @@ -98,13 +98,13 @@ add_llvm_library(LLVMCodeGen SjLjEHPrepare.cpp SlotIndexes.cpp SpillPlacement.cpp - Spiller.cpp SplitKit.cpp StackColoring.cpp StackProtector.cpp StackSlotColoring.cpp StackMapLivenessAnalysis.cpp StackMaps.cpp + StatepointExampleGC.cpp TailDuplication.cpp TargetFrameLoweringImpl.cpp TargetInstrInfo.cpp diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index bc033f96bcf2..d08fae09323c 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -16,8 +16,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "calcspillweights" @@ -95,11 +95,12 @@ static bool isRematerializable(const LiveInterval &LI, void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { MachineRegisterInfo &mri = MF.getRegInfo(); - const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo &tri = *MF.getSubtarget().getRegisterInfo(); MachineBasicBlock *mbb = nullptr; MachineLoop *loop = nullptr; bool isExiting = false; float totalWeight = 0; + unsigned numInstr = 0; // Number of instructions using li SmallPtrSet<MachineInstr*, 8> visited; // Find the best physreg hint and the best virtreg hint. 
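The numInstr counter introduced in this hunk feeds the spill-weight normalization at the end of the function. As a rough, hypothetical illustration of why it helps (a toy formula, not the one LLVM actually uses): a density-style weight lets an interval that spans a large range but is touched by almost no instructions rank as a cheap spill candidate.

#include <cstdint>

// Toy spill-weight normalization; the shape is assumed for illustration.
static float normalizeToy(float TotalWeight, unsigned Size, unsigned NumInstr) {
  if (NumInstr == 0)
    return 0.0f; // no instruction uses the register: spilling costs nothing
  return TotalWeight / static_cast<float>(Size ? Size : 1);
}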
@@ -116,9 +117,10 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end(); I != E; ) { MachineInstr *mi = &*(I++); + numInstr++; if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue()) continue; - if (!visited.insert(mi)) + if (!visited.insert(mi).second) continue; float weight = 1.0f; @@ -186,8 +188,8 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo())) + if (isRematerializable(li, LIS, *MF.getSubtarget().getInstrInfo())) totalWeight *= 0.5F; - li.weight = normalize(totalWeight, li.getSize()); + li.weight = normalize(totalWeight, li.getSize(), numInstr); } diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index add861a83632..034ffb34b9cc 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -14,21 +14,22 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, - const TargetMachine &tm, SmallVectorImpl<CCValAssign> &locs, - LLVMContext &C) - : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm), - TRI(*TM.getRegisterInfo()), Locs(locs), Context(C), - CallOrPrologue(Unknown) { + SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) + : CallingConv(CC), IsVarArg(isVarArg), MF(mf), + TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C), + CallOrPrologue(Unknown) { // No stack is used. StackOffset = 0; @@ -50,7 +51,8 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, if (MinAlign > (int)Align) Align = MinAlign; MF.getFrameInfo()->ensureMaxAlignment(Align); - TM.getTargetLowering()->HandleByVal(this, Size, Align); + MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align); + Size = unsigned(RoundUpToAlignment(Size, MinAlign)); unsigned Offset = AllocateStack(Size, Align); addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); } @@ -178,3 +180,70 @@ void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) { llvm_unreachable(nullptr); } } + +static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) { + if (VT.isVector()) + return true; // Assume -msse-regparm might be in effect. + if (!VT.isInteger()) + return false; + if (CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall) + return true; + return false; +} + +void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, + MVT VT, CCAssignFn Fn) { + unsigned SavedStackOffset = StackOffset; + unsigned NumLocs = Locs.size(); + + // Set the 'inreg' flag if it is used for this calling convention. + ISD::ArgFlagsTy Flags; + if (isValueTypeInRegForCC(CallingConv, VT)) + Flags.setInReg(); + + // Allocate something of this value type repeatedly until we get assigned a + // location in memory. 
+ bool HaveRegParm = true;
+ while (HaveRegParm) {
+ if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call has unhandled type " << EVT(VT).getEVTString()
+ << " while computing remaining regparms\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ HaveRegParm = Locs.back().isRegLoc();
+ }
+
+ // Copy all the registers from the value locations we added.
+ assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+ for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+ if (Locs[I].isRegLoc())
+ Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+ // Clear the assigned values and stack memory. We leave the registers marked
+ // as allocated so that future queries don't return the same registers, i.e.
+ // when i64 and f64 are both passed in GPRs.
+ StackOffset = SavedStackOffset;
+ Locs.resize(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+ SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+ CCAssignFn Fn) {
+ // Oftentimes calling conventions will not use register parameters for
+ // variadic functions, so we need to assume we're not variadic so that we get
+ // all the registers that might be used in a non-variadic call.
+ SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+
+ for (MVT RegVT : RegParmTypes) {
+ SmallVector<MCPhysReg, 8> RemainingRegs;
+ getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+ const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+ const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+ for (MCPhysReg PReg : RemainingRegs) {
+ unsigned VReg = MF.addLiveIn(PReg, RC);
+ Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+ }
+ }
+}
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index b3beac3932bf..307dec548fc5 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
void llvm::initializeCodeGen(PassRegistry &Registry) {
-  initializeAtomicExpandLoadLinkedPass(Registry);
+  initializeAtomicExpandPass(Registry);
   initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeCodeGenPreparePass(Registry);
@@ -41,6 +41,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
+  initializeMachineCombinerPass(Registry);
   initializeMachineCSEPass(Registry);
   initializeMachineDominatorTreePass(Registry);
   initializeMachinePostDominatorTreePass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index d5039b2e8517..82cd3801f583 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -29,6 +30,7 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueMap.h"
@@ -43,6 +45,7 @@
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -63,6 +66,7 @@ STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
@@ -80,15 +84,40 @@ static cl::opt<bool> EnableAndCmpSinking(
    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
    cl::desc("Enable sinking and/cmp into branches."));
 
+static cl::opt<bool> DisableStoreExtract(
+    "disable-cgp-store-extract", cl::Hidden, cl::init(false),
+    cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> StressStoreExtract(
+    "stress-cgp-store-extract", cl::Hidden, cl::init(false),
+    cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> DisableExtLdPromotion(
+    "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
+             "CodeGenPrepare"));
+
+static cl::opt<bool> StressExtLdPromotion(
+    "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
+             "optimization in CodeGenPrepare"));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
-typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
+struct TypeIsSExt {
+  Type *Ty;
+  bool IsSExt;
+  TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {}
+};
+typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+class TypePromotionTransaction;
 
   class CodeGenPrepare : public FunctionPass {
    /// TLI - Keep a pointer to a TargetLowering to consult for determining
    /// transformation profitability.
const TargetMachine *TM; const TargetLowering *TLI; + const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; @@ -118,7 +147,7 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy; public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -128,6 +157,7 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetTransformInfo>(); } private: @@ -135,18 +165,24 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy; bool EliminateMostlyEmptyBlocks(Function &F); bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB); - bool OptimizeInst(Instruction *I); + bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT); + bool OptimizeInst(Instruction *I, bool& ModifiedDT); bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI); - bool MoveExtToFormExtLoad(Instruction *I); + bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT); + bool MoveExtToFormExtLoad(Instruction *&I); bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool OptimizeExtractElementInst(Instruction *Inst); bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); bool sinkAndCmp(Function &F); + bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, + Instruction *&Inst, + const SmallVectorImpl<Instruction *> &Exts, + unsigned CreatedInst); + bool splitBranchCondition(Function &F); }; } @@ -168,8 +204,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { PromotedInsts.clear(); ModifiedDT = false; - if (TM) TLI = TM->getTargetLowering(); + if (TM) + TLI = TM->getSubtargetImpl()->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfo>(); + TTI = &getAnalysis<TargetTransformInfo>(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; @@ -198,15 +236,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // into a single target instruction, push the mask and compare into branch // users. Do this before OptimizeBlock -> OptimizeInst -> // OptimizeCmpExpression, which perturbs the pattern being searched for. 
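  // A hedged sketch of what the newly added splitBranchCondition aims for
  // (labels and values are illustrative only): a branch on a combined
  // condition such as
  //
  //   %or.cond = or i1 %cmp1, %cmp2
  //   br i1 %or.cond, label %if.then, label %if.end
  //
  // is split into two branches so that each compare can fold into its own
  // conditional branch:
  //
  //   br i1 %cmp1, label %if.then, label %check2
  // check2:
  //   br i1 %cmp2, label %if.then, label %if.end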
- if (!DisableBranchOpts) + if (!DisableBranchOpts) { EverMadeChange |= sinkAndCmp(F); + EverMadeChange |= splitBranchCondition(F); + } bool MadeChange = true; while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = I++; - MadeChange |= OptimizeBlock(*BB); + bool ModifiedDTOnIteration = false; + MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + ModifiedDT |= ModifiedDTOnIteration; + if (ModifiedDTOnIteration) + break; } EverMadeChange |= MadeChange; } @@ -216,9 +262,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!DisableBranchOpts) { MadeChange = false; SmallPtrSet<BasicBlock*, 8> WorkList; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); - MadeChange |= ConstantFoldTerminator(BB, true); + for (BasicBlock &BB : F) { + SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB)); + MadeChange |= ConstantFoldTerminator(&BB, true); if (!MadeChange) continue; for (SmallVectorImpl<BasicBlock*>::iterator @@ -662,10 +708,13 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, if (!ISDOpcode) continue; - // If the use is actually a legal node, there will not be an implicit - // truncate. - if (TLI.isOperationLegalOrCustom(ISDOpcode, - EVT::getEVT(TruncUser->getType()))) + // If the use is actually a legal node, there will not be an + // implicit truncate. + // FIXME: always querying the result type is just an + // approximation; some nodes' legality is determined by the + // operand or other means. There's no good way to find out though. + if (TLI.isOperationLegalOrCustom( + ISDOpcode, TLI.getValueType(TruncUser->getType(), true))) continue; // Don't bother for PHI nodes. 
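  // A hedged illustration of the pattern this shift/truncate sinking serves
  // (block and value names are only an example): given
  //
  //   bb1:
  //     %shr = lshr i64 %x, 32
  //   bb2:
  //     %tr = trunc i64 %shr to i32
  //
  // moving the shift next to the truncate puts both in one block, so the
  // target can select them together as a single bit-field extract.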
@@ -799,23 +848,211 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
   return MadeChange;
 }
 
-namespace {
-class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
-protected:
-  void replaceCall(Value *With) override {
-    CI->replaceAllUsesWith(With);
-    CI->eraseFromParent();
+// ScalarizeMaskedLoad() translates the masked load intrinsic, like
+//  <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+//                                 <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks that load the elements one-by-one if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+//cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+//else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+//cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+//else2:                                            ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(3);
+  Value *Mask = CI->getArgOperand(2);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
   }
-  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const override {
-    if (ConstantInt *SizeCI =
-        dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
-      return SizeCI->isAllOnesValue();
-    return false;
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// ScalarizeMaskedStore() translates the masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                         <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one-by-one if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//  %4 = extractelement <16 x i32> %val, i32 0
+//  %5 = getelementptr i32* %1, i32 0
+//  store i32 %4, i32* %5
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.store
+//  %6 = extractelement <16 x i1> %mask, i32 1
+//  %7 = icmp eq i1 %6, true
+//  br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1:                                      ; preds = %else
+//  %8 = extractelement <16 x i32> %val, i32 1
+//  %9 = getelementptr i32* %1, i32 1
+//  store i32 %8, i32* %9
+//  br label %else2
+//   . . .
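// A hedged aside: these expansions are only used as a fallback. The dispatch
// added further below in OptimizeCallInst consults the target first, and
// boils down to:
//
//   case Intrinsic::masked_store:
//     if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
//       ScalarizeMaskedStore(CI); // emit the block chain sketched above
//       ModifiedDT = true;        // the CFG changed; restart the block scan
//     }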
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(1);
+  Value *Src  = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
   }
-};
-} // end anonymous namespace
+  CI->eraseFromParent();
+}
 
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -835,38 +1072,60 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
     return true;
   }
 
-  // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
-
-    replaceAndRecursivelySimplify(CI, RetVal,
-                                  TLI ? TLI->getDataLayout() : nullptr,
-                                  TLInfo, ModifiedDT ? nullptr : DT);
+  if (II) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::objectsize: {
+      // Lower all uses of llvm.objectsize.*
+      bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+      Type *ReturnTy = CI->getType();
+      Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+      // Substituting this can cause recursive simplifications, which can
+      // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+      // happens.
+ WeakVH IterHandle(CurInstIterator); + + replaceAndRecursivelySimplify(CI, RetVal, + TLI ? TLI->getDataLayout() : nullptr, + TLInfo, ModifiedDT ? nullptr : DT); - // If the iterator instruction was recursively deleted, start over at the - // start of the block. - if (IterHandle != CurInstIterator) { - CurInstIterator = BB->begin(); - SunkAddrs.clear(); + // If the iterator instruction was recursively deleted, start over at the + // start of the block. + if (IterHandle != CurInstIterator) { + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } + return true; + } + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) { + ScalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) { + ScalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } } - return true; - } - if (II && TLI) { - SmallVector<Value*, 2> PtrOps; - Type *AccessTy; - if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) - return true; + if (TLI) { + SmallVector<Value*, 2> PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) + if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) + return true; + } } // From here on out we're working with named functions. @@ -878,10 +1137,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls - // that have the default "don't know" as the objectsize. Anything else - // should be left alone. - CodeGenPrepareFortifiedLibCalls Simplifier; - return Simplifier.fold(CI, TD, TLInfo); + // to fortified library functions (e.g. __memcpy_chk) that have the default + // "don't know" as the objectsize. Anything else should be left alone. + FortifiedLibCallSimplifier Simplifier(TD, TLInfo, true); + if (Value *V = Simplifier.optimizeCall(CI)) { + CI->replaceAllUsesWith(V); + CI->eraseFromParent(); + return true; + } + return false; } /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return @@ -978,7 +1242,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { } else { SmallPtrSet<BasicBlock*, 4> VisitedBBs; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (!VisitedBBs.insert(*PI)) + if (!VisitedBBs.insert(*PI).second) continue; BasicBlock::InstListType &InstList = (*PI)->getInstList(); @@ -1250,46 +1514,75 @@ class TypePromotionTransaction { /// \brief Build a truncate instruction. class TruncBuilder : public TypePromotionAction { + Value *Val; public: /// \brief Build a truncate instruction of \p Opnd producing a \p Ty /// result. /// trunc Opnd to Ty. TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { IRBuilder<> Builder(Opnd); - Inst = cast<Instruction>(Builder.CreateTrunc(Opnd, Ty, "promoted")); - DEBUG(dbgs() << "Do: TruncBuilder: " << *Inst << "\n"); + Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); + DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); } - /// \brief Get the built instruction. - Instruction *getBuiltInstruction() { return Inst; } + /// \brief Get the built value. + Value *getBuiltValue() { return Val; } /// \brief Remove the built instruction. 
void undo() override { - DEBUG(dbgs() << "Undo: TruncBuilder: " << *Inst << "\n"); - Inst->eraseFromParent(); + DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n"); + if (Instruction *IVal = dyn_cast<Instruction>(Val)) + IVal->eraseFromParent(); } }; /// \brief Build a sign extension instruction. class SExtBuilder : public TypePromotionAction { + Value *Val; public: /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty /// result. /// sext Opnd to Ty. SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) - : TypePromotionAction(Inst) { + : TypePromotionAction(InsertPt) { + IRBuilder<> Builder(InsertPt); + Val = Builder.CreateSExt(Opnd, Ty, "promoted"); + DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n"); + } + + /// \brief Get the built value. + Value *getBuiltValue() { return Val; } + + /// \brief Remove the built instruction. + void undo() override { + DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n"); + if (Instruction *IVal = dyn_cast<Instruction>(Val)) + IVal->eraseFromParent(); + } + }; + + /// \brief Build a zero extension instruction. + class ZExtBuilder : public TypePromotionAction { + Value *Val; + public: + /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty + /// result. + /// zext Opnd to Ty. + ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) + : TypePromotionAction(InsertPt) { IRBuilder<> Builder(InsertPt); - Inst = cast<Instruction>(Builder.CreateSExt(Opnd, Ty, "promoted")); - DEBUG(dbgs() << "Do: SExtBuilder: " << *Inst << "\n"); + Val = Builder.CreateZExt(Opnd, Ty, "promoted"); + DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); } - /// \brief Get the built instruction. - Instruction *getBuiltInstruction() { return Inst; } + /// \brief Get the built value. + Value *getBuiltValue() { return Val; } /// \brief Remove the built instruction. void undo() override { - DEBUG(dbgs() << "Undo: SExtBuilder: " << *Inst << "\n"); - Inst->eraseFromParent(); + DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n"); + if (Instruction *IVal = dyn_cast<Instruction>(Val)) + IVal->eraseFromParent(); } }; @@ -1418,9 +1711,11 @@ public: /// Same as Value::mutateType. void mutateType(Instruction *Inst, Type *NewTy); /// Same as IRBuilder::createTrunc. - Instruction *createTrunc(Instruction *Opnd, Type *Ty); + Value *createTrunc(Instruction *Opnd, Type *Ty); /// Same as IRBuilder::createSExt. - Instruction *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); + Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty); + /// Same as IRBuilder::createZExt. + Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty); /// Same as Instruction::moveBefore. 
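  /// A minimal usage sketch (hedged; variable names are illustrative) of the
  /// transaction API declared here, mirroring how the callers further down
  /// drive it:
  ///
  ///   TypePromotionTransaction TPT;
  ///   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
  ///       TPT.getRestorationPoint();
  ///   Value *Promoted = TPT.createZExt(Ext, Opnd, Ext->getType());
  ///   if (NotProfitable)
  ///     TPT.rollback(LastKnownGood); // undo every recorded action
  ///   else
  ///     TPT.commit();                // make the changes permanent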
  void moveBefore(Instruction *Inst, Instruction *Before);
  /// @}
 
@@ -1452,20 +1747,28 @@ void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
   Actions.push_back(make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
 }
 
-Instruction *TypePromotionTransaction::createTrunc(Instruction *Opnd,
-                                                   Type *Ty) {
+Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
+                                             Type *Ty) {
   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
-  Instruction *I = Ptr->getBuiltInstruction();
+  Value *Val = Ptr->getBuiltValue();
   Actions.push_back(std::move(Ptr));
-  return I;
+  return Val;
 }
 
-Instruction *TypePromotionTransaction::createSExt(Instruction *Inst,
-                                                  Value *Opnd, Type *Ty) {
+Value *TypePromotionTransaction::createSExt(Instruction *Inst,
+                                            Value *Opnd, Type *Ty) {
   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
-  Instruction *I = Ptr->getBuiltInstruction();
+  Value *Val = Ptr->getBuiltValue();
+  Actions.push_back(std::move(Ptr));
+  return Val;
+}
+
+Value *TypePromotionTransaction::createZExt(Instruction *Inst,
+                                            Value *Opnd, Type *Ty) {
+  std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
+  Value *Val = Ptr->getBuiltValue();
   Actions.push_back(std::move(Ptr));
-  return I;
+  return Val;
 }
 
 void TypePromotionTransaction::moveBefore(Instruction *Inst,
@@ -1656,60 +1959,107 @@ static bool MightBeFoldableInst(Instruction *I) {
   }
 }
 
+/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
+/// \note \p Val is assumed to be the product of some type promotion.
+/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
+/// to be legal, as the non-promoted value would have had the same state.
+static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
+  Instruction *PromotedInst = dyn_cast<Instruction>(Val);
+  if (!PromotedInst)
+    return false;
+  int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
+  // If the ISDOpcode is undefined, it was undefined before the promotion.
+  if (!ISDOpcode)
+    return true;
+  // Otherwise, check if the promoted instruction is legal or not.
+  return TLI.isOperationLegalOrCustom(
+      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+}
+
 /// \brief Helper class to perform type promotion.
 class TypePromotionHelper {
-  /// \brief Utility function to check whether or not a sign extension of
-  /// \p Inst with \p ConsideredSExtType can be moved through \p Inst by either
-  /// using the operands of \p Inst or promoting \p Inst.
+  /// \brief Utility function to check whether or not a sign or zero extension
+  /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
+  /// either using the operands of \p Inst or promoting \p Inst.
+  /// The type of the extension is defined by \p IsSExt.
   /// In other words, check if:
-  /// sext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredSExtType.
+  /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
   /// #1 Promotion applies:
-  /// ConsideredSExtType Inst (sext opnd1 to ConsideredSExtType, ...).
+  /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
   /// #2 Operand reuses:
-  /// sext opnd1 to ConsideredSExtType.
+  /// ext opnd1 to ConsideredExtType.
   /// \p PromotedInsts maps the instructions to their type before promotion.
- static bool canGetThrough(const Instruction *Inst, Type *ConsideredSExtType, - const InstrToOrigTy &PromotedInsts); + static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType, + const InstrToOrigTy &PromotedInsts, bool IsSExt); /// \brief Utility function to determine if \p OpIdx should be promoted when /// promoting \p Inst. - static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { + static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { if (isa<SelectInst>(Inst) && OpIdx == 0) return false; return true; } - /// \brief Utility function to promote the operand of \p SExt when this - /// operand is a promotable trunc or sext. + /// \brief Utility function to promote the operand of \p Ext when this + /// operand is a promotable trunc or sext or zext. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInsts[out] contains how many non-free instructions have been - /// created to promote the operand of SExt. + /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. - /// \return The promoted value which is used instead of SExt. - static Value *promoteOperandForTruncAndSExt(Instruction *SExt, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); - - /// \brief Utility function to promote the operand of \p SExt when this + /// \return The promoted value which is used instead of Ext. + static Value *promoteOperandForTruncAndAnyExt( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs); + + /// \brief Utility function to promote the operand of \p Ext when this /// operand is promotable and is not a supported trunc or sext. /// \p PromotedInsts maps the instructions to their type before promotion. /// \p CreatedInsts[out] contains how many non-free instructions have been - /// created to promote the operand of SExt. + /// created to promote the operand of Ext. + /// Newly added extensions are inserted in \p Exts. + /// Newly added truncates are inserted in \p Truncs. /// Should never be called directly. - /// \return The promoted value which is used instead of SExt. - static Value *promoteOperandForOther(Instruction *SExt, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts); + /// \return The promoted value which is used instead of Ext. + static Value * + promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs, bool IsSExt); + + /// \see promoteOperandForOther. + static Value * + signExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, + unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs) { + return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts, + Truncs, true); + } + + /// \see promoteOperandForOther. 
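  // A hedged example of what the promoteOperandFor* helpers around here
  // achieve; the trunc/ext collapse handled by promoteOperandForTruncAndAnyExt
  // turns
  //
  //   %z = zext i8 %a to i16
  //   %s = sext i16 %z to i32   ; sext of a zext: the high bit is known zero
  //
  // into the single, cheaper
  //
  //   %z2 = zext i8 %a to i32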
+  static Value *
+  zeroExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                            InstrToOrigTy &PromotedInsts,
+                            unsigned &CreatedInsts,
+                            SmallVectorImpl<Instruction *> *Exts,
+                            SmallVectorImpl<Instruction *> *Truncs) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
+                                  Truncs, false);
+  }
 
 public:
-  /// Type for the utility function that promotes the operand of SExt.
-  typedef Value *(*Action)(Instruction *SExt, TypePromotionTransaction &TPT,
-                           InstrToOrigTy &PromotedInsts,
-                           unsigned &CreatedInsts);
-  /// \brief Given a sign extend instruction \p SExt, return the approriate
-  /// action to promote the operand of \p SExt instead of using SExt.
+  /// Type for the utility function that promotes the operand of Ext.
+  typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
+                           InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                           SmallVectorImpl<Instruction *> *Exts,
+                           SmallVectorImpl<Instruction *> *Truncs);
+  /// \brief Given a sign/zero extend instruction \p Ext, return the appropriate
+  /// action to promote the operand of \p Ext instead of using Ext.
   /// \return NULL if no promotable action is possible with the current
   /// sign extension.
   /// \p InsertedTruncs keeps track of all the truncate instructions inserted by
@@ -1717,36 +2067,49 @@ public:
   /// because we do not want to promote these instructions as CodeGenPrepare
   /// will reinsert them later. Thus creating an infinite loop: create/remove.
   /// \p PromotedInsts maps the instructions to their type before promotion.
-  static Action getAction(Instruction *SExt, const SetOfInstrs &InsertedTruncs,
+  static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedTruncs,
                           const TargetLowering &TLI,
                           const InstrToOrigTy &PromotedInsts);
 };
 
 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
-                                        Type *ConsideredSExtType,
-                                        const InstrToOrigTy &PromotedInsts) {
-  // We can always get through sext.
-  if (isa<SExtInst>(Inst))
+                                        Type *ConsideredExtType,
+                                        const InstrToOrigTy &PromotedInsts,
+                                        bool IsSExt) {
+  // The promotion helper does not know how to deal with vector types yet.
+  // To be able to fix that, we would need to fix the places where we
+  // statically extend, e.g., constants and such.
+  if (Inst->getType()->isVectorTy())
+    return false;
+
+  // We can always get through zext.
+  if (isa<ZExtInst>(Inst))
+    return true;
+
+  // sext(sext) is ok too.
+  if (IsSExt && isa<SExtInst>(Inst))
     return true;
 
   // We can get through binary operator, if it is legal. In other words, the
   // binary operator must have a nuw or nsw flag.
   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
   if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
-      (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+      ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
+       (IsSExt && BinOp->hasNoSignedWrap())))
     return true;
 
   // Check if we can do the following simplification.
-  // sext(trunc(sext)) --> sext
+  // ext(trunc(opnd)) --> ext(opnd)
   if (!isa<TruncInst>(Inst))
     return false;
 
   Value *OpndVal = Inst->getOperand(0);
-  // Check if we can use this operand in the sext.
-  // If the type is larger than the result type of the sign extension,
+  // Check if we can use this operand in the extension.
+  // If the type is larger than the result type of the extension,
   // we cannot.
-  if (OpndVal->getType()->getIntegerBitWidth() >
-      ConsideredSExtType->getIntegerBitWidth())
+  if (!OpndVal->getType()->isIntegerTy() ||
+      OpndVal->getType()->getIntegerBitWidth() >
+          ConsideredExtType->getIntegerBitWidth())
     return false;
 
   // If the operand of the truncate is not an instruction, we will not have
@@ -1757,18 +2120,19 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
     return false;
 
   // Check if the source of the type is narrow enough.
-  // I.e., check that trunc just drops sign extended bits.
-  // #1 get the type of the operand.
+  // I.e., check that trunc just drops extended bits of the same kind as
+  // the extension.
+  // #1 get the type of the operand and check the kind of the extended bits.
   const Type *OpndType;
   InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
-  if (It != PromotedInsts.end())
-    OpndType = It->second;
-  else if (isa<SExtInst>(Opnd))
-    OpndType = cast<Instruction>(Opnd)->getOperand(0)->getType();
+  if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt)
+    OpndType = It->second.Ty;
+  else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
+    OpndType = Opnd->getOperand(0)->getType();
   else
     return false;
 
-  // #2 check that the truncate just drop sign extended bits.
+  // #2 check that the truncate just drops extended bits.
   if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth())
     return true;
 
@@ -1776,149 +2140,181 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
 }
 
 TypePromotionHelper::Action TypePromotionHelper::getAction(
-    Instruction *SExt, const SetOfInstrs &InsertedTruncs,
+    Instruction *Ext, const SetOfInstrs &InsertedTruncs,
     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
-  Instruction *SExtOpnd = dyn_cast<Instruction>(SExt->getOperand(0));
-  Type *SExtTy = SExt->getType();
-  // If the operand of the sign extension is not an instruction, we cannot
+  assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+         "Unexpected instruction type");
+  Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
+  Type *ExtTy = Ext->getType();
+  bool IsSExt = isa<SExtInst>(Ext);
+  // If the operand of the extension is not an instruction, we cannot
  // get through.
  // If it is, check that we can get through.
-  if (!SExtOpnd || !canGetThrough(SExtOpnd, SExtTy, PromotedInsts))
+  if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
     return nullptr;
 
   // Do not promote if the operand has been added by codegenprepare.
   // Otherwise, it means we are undoing an optimization that is likely to be
   // redone, thus causing potential infinite loop.
-  if (isa<TruncInst>(SExtOpnd) && InsertedTruncs.count(SExtOpnd))
+  if (isa<TruncInst>(ExtOpnd) && InsertedTruncs.count(ExtOpnd))
     return nullptr;
 
   // SExt or Trunc instructions.
   // Return the related handler.
-  if (isa<SExtInst>(SExtOpnd) || isa<TruncInst>(SExtOpnd))
-    return promoteOperandForTruncAndSExt;
+  if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
+      isa<ZExtInst>(ExtOpnd))
+    return promoteOperandForTruncAndAnyExt;
 
   // Regular instruction.
   // Abort early if we will have to insert non-free instructions.
-  if (!SExtOpnd->hasOneUse() &&
-      !TLI.isTruncateFree(SExtTy, SExtOpnd->getType()))
+  if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
     return nullptr;
-  return promoteOperandForOther;
+  return IsSExt ?
signExtendOperandForOther : zeroExtendOperandForOther; } -Value *TypePromotionHelper::promoteOperandForTruncAndSExt( +Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt( llvm::Instruction *SExt, TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) { + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs) { // By construction, the operand of SExt is an instruction. Otherwise we cannot // get through it and this method should not be called. Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); - // Replace sext(trunc(opnd)) or sext(sext(opnd)) - // => sext(opnd). - TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); + Value *ExtVal = SExt; + if (isa<ZExtInst>(SExtOpnd)) { + // Replace s|zext(zext(opnd)) + // => zext(opnd). + Value *ZExt = + TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType()); + TPT.replaceAllUsesWith(SExt, ZExt); + TPT.eraseInstruction(SExt); + ExtVal = ZExt; + } else { + // Replace z|sext(trunc(opnd)) or sext(sext(opnd)) + // => z|sext(opnd). + TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0)); + } CreatedInsts = 0; // Remove dead code. if (SExtOpnd->use_empty()) TPT.eraseInstruction(SExtOpnd); - // Check if the sext is still needed. - if (SExt->getType() != SExt->getOperand(0)->getType()) - return SExt; + // Check if the extension is still needed. + Instruction *ExtInst = dyn_cast<Instruction>(ExtVal); + if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) { + if (ExtInst && Exts) + Exts->push_back(ExtInst); + return ExtVal; + } - // At this point we have: sext ty opnd to ty. - // Reassign the uses of SExt to the opnd and remove SExt. - Value *NextVal = SExt->getOperand(0); - TPT.eraseInstruction(SExt, NextVal); + // At this point we have: ext ty opnd to ty. + // Reassign the uses of ExtInst to the opnd and remove ExtInst. + Value *NextVal = ExtInst->getOperand(0); + TPT.eraseInstruction(ExtInst, NextVal); return NextVal; } -Value * -TypePromotionHelper::promoteOperandForOther(Instruction *SExt, - TypePromotionTransaction &TPT, - InstrToOrigTy &PromotedInsts, - unsigned &CreatedInsts) { - // By construction, the operand of SExt is an instruction. Otherwise we cannot +Value *TypePromotionHelper::promoteOperandForOther( + Instruction *Ext, TypePromotionTransaction &TPT, + InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, + SmallVectorImpl<Instruction *> *Exts, + SmallVectorImpl<Instruction *> *Truncs, bool IsSExt) { + // By construction, the operand of Ext is an instruction. Otherwise we cannot // get through it and this method should not be called. - Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0)); + Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0)); CreatedInsts = 0; - if (!SExtOpnd->hasOneUse()) { - // SExtOpnd will be promoted. - // All its uses, but SExt, will need to use a truncated value of the + if (!ExtOpnd->hasOneUse()) { + // ExtOpnd will be promoted. + // All its uses, but Ext, will need to use a truncated value of the // promoted version. // Create the truncate now. - Instruction *Trunc = TPT.createTrunc(SExt, SExtOpnd->getType()); - Trunc->removeFromParent(); - // Insert it just after the definition. - Trunc->insertAfter(SExtOpnd); + Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType()); + if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) { + ITrunc->removeFromParent(); + // Insert it just after the definition. 
+      ITrunc->insertAfter(ExtOpnd);
+      if (Truncs)
+        Truncs->push_back(ITrunc);
+    }
 
-    TPT.replaceAllUsesWith(SExtOpnd, Trunc);
-    // Restore the operand of SExt (which has been replace by the previous call
+    TPT.replaceAllUsesWith(ExtOpnd, Trunc);
+    // Restore the operand of Ext (which has been replaced by the previous call
     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
-    TPT.setOperand(SExt, 0, SExtOpnd);
+    TPT.setOperand(Ext, 0, ExtOpnd);
   }
 
   // Get through the Instruction:
   // 1. Update its type.
-  // 2. Replace the uses of SExt by Inst.
-  // 3. Sign extend each operand that needs to be sign extended.
+  // 2. Replace the uses of Ext by Inst.
+  // 3. Extend each operand that needs to be extended.
 
   // Remember the original type of the instruction before promotion.
   // This is useful to know that the high bits are sign extended bits.
-  PromotedInsts.insert(
-      std::pair<Instruction *, Type *>(SExtOpnd, SExtOpnd->getType()));
+  PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
+      ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
   // Step #1.
-  TPT.mutateType(SExtOpnd, SExt->getType());
+  TPT.mutateType(ExtOpnd, Ext->getType());
   // Step #2.
-  TPT.replaceAllUsesWith(SExt, SExtOpnd);
+  TPT.replaceAllUsesWith(Ext, ExtOpnd);
   // Step #3.
-  Instruction *SExtForOpnd = SExt;
+  Instruction *ExtForOpnd = Ext;
 
-  DEBUG(dbgs() << "Propagate SExt to operands\n");
-  for (int OpIdx = 0, EndOpIdx = SExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
+  DEBUG(dbgs() << "Propagate Ext to operands\n");
+  for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
        ++OpIdx) {
-    DEBUG(dbgs() << "Operand:\n" << *(SExtOpnd->getOperand(OpIdx)) << '\n');
-    if (SExtOpnd->getOperand(OpIdx)->getType() == SExt->getType() ||
-        !shouldSExtOperand(SExtOpnd, OpIdx)) {
+    DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
+    if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
+        !shouldExtOperand(ExtOpnd, OpIdx)) {
      DEBUG(dbgs() << "No need to propagate\n");
      continue;
    }
-    // Check if we can statically sign extend the operand.
-    Value *Opnd = SExtOpnd->getOperand(OpIdx);
+    // Check if we can statically extend the operand.
+    Value *Opnd = ExtOpnd->getOperand(OpIdx);
    if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
-      DEBUG(dbgs() << "Statically sign extend\n");
-      TPT.setOperand(
-          SExtOpnd, OpIdx,
-          ConstantInt::getSigned(SExt->getType(), Cst->getSExtValue()));
+      DEBUG(dbgs() << "Statically extend\n");
+      unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
+      APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
+                            : Cst->getValue().zext(BitWidth);
+      TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
      continue;
    }
-    // UndefValue are typed, so we have to statically sign extend them.
+    // UndefValues are typed, so we have to statically extend them.
    if (isa<UndefValue>(Opnd)) {
-      DEBUG(dbgs() << "Statically sign extend\n");
-      TPT.setOperand(SExtOpnd, OpIdx, UndefValue::get(SExt->getType()));
+      DEBUG(dbgs() << "Statically extend\n");
+      TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
      continue;
    }
 
-    // Otherwise we have to explicity sign extend the operand.
-    // Check if SExt was reused to sign extend an operand.
-    if (!SExtForOpnd) {
+    // Otherwise we have to explicitly extend the operand.
+    // Check if Ext was reused to extend an operand.
+    if (!ExtForOpnd) {
      // If yes, create a new one.
-      DEBUG(dbgs() << "More operands to sext\n");
-      SExtForOpnd = TPT.createSExt(SExt, Opnd, SExt->getType());
+      DEBUG(dbgs() << "More operands to ext\n");
+      Value *ValForExtOpnd = IsSExt ?
TPT.createSExt(Ext, Opnd, Ext->getType()) + : TPT.createZExt(Ext, Opnd, Ext->getType()); + if (!isa<Instruction>(ValForExtOpnd)) { + TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd); + continue; + } + ExtForOpnd = cast<Instruction>(ValForExtOpnd); ++CreatedInsts; } - - TPT.setOperand(SExtForOpnd, 0, Opnd); + if (Exts) + Exts->push_back(ExtForOpnd); + TPT.setOperand(ExtForOpnd, 0, Opnd); // Move the sign extension before the insertion point. - TPT.moveBefore(SExtForOpnd, SExtOpnd); - TPT.setOperand(SExtOpnd, OpIdx, SExtForOpnd); + TPT.moveBefore(ExtForOpnd, ExtOpnd); + TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd); // If more sext are required, new instructions will have to be created. - SExtForOpnd = nullptr; + ExtForOpnd = nullptr; } - if (SExtForOpnd == SExt) { - DEBUG(dbgs() << "Sign extension is useless now\n"); - TPT.eraseInstruction(SExt); + if (ExtForOpnd == Ext) { + DEBUG(dbgs() << "Extension is useless now\n"); + TPT.eraseInstruction(Ext); } - return SExtOpnd; + return ExtOpnd; } /// IsPromotionProfitable - Check whether or not promoting an instruction @@ -1943,16 +2339,7 @@ AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize, // The promotion is neutral but it may help folding the sign extension in // loads for instance. // Check that we did not create an illegal instruction. - Instruction *PromotedInst = dyn_cast<Instruction>(PromotedOperand); - if (!PromotedInst) - return false; - int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode()); - // If the ISDOpcode is undefined, it was undefined before the promotion. - if (!ISDOpcode) - return true; - // Otherwise, check if the promoted instruction is legal or not. - return TLI.isOperationLegalOrCustom(ISDOpcode, - EVT::getEVT(PromotedInst->getType())); + return isPromotedInstructionLegal(TLI, PromotedOperand); } /// MatchOperationAddr - Given an instruction or constant expr, see if we can @@ -2130,31 +2517,33 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return true; } - case Instruction::SExt: { - Instruction *SExt = dyn_cast<Instruction>(AddrInst); - if (!SExt) + case Instruction::SExt: + case Instruction::ZExt: { + Instruction *Ext = dyn_cast<Instruction>(AddrInst); + if (!Ext) return false; - // Try to move this sext out of the way of the addressing mode. + // Try to move this ext out of the way of the addressing mode. // Ask for a method for doing so. - TypePromotionHelper::Action TPH = TypePromotionHelper::getAction( - SExt, InsertedTruncs, TLI, PromotedInsts); + TypePromotionHelper::Action TPH = + TypePromotionHelper::getAction(Ext, InsertedTruncs, TLI, PromotedInsts); if (!TPH) return false; TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); unsigned CreatedInsts = 0; - Value *PromotedOperand = TPH(SExt, TPT, PromotedInsts, CreatedInsts); + Value *PromotedOperand = + TPH(Ext, TPT, PromotedInsts, CreatedInsts, nullptr, nullptr); // SExt has been moved away. // Thus either it will be rematched later in the recursive calls or it is // gone. Anyway, we must not fold it into the addressing mode at this point. 
// E.g., // op = add opnd, 1 - // idx = sext op + // idx = ext op // addr = gep base, idx // is now: - // promotedOpnd = sext opnd <- no match here + // promotedOpnd = ext opnd <- no match here // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls) // addr = gep base, op <- match if (MovedAway) @@ -2293,10 +2682,10 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, /// Add the ultimately found memory instructions to MemoryUses. static bool FindAllMemoryUses(Instruction *I, SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses, - SmallPtrSet<Instruction*, 16> &ConsideredInsts, + SmallPtrSetImpl<Instruction*> &ConsideredInsts, const TargetLowering &TLI) { // If we already considered this instruction, we're done. - if (!ConsideredInsts.insert(I)) + if (!ConsideredInsts.insert(I).second) return false; // If this is an obviously unfoldable instruction, bail out. @@ -2510,7 +2899,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, worklist.pop_back(); // Break use-def graph loops. - if (!Visited.insert(V)) { + if (!Visited.insert(V).second) { Consensus = nullptr; break; } @@ -2700,8 +3089,8 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (ResultIndex) { - // We need to add this separately from the scale above to help with - // SDAG consecutive load/store merging. + // We need to add this separately from the scale above to help with + // SDAG consecutive load/store merging. if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); ResultPtr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr"); @@ -2844,26 +3233,186 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { return MadeChange; } +/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or +/// sign extensions. +static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) { + assert(!Inst->use_empty() && "Input must have at least one use"); + const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin()); + bool IsSExt = isa<SExtInst>(FirstUser); + Type *ExtTy = FirstUser->getType(); + for (const User *U : Inst->users()) { + const Instruction *UI = cast<Instruction>(U); + if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI))) + return false; + Type *CurTy = UI->getType(); + // Same input and output types: Same instruction after CSE. + if (CurTy == ExtTy) + continue; + + // If IsSExt is true, we are in this situation: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty1 a to ty3 + // Assuming ty2 is shorter than ty3, this could be turned into: + // a = Inst + // b = sext ty1 a to ty2 + // c = sext ty2 b to ty3 + // However, the last sext is not free. + if (IsSExt) + return false; + + // This is a ZExt, maybe this is free to extend from one type to another. + // In that case, we would not account for a different use. + Type *NarrowTy; + Type *LargeTy; + if (ExtTy->getScalarType()->getIntegerBitWidth() > + CurTy->getScalarType()->getIntegerBitWidth()) { + NarrowTy = CurTy; + LargeTy = ExtTy; + } else { + NarrowTy = ExtTy; + LargeTy = CurTy; + } + + if (!TLI.isZExtFree(NarrowTy, LargeTy)) + return false; + } + // All uses are the same or can be derived from one another for free. + return true; +} + +/// \brief Try to form ExtLd by promoting \p Exts until they reach a +/// load instruction. 
+/// If an ext(load) can be formed, it is returned via \p LI for the load
+/// and \p Inst for the extension.
+/// Otherwise LI == nullptr and Inst == nullptr.
+/// When some promotions happened, \p TPT contains the proper state to
+/// revert them.
+///
+/// \return true when promoting was necessary to expose the ext(load)
+/// opportunity, false otherwise.
+///
+/// Example:
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Thanks to the promotion, we can match zext(load i32*) to i64.
bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
                                    LoadInst *&LI, Instruction *&Inst,
                                    const SmallVectorImpl<Instruction *> &Exts,
                                    unsigned CreatedInsts = 0) {
+  // Iterate over all the extensions to see if one forms an ext(load).
+  for (auto I : Exts) {
+    // Check if we directly have ext(load).
+    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
+      Inst = I;
+      // No promotion happened here.
+      return false;
+    }
+    // Check whether or not we want to do any promotion.
+    if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+      continue;
+    // Get the action to perform the promotion.
+    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
+        I, InsertedTruncsSet, *TLI, PromotedInsts);
+    // Check if we can promote.
+    if (!TPH)
+      continue;
+    // Save the current state.
+    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+        TPT.getRestorationPoint();
+    SmallVector<Instruction *, 4> NewExts;
+    unsigned NewCreatedInsts = 0;
+    // Promote.
+    Value *PromotedVal =
+        TPH(I, TPT, PromotedInsts, NewCreatedInsts, &NewExts, nullptr);
+    assert(PromotedVal &&
+           "TypePromotionHelper should have filtered out those cases");
+
+    // We can merge only one extension into a load.
+    // Therefore, if we have more than 1 new extension we heuristically
+    // cut this search path, because it means we degrade the code quality.
+    // With exactly 2, the transformation is neutral, because we will merge
+    // one extension but leave one. However, we optimistically keep going,
+    // because the new extension may be removed too.
+    unsigned TotalCreatedInsts = CreatedInsts + NewCreatedInsts;
+    if (!StressExtLdPromotion &&
+        (TotalCreatedInsts > 1 ||
+         !isPromotedInstructionLegal(*TLI, PromotedVal))) {
+      // The promotion is not profitable, rollback to the previous state.
+      TPT.rollback(LastKnownGood);
+      continue;
+    }
+    // The promotion is profitable.
+    // Check if it exposes an ext(load).
+    (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInsts);
+    if (LI && (StressExtLdPromotion || NewCreatedInsts == 0 ||
+               // If we have created a new extension, i.e., now we have two
+               // extensions, we must make sure one of them is merged with
+               // the load, otherwise we may degrade the code quality.
+               (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
+      // Promotion happened.
+      return true;
+    // If this does not help to expose an ext(load), roll back.
+    TPT.rollback(LastKnownGood);
+  }
+  // None of the extensions can form an ext(load).
+  LI = nullptr;
+  Inst = nullptr;
+  return false;
+}
+
 /// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
 /// basic block as the load, unless conditions are unfavorable. This allows
 /// SelectionDAG to fold the extend into the load.
+/// \p I[in/out] the extension may be modified during the process if some
+/// promotions apply.
 ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+  // Try to promote a chain of computation if it allows us to form
+  // an extended load.
+  TypePromotionTransaction TPT;
+  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+      TPT.getRestorationPoint();
+  SmallVector<Instruction *, 1> Exts;
+  Exts.push_back(I);
   // Look for a load being extended.
-  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
-  if (!LI) return false;
+  LoadInst *LI = nullptr;
+  Instruction *OldExt = I;
+  bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts);
+  if (!LI || !I) {
+    assert(!HasPromoted && !LI && "If we did not match any load instruction "
+                                  "the code must remain the same");
+    I = OldExt;
+    return false;
+  }
 
   // If they're already in the same block, there's nothing to do.
-  if (LI->getParent() == I->getParent())
+  // Make the cheap checks first if we did not promote.
+  // If we promoted, we need to check if it is indeed profitable.
+  if (!HasPromoted && LI->getParent() == I->getParent())
     return false;
 
+  EVT VT = TLI->getValueType(I->getType());
+  EVT LoadVT = TLI->getValueType(LI->getType());
+
   // If the load has other users and the truncate is not free, this probably
   // isn't worthwhile.
-  if (!LI->hasOneUse() &&
-      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
-              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
-      !TLI->isTruncateFree(I->getType(), LI->getType()))
+  if (!LI->hasOneUse() && TLI &&
+      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+      !TLI->isTruncateFree(I->getType(), LI->getType())) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Check whether the target supports casts folded into loads.
   unsigned LType;
@@ -2873,11 +3422,15 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
     assert(isa<SExtInst>(I) && "Unexpected ext type!");
     LType = ISD::SEXTLOAD;
   }
-  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+  if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Move the extend into the same block as the load, so that SelectionDAG
   // can fold it.
+  TPT.commit();
   I->removeFromParent();
   I->insertAfter(LI);
   ++NumExtsMoved;
@@ -3109,7 +3662,511 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
   return MadeChange;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+namespace {
+/// \brief Helper class to promote a scalar operation to a vector one.
+/// This class is used to move the extractelement transition downward.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combined, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+  /// Used to perform some checks on the legality of vector operations.
+  const TargetLowering &TLI;
+
+  /// Used to estimate the cost of the promoted chain.
+  const TargetTransformInfo &TTI;
+
+  /// The transition being moved downwards.
+  Instruction *Transition;
+  /// The sequence of instructions to be promoted.
+  SmallVector<Instruction *, 4> InstsToBePromoted;
+  /// Cost of combining a store and an extract.
+  unsigned StoreExtractCombineCost;
+  /// Instruction that will be combined with the transition.
+  Instruction *CombineInst;
+
+  /// \brief The instruction that represents the current end of the transition.
+  /// Since we are faking the promotion until we reach the end of the chain
+  /// of computation, we need a way to get the current end of the transition.
+  Instruction *getEndOfTransition() const {
+    if (InstsToBePromoted.empty())
+      return Transition;
+    return InstsToBePromoted.back();
+  }
+
+  /// \brief Return the index of the original value in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+  /// c, is at index 0.
+  unsigned getTransitionOriginalValueIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 0;
+  }
+
+  /// \brief Return the index of the index in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+  /// is at index 1.
+  unsigned getTransitionIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 1;
+  }
+
+  /// \brief Get the type of the transition.
+  /// This is the type of the original value.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+  /// transition is <2 x i32>.
+  Type *getTransitionType() const {
+    return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+  }
+
+  /// \brief Promote \p ToBePromoted by moving \p Def downward through it.
+  /// I.e., we have the following sequence:
+  /// Def = Transition <ty1> a to <ty2>
+  /// b = ToBePromoted <ty2> Def, ...
+  /// =>
+  /// b = ToBePromoted <ty1> a, ...
+  /// Def = Transition <ty1> ToBePromoted to <ty2>
+  void promoteImpl(Instruction *ToBePromoted);
+
+  /// \brief Check whether or not it is profitable to promote all the
+  /// instructions enqueued to be promoted.
+  bool isProfitableToPromote() {
+    Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
+    unsigned Index = isa<ConstantInt>(ValIdx)
+                         ? cast<ConstantInt>(ValIdx)->getZExtValue()
+                         : -1;
+    Type *PromotedType = getTransitionType();
+
+    StoreInst *ST = cast<StoreInst>(CombineInst);
+    unsigned AS = ST->getPointerAddressSpace();
+    unsigned Align = ST->getAlignment();
+    // Check if this store is supported.
+    if (!TLI.allowsMisalignedMemoryAccesses(
+            TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+      // If this is not supported, there is no way we can combine
+      // the extract with the store.
+      return false;
+    }
+
+    // The scalar chain of computation has to pay for the transition
+    // scalar to vector.
+    // The vector chain has to account for the combining cost.
+    uint64_t ScalarCost =
+        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+    uint64_t VectorCost = StoreExtractCombineCost;
+    for (const auto &Inst : InstsToBePromoted) {
+      // Compute the cost.
+      // By construction, all instructions being promoted are arithmetic ones.
+      // Moreover, one argument is a constant that can be viewed as a splat
+      // constant.
+      Value *Arg0 = Inst->getOperand(0);
+      bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+                            isa<ConstantFP>(Arg0);
+      TargetTransformInfo::OperandValueKind Arg0OVK =
+          IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                         : TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Arg1OVK =
+          !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                          : TargetTransformInfo::OK_AnyValue;
+      ScalarCost += TTI.getArithmeticInstrCost(
+          Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+      VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+                                               Arg0OVK, Arg1OVK);
+    }
+    DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+                 << ScalarCost << "\nVector: " << VectorCost << '\n');
+    return ScalarCost > VectorCost;
+  }
+
+  /// \brief Generate a constant vector with \p Val and the same
+  /// number of elements as the transition.
+  /// \p UseSplat defines whether or not \p Val should be replicated
+  /// across the whole vector.
+  /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+  /// otherwise we generate a vector with as many undef as possible:
+  /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+  /// used at the index of the extract.
+  Value *getConstantVector(Constant *Val, bool UseSplat) const {
+    unsigned ExtractIdx = UINT_MAX;
+    if (!UseSplat) {
+      // If we cannot determine where the constant must be, we have to
+      // use a splat constant.
+      Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+      if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+        ExtractIdx = CstVal->getSExtValue();
+      else
+        UseSplat = true;
+    }
+
+    unsigned End = getTransitionType()->getVectorNumElements();
+    if (UseSplat)
+      return ConstantVector::getSplat(End, Val);
+
+    SmallVector<Constant *, 4> ConstVec;
+    UndefValue *UndefVal = UndefValue::get(Val->getType());
+    for (unsigned Idx = 0; Idx != End; ++Idx) {
+      if (Idx == ExtractIdx)
+        ConstVec.push_back(Val);
+      else
+        ConstVec.push_back(UndefVal);
+    }
+    return ConstantVector::get(ConstVec);
+  }
+
+  /// \brief Check if promoting the operand at \p OperandIdx in \p Use
+  /// to a vector type can trigger undefined behavior.
+  static bool canCauseUndefinedBehavior(const Instruction *Use,
+                                        unsigned OperandIdx) {
+    // It is not safe to introduce undef when the operand is on
+    // the right-hand side of a division-like instruction.
+    if (OperandIdx != 1)
+      return false;
+    switch (Use->getOpcode()) {
+    default:
+      return false;
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::SRem:
+    case Instruction::URem:
+      return true;
+    case Instruction::FDiv:
+    case Instruction::FRem:
+      return !Use->hasNoNaNs();
+    }
+    llvm_unreachable(nullptr);
+  }
+
+public:
+  VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
+                      Instruction *Transition, unsigned CombineCost)
+      : TLI(TLI), TTI(TTI), Transition(Transition),
+        StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
+    assert(Transition && "Do not know how to promote null");
+  }
+
+  /// \brief Check if we can promote \p ToBePromoted to a vector type.
+  bool canPromote(const Instruction *ToBePromoted) const {
+    // We could support CastInst too.
+    return isa<BinaryOperator>(ToBePromoted);
+  }
+
+  /// \brief Check if it is profitable to promote \p ToBePromoted
+  /// by moving the transition downward through it.
+  bool shouldPromote(const Instruction *ToBePromoted) const {
+    // Promote only if all the operands can be statically expanded.
+    // Indeed, we do not want to introduce any new kind of transitions.
+    for (const Use &U : ToBePromoted->operands()) {
+      const Value *Val = U.get();
+      if (Val == getEndOfTransition()) {
+        // If the use is a division and the transition is on the rhs,
+        // we cannot promote the operation, otherwise we may create a
+        // division by zero.
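+        // E.g. (an illustrative case): for 'udiv i32 %x, %ext', promoting
+        // the divisor means filling its remaining vector lanes, and an
+        // undef (or zero) lane there would be a division by zero.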
+        if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
+          return false;
+        continue;
+      }
+      if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
+          !isa<ConstantFP>(Val))
+        return false;
+    }
+    // Check that the resulting operation is legal.
+    int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+    if (!ISDOpcode)
+      return false;
+    return StressStoreExtract ||
+           TLI.isOperationLegalOrCustom(
+               ISDOpcode, TLI.getValueType(getTransitionType(), true));
+  }
+
+  /// \brief Check whether or not \p Use can be combined
+  /// with the transition.
+  /// I.e., is it possible to do Use(Transition) => AnotherUse?
+  bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+  /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+  void enqueueForPromotion(Instruction *ToBePromoted) {
+    InstsToBePromoted.push_back(ToBePromoted);
+  }
+
+  /// \brief Set the instruction that will be combined with the transition.
+  void recordCombineInstruction(Instruction *ToBeCombined) {
+    assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
+    CombineInst = ToBeCombined;
+  }
+
+  /// \brief Promote all the instructions enqueued for promotion if it is
+  /// profitable.
+  /// \return True if the promotion happened, false otherwise.
+  bool promote() {
+    // Check if there is something to promote.
+    // Right now, if we do not have anything to combine with,
+    // we assume the promotion is not profitable.
+    if (InstsToBePromoted.empty() || !CombineInst)
+      return false;
+
+    // Check cost.
+    if (!StressStoreExtract && !isProfitableToPromote())
+      return false;
+
+    // Promote.
+    for (auto &ToBePromoted : InstsToBePromoted)
+      promoteImpl(ToBePromoted);
+    InstsToBePromoted.clear();
+    return true;
+  }
+};
+} // End of anonymous namespace.
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+  // At this point, we know that all the operands of ToBePromoted but Def
+  // can be statically promoted.
+  // For Def, we need to use its parameter in ToBePromoted:
+  // b = ToBePromoted ty1 a
+  // Def = Transition ty1 b to ty2
+  // Move the transition down.
+  // 1. Replace all uses of the promoted operation by the transition.
+  // = ... b => = ... Def.
+  assert(ToBePromoted->getType() == Transition->getType() &&
+         "The type of the result of the transition does not match "
+         "the final type");
+  ToBePromoted->replaceAllUsesWith(Transition);
+  // 2. Update the type of the uses.
+  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+  Type *TransitionTy = getTransitionType();
+  ToBePromoted->mutateType(TransitionTy);
+  // 3. Update all the operands of the promoted operation with promoted
+  // operands.
+  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+  for (Use &U : ToBePromoted->operands()) {
+    Value *Val = U.get();
+    Value *NewVal = nullptr;
+    if (Val == Transition)
+      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+    else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
+             isa<ConstantFP>(Val)) {
+      // Use a splat constant if it is not safe to use undef.
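+      // E.g. (an illustrative case): with a <2 x i32> transition at index 0
+      // and the constant 7, an unsafe position yields <7, 7>, while a safe
+      // one yields <7, undef>.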
+      NewVal = getConstantVector(
+          cast<Constant>(Val),
+          isa<UndefValue>(Val) ||
+              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
+    } else
+      llvm_unreachable("Did you modify shouldPromote and forget to update "
+                       "this?");
+    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+  }
+  Transition->removeFromParent();
+  Transition->insertAfter(ToBePromoted);
+  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
+
+// See if we can speculate calls to intrinsic cttz/ctlz.
+//
+// Example:
+// entry:
+//   ...
+//   %cmp = icmp eq i64 %val, 0
+//   br i1 %cmp, label %end.bb, label %then.bb
+//
+// then.bb:
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
+//   br label %EndBB
+//
+// end.bb:
+//   %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
+//
+// ==>
+//
+// entry:
+//   ...
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
+//
+static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
+  assert(BrInst->isConditional() && "Expected a conditional branch!");
+  BasicBlock *ThenBB = BrInst->getSuccessor(1);
+  BasicBlock *EndBB = BrInst->getSuccessor(0);
+
+  // See if ThenBB contains only one instruction (excluding the
+  // terminator and DbgInfoIntrinsic calls).
+  IntrinsicInst *II = nullptr;
+  CastInst *CI = nullptr;
+  for (BasicBlock::iterator I = ThenBB->begin(),
+                            E = std::prev(ThenBB->end()); I != E; ++I) {
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Check if this is a zero extension or a truncate of a previously
+    // matched call to intrinsic cttz/ctlz.
+    if (II) {
+      // Early exit if we already found a "free" zero extend/truncate.
+      if (CI)
+        return false;
+
+      Type *SrcTy = II->getType();
+      Type *DestTy = I->getType();
+      Value *V;
+
+      if (match(cast<Instruction>(I), m_ZExt(m_Value(V))) && V == II) {
+        // Speculate this zero extend only if it is "free" for the target.
+        if (TLI.isZExtFree(SrcTy, DestTy)) {
+          CI = cast<CastInst>(I);
+          continue;
+        }
+      } else if (match(cast<Instruction>(I), m_Trunc(m_Value(V))) && V == II) {
+        // Speculate this truncate only if it is "free" for the target.
+        if (TLI.isTruncateFree(SrcTy, DestTy)) {
+          CI = cast<CastInst>(I);
+          continue;
+        }
+      } else {
+        // Avoid speculating more than one instruction.
+        return false;
+      }
+    }
+
+    // See if this is a call to intrinsic cttz/ctlz.
+    if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCttz())
+        return false;
+    }
+    else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCtlz())
+        return false;
+    } else
+      return false;
+
+    II = cast<IntrinsicInst>(I);
+  }
+
+  // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
+  BasicBlock *EntryBB = BrInst->getParent();
+  for (BasicBlock::iterator I = EndBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+    Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
+
+    if (!OrigV)
+      return false;
+
+    if (ThenV != II && (!CI || ThenV != CI))
+      return false;
+
+    if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
+      unsigned BitWidth = II->getType()->getIntegerBitWidth();
+
+      // Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
+      // intrinsic call but 'OrigV' is not equal to the bit width of the
+      // cttz/ctlz input value.
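+      // E.g. (an illustrative case): for an i64 cttz the incoming constant
+      // from the entry block must be 64, which is what cttz produces when
+      // its input is zero; any other constant makes hoisting unsound.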
+      if (CInt->getValue() != BitWidth)
+        return false;
+
+      // Hoist the call to cttz/ctlz from ThenBB into EntryBB.
+      EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
+                                    ThenBB->begin(), std::prev(ThenBB->end()));
+
+      // Update PN setting ThenV as the incoming value from both 'EntryBB'
+      // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
+      // phi node if all the incoming values are the same.
+      PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
+      PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
+
+      // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
+      if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
+        Type *Ty = II->getArgOperand(0)->getType();
+        Value *Args[] = { II->getArgOperand(0),
+                          ConstantInt::getFalse(II->getContext()) };
+        Module *M = EntryBB->getParent()->getParent();
+        Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
+        IRBuilder<> Builder(II);
+        Instruction *NewI = Builder.CreateCall(IF, Args);
+
+        // Replace the old call to cttz/ctlz.
+        II->replaceAllUsesWith(NewI);
+        II->eraseFromParent();
+      }
+
+      // Update BrInst condition so that the branch to EndBB is always taken.
+      // Later on, method 'ConstantFoldTerminator' will simplify this branch
+      // replacing it with a direct branch to 'EndBB'.
+      // As a side effect, CodeGenPrepare will attempt to simplify the control
+      // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
+      // 'EndBB' (calling method 'EliminateFallThrough').
+      BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
+bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+  unsigned CombineCost = UINT_MAX;
+  if (DisableStoreExtract || !TLI ||
+      (!StressStoreExtract &&
+       !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
+                                       Inst->getOperand(1), CombineCost)))
+    return false;
+
+  // At this point we know that Inst is a vector-to-scalar transition.
+  // Try to move it down the def-use chain, until:
+  // - We can combine the transition with its single use
+  //   => we got rid of the transition.
+  // - We escape the current basic block
+  //   => we would need to check that we are moving it to a cheaper place and
+  //      we do not do that for now.
+  BasicBlock *Parent = Inst->getParent();
+  DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+  VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+  // If the transition has more than one use, assume this is not going to be
+  // beneficial.
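+  // A typical chain this loop walks (illustrative):
+  //   %e = extractelement <2 x i32> %v, i32 0   ; the transition
+  //   %a = add i32 %e, 4                        ; enqueued for promotion
+  //   store i32 %a, i32* %p                     ; combined with the extract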
+  while (Inst->hasOneUse()) {
+    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+    if (ToBePromoted->getParent() != Parent) {
+      DEBUG(dbgs() << "Instruction to promote is in a different block ("
+                   << ToBePromoted->getParent()->getName()
+                   << ") than the transition (" << Parent->getName() << ").\n");
+      return false;
+    }
+
+    if (VPH.canCombine(ToBePromoted)) {
+      DEBUG(dbgs() << "Assume " << *Inst << '\n'
+                   << "will be combined with: " << *ToBePromoted << '\n');
+      VPH.recordCombineInstruction(ToBePromoted);
+      bool Changed = VPH.promote();
+      NumStoreExtractExposed += Changed;
+      return Changed;
+    }
+
+    DEBUG(dbgs() << "Try promoting.\n");
+    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+      return false;
+
+    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+    VPH.enqueueForPromotion(ToBePromoted);
+    Inst = ToBePromoted;
+  }
+  return false;
+}
+
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
@@ -3188,14 +4245,14 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      OptimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return OptimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
@@ -3203,20 +4260,53 @@
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return OptimizeShuffleVectorInst(SVI);
 
+  if (isa<ExtractElementInst>(I))
+    return OptimizeExtractElementInst(I);
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+    if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) {
+      // Check if the branch condition compares a value against zero.
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+        if (ICI->getPredicate() == ICmpInst::ICMP_EQ &&
+            match(ICI->getOperand(1), m_Zero())) {
+          BasicBlock *ThenBB = BI->getSuccessor(1);
+          BasicBlock *EndBB = BI->getSuccessor(0);
+
+          // Check if ThenBB is only reachable from this basic block; also,
+          // check if EndBB has more than one predecessor.
+          if (ThenBB->getSinglePredecessor() &&
+              !EndBB->getSinglePredecessor()) {
+            TerminatorInst *TI = ThenBB->getTerminator();
+
+            if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB &&
+                // Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'.
+                OptimizeBranchInst(BI, *TLI)) {
+              ModifiedDT = true;
+              return true;
+            }
+          }
+        }
+      }
+    }
+    return false;
+  }
+
   return false;
 }
 
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
   MadeChange |= DupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
@@ -3227,10 +4317,10 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
 // find a node corresponding to the value.
 bool CodeGenPrepare::PlaceDbgValues(Function &F) {
   bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+  for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      Instruction *Insn = BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone. These
       // intrinsics describe the address of a variable (= the alloca)
@@ -3324,3 +4414,233 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
   }
   return MadeChange;
 }
+
+/// \brief Retrieve the probabilities of a conditional branch. Returns true on
+/// success, or returns false if no or invalid metadata was found.
+static bool extractBranchMetadata(BranchInst *BI,
+                                  uint64_t &ProbTrue, uint64_t &ProbFalse) {
+  assert(BI->isConditional() &&
+         "Looking for probabilities on unconditional branch?");
+  auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3)
+    return false;
+
+  const auto *CITrue =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+  const auto *CIFalse =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
+  if (!CITrue || !CIFalse)
+    return false;
+
+  ProbTrue = CITrue->getValue().getZExtValue();
+  ProbFalse = CIFalse->getValue().getZExtValue();
+
+  return true;
+}
+
+/// \brief Scale down both weights to fit into uint32_t.
+static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
+  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
+  NewTrue = NewTrue / Scale;
+  NewFalse = NewFalse / Scale;
+}
+
+/// \brief Some targets prefer to split a conditional branch like:
+/// \code
+///   %0 = icmp ne i32 %a, 0
+///   %1 = icmp ne i32 %b, 0
+///   %or.cond = or i1 %0, %1
+///   br i1 %or.cond, label %TrueBB, label %FalseBB
+/// \endcode
+/// into multiple branch instructions like:
+/// \code
+///   bb1:
+///     %0 = icmp ne i32 %a, 0
+///     br i1 %0, label %TrueBB, label %bb2
+///   bb2:
+///     %1 = icmp ne i32 %b, 0
+///     br i1 %1, label %TrueBB, label %FalseBB
+/// \endcode
+/// This usually allows instruction selection to do even further optimizations
+/// and combine the compare with the branch instruction. Currently this is
+/// applied for targets which have "cheap" jump instructions.
+///
+/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
+///
+bool CodeGenPrepare::splitBranchCondition(Function &F) {
+  if (!TM || TM->Options.EnableFastISel != true ||
+      !TLI || TLI->isJumpExpensive())
+    return false;
+
+  bool MadeChange = false;
+  for (auto &BB : F) {
+    // Does this BB end with the following?
+    //   %cond1 = icmp|fcmp|binary instruction ...
+    //   %cond2 = icmp|fcmp|binary instruction ...
+    //   %cond.or = or|and i1 %cond1, %cond2
+    //   br i1 %cond.or label %dest1, label %dest2
+    BinaryOperator *LogicOp;
+    BasicBlock *TBB, *FBB;
+    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
+      continue;
+
+    unsigned Opc;
+    Value *Cond1, *Cond2;
+    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
+                             m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::And;
+    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
+                                 m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::Or;
+    else
+      continue;
+
+    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
+        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
+      continue;
+
+    DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+
+    // Create a new BB.
+    auto *InsertBefore = std::next(Function::iterator(BB))
+        .getNodePtrUnchecked();
+    auto TmpBB = BasicBlock::Create(BB.getContext(),
+                                    BB.getName() + ".cond.split",
+                                    BB.getParent(), InsertBefore);
+
+    // Update the original basic block to use the first condition directly in
+    // the branch instruction, removing the no longer needed and/or instruction.
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    Br1->setCondition(Cond1);
+    LogicOp->eraseFromParent();
+
+    // Depending on the condition we have to either replace the true or the
+    // false successor of the original branch instruction.
+    if (Opc == Instruction::And)
+      Br1->setSuccessor(0, TmpBB);
+    else
+      Br1->setSuccessor(1, TmpBB);
+
+    // Fill in the new basic block.
+    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
+    if (auto *I = dyn_cast<Instruction>(Cond2)) {
+      I->removeFromParent();
+      I->insertBefore(Br2);
+    }
+
+    // Update PHI nodes in both successors. The original BB needs to be
+    // replaced in one successor's PHI nodes, because the branch comes now from
+    // the newly generated BB (TmpBB). In the other successor we need to add one
+    // incoming edge to the PHI nodes, because both branch instructions target
+    // now the same successor. Depending on the original branch condition
+    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
+    // we perform the correct update for the PHI nodes.
+    // This doesn't change the successor order of the just created branch
+    // instruction (or any other instruction).
+    if (Opc == Instruction::Or)
+      std::swap(TBB, FBB);
+
+    // Replace the old BB with the new BB.
+    for (auto &I : *TBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      int i;
+      while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
+        PN->setIncomingBlock(i, TmpBB);
+    }
+
+    // Add another incoming edge from the new BB.
+    for (auto &I : *FBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      auto *Val = PN->getIncomingValueForBlock(&BB);
+      PN->addIncoming(Val, TmpBB);
+    }
+
+    // Update the branch weights (from SelectionDAGBuilder::
+    // FindMergedConditions).
+    if (Opc == Instruction::Or) {
+      // Codegen X | Y as:
+      // BB1:
+      //   jmp_if_X TBB
+      //   jmp TmpBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+
+      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+      // The requirement is that
+      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+      //     = TrueProb for original BB.
+      // Assuming the original weights are A and B, one choice is to set BB1's
+      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
+      // assumes that
+      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+      // Another choice is to assume TrueProb for BB1 equals TrueProb for
+      // TmpBB, but the math is more complicated.
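+      // Worked example (illustrative): with original weights A = 1 and
+      // B = 3, BB1 gets weights (1, 1 + 2*3) = (1, 7) and TmpBB gets
+      // (1, 2*3) = (1, 6) before scaling.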
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = TrueWeight;
+        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+        NewTrueWeight = TrueWeight;
+        NewFalseWeight = 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+      }
+    } else {
+      // Codegen X & Y as:
+      // BB1:
+      //   jmp_if_X TmpBB
+      //   jmp FBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+      //  This requires creation of TmpBB after CurBB.
+
+      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+      // The requirement is that
+      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+      //     = FalseProb for original BB.
+      // Assuming the original weights are A and B, one choice is to set BB1's
+      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
+      // assumes that
+      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
+        uint64_t NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+        NewTrueWeight = 2 * TrueWeight;
+        NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+      }
+    }
+
+    // Request DOM Tree update.
+    // Note: No point in getting fancy here, since the DT info is never
+    // available to CodeGenPrepare and the existing update code is broken
+    // anyway.
+ ModifiedDT = true; + + MadeChange = true; + + DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); + TmpBB->dump()); + } + return MadeChange; +} diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index d3ffcc78471b..3d62d4887602 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -20,24 +20,20 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "post-RA-sched" -CriticalAntiDepBreaker:: -CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) : - AntiDepBreaker(), MF(MFi), - MRI(MF.getRegInfo()), - TII(MF.getTarget().getInstrInfo()), - TRI(MF.getTarget().getRegisterInfo()), - RegClassInfo(RCI), - Classes(TRI->getNumRegs(), nullptr), - KillIndices(TRI->getNumRegs(), 0), - DefIndices(TRI->getNumRegs(), 0), - KeepRegs(TRI->getNumRegs(), false) {} +CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi, + const RegisterClassInfo &RCI) + : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), + TII(MF.getSubtarget().getInstrInfo()), + TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI), + Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0), + DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {} CriticalAntiDepBreaker::~CriticalAntiDepBreaker() { } @@ -94,7 +90,14 @@ void CriticalAntiDepBreaker::FinishBlock() { void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count, unsigned InsertPosIndex) { - if (MI->isDebugValue()) + // Kill instructions can define registers but are really nops, and there might + // be a real definition earlier that needs to be paired with uses dominated by + // this kill. + + // FIXME: It may be possible to remove the isKill() restriction once PR18663 + // has been properly fixed. There can be value in processing kills as seen in + // the AggressiveAntiDepBreaker class. + if (MI->isDebugValue() || MI->isKill()) return; assert(Count < InsertPosIndex && "Instruction index out of expected range!"); @@ -237,6 +240,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, // Update liveness. // Proceeding upwards, registers that are defed but not used in this // instruction are now dead. + assert(!MI->isKill() && "Attempting to scan a kill instruction"); if (!TII->isPredicated(MI)) { // Predicated defs are modeled as read + write, i.e. similar to two @@ -265,19 +269,10 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, // Ignore two-addr defs. if (MI->isRegTiedToUseOperand(i)) continue; - // FIXME: we should use a SubRegIterator that includes self (as above), so - // we don't have to repeat all this code for the reg itself. - DefIndices[Reg] = Count; - KillIndices[Reg] = ~0u; - assert(((KillIndices[Reg] == ~0u) != - (DefIndices[Reg] == ~0u)) && - "Kill and Def maps aren't consistent for Reg!"); - KeepRegs.reset(Reg); - Classes[Reg] = nullptr; - RegRefs.erase(Reg); - // Repeat, for all subregs. - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubregReg = *SubRegs; + // For the reg itself and all subregs: update the def to current; + // reset the kill state, any restrictions, and references. 
+ for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI) { + unsigned SubregReg = *SRI; DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; KeepRegs.reset(SubregReg); @@ -309,19 +304,9 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, RegRefs.insert(std::make_pair(Reg, &MO)); - // FIXME: we should use an MCRegAliasIterator that includes self so we don't - // have to repeat all this code for the reg itself. - // It wasn't previously live but now it is, this is a kill. - if (KillIndices[Reg] == ~0u) { - KillIndices[Reg] = Count; - DefIndices[Reg] = ~0u; - assert(((KillIndices[Reg] == ~0u) != - (DefIndices[Reg] == ~0u)) && - "Kill and Def maps aren't consistent for Reg!"); - } - // Repeat, for all aliases. - for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) { + // Repeat for all aliases. + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; if (KillIndices[AliasReg] == ~0u) { KillIndices[AliasReg] = Count; @@ -527,7 +512,14 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, unsigned Count = InsertPosIndex - 1; for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) { MachineInstr *MI = --I; - if (MI->isDebugValue()) + // Kill instructions can define registers but are really nops, and there + // might be a real definition earlier that needs to be paired with uses + // dominated by this kill. + + // FIXME: It may be possible to remove the isKill() restriction once PR18663 + // has been properly fixed. There can be value in processing kills as seen + // in the AggressiveAntiDepBreaker class. + if (MI->isDebugValue() || MI->isKill()) continue; // Check if this instruction has a dependence on the critical path that diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 45e4ff5c78b9..ceef74d1a440 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H -#define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H +#ifndef LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H +#define LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H #include "AntiDepBreaker.h" #include "llvm/ADT/BitVector.h" @@ -38,32 +38,32 @@ class TargetRegisterInfo; const TargetRegisterInfo *TRI; const RegisterClassInfo &RegClassInfo; - /// AllocatableSet - The set of allocatable registers. + /// The set of allocatable registers. /// We'll be ignoring anti-dependencies on non-allocatable registers, /// because they may not be safe to break. const BitVector AllocatableSet; - /// Classes - For live regs that are only used in one register class in a + /// For live regs that are only used in one register class in a /// live range, the register class. If the register is not live, the /// corresponding value is null. If the register is live but used in /// multiple register classes, the corresponding value is -1 casted to a /// pointer. std::vector<const TargetRegisterClass*> Classes; - /// RegRefs - Map registers to all their references within a live range. + /// Map registers to all their references within a live range. std::multimap<unsigned, MachineOperand *> RegRefs; typedef std::multimap<unsigned, MachineOperand *>::const_iterator RegRefIter; - /// KillIndices - The index of the most recent kill (proceeding bottom-up), + /// The index of the most recent kill (proceeding bottom-up), /// or ~0u if the register is not live. 
   std::vector<unsigned> KillIndices;
 
-  /// DefIndices - The index of the most recent complete def (proceeding
+  /// The index of the most recent complete def (proceeding
   /// bottom up), or ~0u if the register is live.
   std::vector<unsigned> DefIndices;
 
-  /// KeepRegs - A set of registers which are live and cannot be changed to
+  /// A set of registers which are live and cannot be changed to
   /// break anti-dependencies.
   BitVector KeepRegs;
 
@@ -71,26 +71,23 @@ class TargetRegisterInfo;
     CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&);
     ~CriticalAntiDepBreaker();
 
-    /// Start - Initialize anti-dep breaking for a new basic block.
+    /// Initialize anti-dep breaking for a new basic block.
     void StartBlock(MachineBasicBlock *BB) override;
 
-    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
-    /// path
+    /// Identify anti-dependencies along the critical path
     /// of the ScheduleDAG and break them by renaming registers.
-    ///
    unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
                                    unsigned InsertPosIndex,
                                    DbgValueVector &DbgValues) override;
 
-    /// Observe - Update liveness information to account for the current
+    /// Update liveness information to account for the current
     /// instruction, which will not be scheduled.
-    ///
     void Observe(MachineInstr *MI, unsigned Count,
                  unsigned InsertPosIndex) override;
 
-    /// Finish - Finish anti-dep breaking for a basic block.
+    /// Finish anti-dep breaking for a basic block.
     void FinishBlock() override;
 
   private:
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index bc6e9dc43e13..0a188c0935ad 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -106,16 +106,15 @@ namespace llvm {
 class DefaultVLIWScheduler : public ScheduleDAGInstrs {
 public:
   DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
-                       MachineDominatorTree &MDT, bool IsPostRA);
+                       bool IsPostRA);
   // Schedule - Actual scheduling work.
void schedule() override; }; } -DefaultVLIWScheduler::DefaultVLIWScheduler( - MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - bool IsPostRA) : - ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) { +DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, + MachineLoopInfo &MLI, bool IsPostRA) + : ScheduleDAGInstrs(MF, &MLI, IsPostRA) { CanHandleTerminators = true; } @@ -125,12 +124,12 @@ void DefaultVLIWScheduler::schedule() { } // VLIWPacketizerList Ctor -VLIWPacketizerList::VLIWPacketizerList( - MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - bool IsPostRA) : TM(MF.getTarget()), MF(MF) { - TII = TM.getInstrInfo(); - ResourceTracker = TII->CreateTargetScheduleState(&TM, nullptr); - VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); +VLIWPacketizerList::VLIWPacketizerList(MachineFunction &MF, + MachineLoopInfo &MLI, bool IsPostRA) + : MF(MF) { + TII = MF.getSubtarget().getInstrInfo(); + ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget()); + VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, IsPostRA); } // VLIWPacketizerList Dtor diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 2b144d89e651..c17a35d1c734 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -19,7 +19,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" + using namespace llvm; #define DEBUG_TYPE "codegen-dce" @@ -58,6 +59,10 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { if (MI->isInlineAsm()) return false; + // Don't delete frame allocation labels. + if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) + return false; + // Don't delete instructions with side effects. bool SawStore = false; if (!MI->isSafeToMove(TII, nullptr, SawStore) && !MI->isPHI()) @@ -90,8 +95,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { bool AnyChanges = false; MRI = &MF.getRegInfo(); - TRI = MF.getTarget().getRegisterInfo(); - TII = MF.getTarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); // Loop over all instructions in all blocks, from bottom to top, so that it's // more likely that chains of dependent but ultimately dead instructions will @@ -122,19 +127,10 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { if (isDead(MI)) { DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI); // It is possible that some DBG_VALUE instructions refer to this - // instruction. Examine each def operand for such references; - // if found, mark the DBG_VALUE as undef (but don't delete it). - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - MRI->markUsesInDebugValueAsUndef(Reg); - } + // instruction. They get marked as undef and will be deleted + // in the live debug variable analysis. 
+ MI->eraseFromParentAndMarkDBGValuesForRemoval(); AnyChanges = true; - MI->eraseFromParent(); ++NumDeletes; MIE = MBB->rend(); // MII is now pointing to the next instruction to process, diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index a195586cf624..75b74d9a6c33 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Pass.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -50,6 +51,11 @@ namespace { bool runOnFunction(Function &Fn) override; + bool doFinalization(Module &M) override { + RewindFunction = nullptr; + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { } const char *getPassName() const override { @@ -118,7 +124,7 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) { return false; // Find the rewind function if we didn't already. - const TargetLowering *TLI = TM->getTargetLowering(); + const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering(); if (!RewindFunction) { LLVMContext &Ctx = Resumes[0]->getContext(); FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index c4706328ea52..995606f65d77 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -153,8 +153,8 @@ private: public: /// runOnMachineFunction - Initialize per-function data structures. void runOnMachineFunction(MachineFunction &MF) { - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); LiveRegUnits.clear(); LiveRegUnits.setUniverse(TRI->getNumRegUnits()); @@ -245,7 +245,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { MachineInstr *DefMI = MRI->getVRegDef(Reg); if (!DefMI || DefMI->getParent() != Head) continue; - if (InsertAfter.insert(DefMI)) + if (InsertAfter.insert(DefMI).second) DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI); if (DefMI->isTerminator()) { DEBUG(dbgs() << "Can't insert instructions below terminator.\n"); @@ -580,7 +580,7 @@ namespace { class EarlyIfConverter : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - const MCSchedModel *SchedModel; + MCSchedModel SchedModel; MachineRegisterInfo *MRI; MachineDominatorTree *DomTree; MachineLoopInfo *Loops; @@ -688,7 +688,7 @@ bool EarlyIfConverter::shouldConvertIf() { FBBTrace.getCriticalPath()); // Set a somewhat arbitrary limit on the critical path extension we accept. - unsigned CritLimit = SchedModel->MispredictPenalty/2; + unsigned CritLimit = SchedModel.MispredictPenalty/2; // If-conversion only makes sense when there is unexploited ILP. Compute the // maximum-ILP resource length of the trace after if-conversion. 
  // Compare it
@@ -782,8 +782,8 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
       .enableEarlyIfConversion())
     return false;
 
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   SchedModel =
       MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
   MRI = &MF.getRegInfo();
diff --git a/lib/CodeGen/ErlangGC.cpp b/lib/CodeGen/ErlangGC.cpp
index e976d7fd9d82..85b089343ca1 100644
--- a/lib/CodeGen/ErlangGC.cpp
+++ b/lib/CodeGen/ErlangGC.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -53,7 +54,7 @@ ErlangGC::ErlangGC() {
 MCSymbol *ErlangGC::InsertLabel(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI,
                                 DebugLoc DL) const {
-  const TargetInstrInfo* TII = MBB.getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
   MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
   BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
   return Label;
 }
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index cf55b68b16f8..b3a22c83a808 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -29,7 +30,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "execution-fix"
@@ -73,6 +75,9 @@ struct DomainValue {
 
   // Is domain available?
   bool hasDomain(unsigned domain) const {
+    assert(domain <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
     return AvailableDomains & (1u << domain);
   }
 
@@ -132,7 +137,7 @@ class ExeDepsFix : public MachineFunctionPass {
   MachineFunction *MF;
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  std::vector<int> AliasMap;
+  std::vector<SmallVector<int, 1>> AliasMap;
   const unsigned NumRegs;
   LiveReg *LiveRegs;
   typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
@@ -168,8 +173,8 @@ public:
   }
 
 private:
-  // Register mapping.
-  int regIndex(unsigned Reg);
+  iterator_range<SmallVectorImpl<int>::const_iterator>
+  regIndizes(unsigned Reg) const;
 
   // DomainValue allocation.
   DomainValue *alloc(int domain = -1);
@@ -200,11 +205,13 @@ private:
 
 char ExeDepsFix::ID = 0;
 
-/// Translate TRI register number to an index into our smaller tables of
-/// interesting registers. Return -1 for boring registers.
-int ExeDepsFix::regIndex(unsigned Reg) {
+/// Translate TRI register number to a list of indices into our smaller tables
+/// of interesting registers.
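+/// E.g. (illustrative): a register that aliases several members of the
+/// interesting class maps to each of their indices, so a write to it
+/// updates every affected LiveRegs entry.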
+iterator_range<SmallVectorImpl<int>::const_iterator> +ExeDepsFix::regIndizes(unsigned Reg) const { assert(Reg < AliasMap.size() && "Invalid register"); - return AliasMap[Reg]; + const auto &Entry = AliasMap[Reg]; + return make_range(Entry.begin(), Entry.end()); } DomainValue *ExeDepsFix::alloc(int domain) { @@ -337,9 +344,11 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) { // All uses of B are referred to A. B->Next = retain(A); - for (unsigned rx = 0; rx != NumRegs; ++rx) + for (unsigned rx = 0; rx != NumRegs; ++rx) { + assert(LiveRegs && "no space allocated for live registers"); if (LiveRegs[rx].Value == B) setLiveReg(rx, A); + } return true; } @@ -369,13 +378,12 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { if (MBB->pred_empty()) { for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(), e = MBB->livein_end(); i != e; ++i) { - int rx = regIndex(*i); - if (rx < 0) - continue; - // Treat function live-ins as if they were defined just before the first - // instruction. Usually, function arguments are set up immediately - // before the call. - LiveRegs[rx].Def = -1; + for (int rx : regIndizes(*i)) { + // Treat function live-ins as if they were defined just before the first + // instruction. Usually, function arguments are set up immediately + // before the call. + LiveRegs[rx].Def = -1; + } } DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); return; @@ -466,26 +474,26 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) { /// or undef use. bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, unsigned Pref) { - int rx = regIndex(MI->getOperand(OpIdx).getReg()); - if (rx < 0) - return false; - - unsigned Clearance = CurInstr - LiveRegs[rx].Def; - DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); + unsigned reg = MI->getOperand(OpIdx).getReg(); + for (int rx : regIndizes(reg)) { + unsigned Clearance = CurInstr - LiveRegs[rx].Def; + DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); - if (Pref > Clearance) { - DEBUG(dbgs() << ": Break dependency.\n"); - return true; - } - // The current clearance seems OK, but we may be ignoring a def from a - // back-edge. - if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { - DEBUG(dbgs() << ": OK .\n"); + if (Pref > Clearance) { + DEBUG(dbgs() << ": Break dependency.\n"); + continue; + } + // The current clearance seems OK, but we may be ignoring a def from a + // back-edge. + if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { + DEBUG(dbgs() << ": OK .\n"); + return false; + } + // A def from an unprocessed back-edge may make us break this dependency. + DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); return false; } - // A def from an unprocessed back-edge may make us break this dependency. - DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); - return false; + return true; } // Update def-ages for registers defined by MI. @@ -513,26 +521,24 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { break; if (MO.isUse()) continue; - int rx = regIndex(MO.getReg()); - if (rx < 0) - continue; - - // This instruction explicitly defines rx. - DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr - << '\t' << *MI); - - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. 
- unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(MI, i, TRI); - - // How many instructions since rx was last written? - LiveRegs[rx].Def = CurInstr; - - // Kill off domains redefined by generic instructions. - if (Kill) - kill(rx); + for (int rx : regIndizes(MO.getReg())) { + // This instruction explicitly defines rx. + DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr + << '\t' << *MI); + + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(MI, i, TRI); + + // How many instructions since rx was last written? + LiveRegs[rx].Def = CurInstr; + + // Kill off domains redefined by generic instructions. + if (Kill) + kill(rx); + } } ++CurInstr; } @@ -581,19 +587,19 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { e = mi->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &mo = mi->getOperand(i); if (!mo.isReg()) continue; - int rx = regIndex(mo.getReg()); - if (rx < 0) continue; - force(rx, domain); + for (int rx : regIndizes(mo.getReg())) { + force(rx, domain); + } } // Kill all defs and force them. for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { MachineOperand &mo = mi->getOperand(i); if (!mo.isReg()) continue; - int rx = regIndex(mo.getReg()); - if (rx < 0) continue; - kill(rx); - force(rx, domain); + for (int rx : regIndizes(mo.getReg())) { + kill(rx); + force(rx, domain); + } } } @@ -610,9 +616,10 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { e = mi->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &mo = mi->getOperand(i); if (!mo.isReg()) continue; - int rx = regIndex(mo.getReg()); - if (rx < 0) continue; - if (DomainValue *dv = LiveRegs[rx].Value) { + for (int rx : regIndizes(mo.getReg())) { + DomainValue *dv = LiveRegs[rx].Value; + if (dv == nullptr) + continue; // Bitmask of domains that dv and available have in common. unsigned common = dv->getCommonDomains(available); // Is it possible to use this collapsed register for free? @@ -644,6 +651,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { SmallVector<LiveReg, 4> Regs; for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) { int rx = *i; + assert(LiveRegs && "no space allocated for live registers"); const LiveReg &LR = LiveRegs[rx]; // This useless DomainValue could have been missed above. if (!LR.Value->getCommonDomains(available)) { @@ -683,9 +691,11 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { continue; // If latest didn't merge, it is useless now. Kill all registers using it. - for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) - if (LiveRegs[*i].Value == Latest) - kill(*i); + for (int i : used) { + assert(LiveRegs && "no space allocated for live registers"); + if (LiveRegs[i].Value == Latest) + kill(i); + } } // dv is the DomainValue we are going to use for this instruction. 
@@ -702,24 +712,24 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { ii != ee; ++ii) { MachineOperand &mo = *ii; if (!mo.isReg()) continue; - int rx = regIndex(mo.getReg()); - if (rx < 0) continue; - if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) { - kill(rx); - setLiveReg(rx, dv); + for (int rx : regIndizes(mo.getReg())) { + if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) { + kill(rx); + setLiveReg(rx, dv); + } } } } bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { MF = &mf; - TII = MF->getTarget().getInstrInfo(); - TRI = MF->getTarget().getRegisterInfo(); + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); LiveRegs = nullptr; assert(NumRegs == RC->getNumRegs() && "Bad regclass"); DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: " - << RC->getName() << " **********\n"); + << TRI->getRegClassName(RC) << " **********\n"); // If no relevant registers are used in the function, we can skip it // completely. @@ -734,13 +744,13 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { // Initialize the AliasMap on the first use. if (AliasMap.empty()) { - // Given a PhysReg, AliasMap[PhysReg] is either the relevant index into RC, - // or -1. - AliasMap.resize(TRI->getNumRegs(), -1); + // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and + // therefore the LiveRegs array. + AliasMap.resize(TRI->getNumRegs()); for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); AI.isValid(); ++AI) - AliasMap[*AI] = i; + AliasMap[*AI].push_back(i); } MachineBasicBlock *Entry = MF->begin(); diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp index 90b62b5661e1..55e809e24278 100644 --- a/lib/CodeGen/ExpandISelPseudos.cpp +++ b/lib/CodeGen/ExpandISelPseudos.cpp @@ -19,7 +19,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "expand-isel-pseudos" @@ -46,7 +46,7 @@ INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos", bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - const TargetLowering *TLI = MF.getTarget().getTargetLowering(); + const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); // Iterate through each instruction in the function, looking for pseudos. 
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index 8969bccd4603..e7bf143b82ee 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -20,8 +20,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + using namespace llvm; #define DEBUG_TYPE "postrapseudos" @@ -182,8 +183,8 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Machine Function\n" << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n" << "********** Function: " << MF.getName() << '\n'); - TRI = MF.getTarget().getRegisterInfo(); - TII = MF.getTarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); bool MadeChange = false; diff --git a/lib/CodeGen/ForwardControlFlowIntegrity.cpp b/lib/CodeGen/ForwardControlFlowIntegrity.cpp new file mode 100644 index 000000000000..63c3699e13b6 --- /dev/null +++ b/lib/CodeGen/ForwardControlFlowIntegrity.cpp @@ -0,0 +1,374 @@ +//===-- ForwardControlFlowIntegrity.cpp: Forward-Edge CFI -----------------===// +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief A pass that instruments code with fast checks for indirect calls and +/// hooks for a function to check violations. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cfi" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/JumpInstrTableInfo.h" +#include "llvm/CodeGen/ForwardControlFlowIntegrity.h" +#include "llvm/CodeGen/JumpInstrTables.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +STATISTIC(NumCFIIndirectCalls, + "Number of indirect call sites rewritten by the CFI pass"); + +char ForwardControlFlowIntegrity::ID = 0; +INITIALIZE_PASS_BEGIN(ForwardControlFlowIntegrity, "forward-cfi", + "Control-Flow Integrity", true, true) +INITIALIZE_PASS_DEPENDENCY(JumpInstrTableInfo); +INITIALIZE_PASS_DEPENDENCY(JumpInstrTables); +INITIALIZE_PASS_END(ForwardControlFlowIntegrity, "forward-cfi", + "Control-Flow Integrity", true, true) + +ModulePass *llvm::createForwardControlFlowIntegrityPass() { + return new ForwardControlFlowIntegrity(); +} + +ModulePass *llvm::createForwardControlFlowIntegrityPass( + JumpTable::JumpTableType JTT, CFIntegrity CFIType, bool CFIEnforcing, + StringRef CFIFuncName) { + return new ForwardControlFlowIntegrity(JTT, CFIType, CFIEnforcing, + CFIFuncName); +} + +// Checks to see if a given CallSite is making an indirect call, 
including +// cases where the indirect call is made through a bitcast. +static bool isIndirectCall(CallSite &CS) { + if (CS.getCalledFunction()) + return false; + + // Check the value to see if it is merely a bitcast of a function. In + // this case, it will translate to a direct function call in the resulting + // assembly, so we won't treat it as an indirect call here. + const Value *V = CS.getCalledValue(); + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { + return !(CE->isCast() && isa<Function>(CE->getOperand(0))); + } + + // Otherwise, since we know it's a call, it must be an indirect call + return true; +} + +static const char cfi_failure_func_name[] = "__llvm_cfi_pointer_warning"; + +ForwardControlFlowIntegrity::ForwardControlFlowIntegrity() + : ModulePass(ID), IndirectCalls(), JTType(JumpTable::Single), + CFIType(CFIntegrity::Sub), CFIEnforcing(false), CFIFuncName("") { + initializeForwardControlFlowIntegrityPass(*PassRegistry::getPassRegistry()); +} + +ForwardControlFlowIntegrity::ForwardControlFlowIntegrity( + JumpTable::JumpTableType JTT, CFIntegrity CFIType, bool CFIEnforcing, + std::string CFIFuncName) + : ModulePass(ID), IndirectCalls(), JTType(JTT), CFIType(CFIType), + CFIEnforcing(CFIEnforcing), CFIFuncName(CFIFuncName) { + initializeForwardControlFlowIntegrityPass(*PassRegistry::getPassRegistry()); +} + +ForwardControlFlowIntegrity::~ForwardControlFlowIntegrity() {} + +void ForwardControlFlowIntegrity::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<JumpInstrTableInfo>(); + AU.addRequired<JumpInstrTables>(); +} + +void ForwardControlFlowIntegrity::getIndirectCalls(Module &M) { + // To get the indirect calls, we iterate over all functions and iterate over + // the list of basic blocks in each. We extract a total list of indirect calls + // before modifying any of them, since our modifications will modify the list + // of basic blocks. + for (Function &F : M) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (!(CS && isIndirectCall(CS))) + continue; + + Value *CalledValue = CS.getCalledValue(); + + // Don't rewrite this instruction if the indirect call is actually just + // inline assembly, since our transformation will generate an invalid + // module in that case. + if (isa<InlineAsm>(CalledValue)) + continue; + + IndirectCalls.push_back(&I); + } + } + } +} + +void ForwardControlFlowIntegrity::updateIndirectCalls(Module &M, + CFITables &CFIT) { + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + for (Instruction *I : IndirectCalls) { + CallSite CS(I); + Value *CalledValue = CS.getCalledValue(); + + // Get the function type for this call and look it up in the tables. + Type *VTy = CalledValue->getType(); + PointerType *PTy = dyn_cast<PointerType>(VTy); + Type *EltTy = PTy->getElementType(); + FunctionType *FunTy = dyn_cast<FunctionType>(EltTy); + FunctionType *TransformedTy = JumpInstrTables::transformType(JTType, FunTy); + ++NumCFIIndirectCalls; + Constant *JumpTableStart = nullptr; + Constant *JumpTableMask = nullptr; + Constant *JumpTableSize = nullptr; + + // Some call sites have function types that don't correspond to any + // address-taken function in the module. This happens when function pointers + // are passed in from external code. + auto it = CFIT.find(TransformedTy); + if (it == CFIT.end()) { + // In this case, make sure that the function pointer will change by + // setting the mask and the start to be 0 so that the transformed + // function is 0. 
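Why zeroing both constants works, and what the rotate-based variant emitted further down by rewriteFunctionPointer buys, is easiest to see with plain integers standing in for code pointers. A minimal standalone sketch with invented constants (this is an illustration, not code from this patch):

#include <cassert>
#include <cstdint>

// Sub flavor: clamp Ptr into the table by subtracting the table start,
// masking the difference down to an in-table offset, and adding it back.
static uint64_t clampToTable(uint64_t Ptr, uint64_t Start, uint64_t Mask) {
  return ((Ptr - Start) & Mask) + Start;
}

// Ror flavor: rotr(Ptr - Start, LogAlign) < Size holds exactly for pointers
// that are aligned table entries, since misalignment bits rotate to the top.
static uint64_t rotr64(uint64_t V, unsigned N) {
  return (V >> N) | (V << (64 - N)); // N must be in (0, 64)
}

int main() {
  // An invented 8-entry table of 8-byte-aligned stubs based at 0x1000; the
  // mask is built as in runOnModule(): ((TableSize << LogAlign) - 1) & -Align.
  const uint64_t Start = 0x1000, Mask = ((8 << 3) - 1) & -8;
  assert(clampToTable(0x1010, Start, Mask) == 0x1010);   // valid entry: kept
  assert(clampToTable(0xdead00, Start, Mask) == 0x1000); // rogue: forced back
  // Start == Mask == 0 (no table for this type) maps every pointer to 0, so
  // the non-enforcing equality check always detects a change.
  assert(clampToTable(0xdead00, 0, 0) == 0);

  assert(rotr64(0x1018 - Start, 3) < 8);    // entry 3: in range
  assert(!(rotr64(0x1040 - Start, 3) < 8)); // one past the end
  assert(!(rotr64(0x1001 - Start, 3) < 8)); // misaligned pointer
  return 0;
}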
+ JumpTableStart = ConstantInt::get(Int64Ty, 0); + JumpTableMask = ConstantInt::get(Int64Ty, 0); + JumpTableSize = ConstantInt::get(Int64Ty, 0); + } else { + JumpTableStart = it->second.StartValue; + JumpTableMask = it->second.MaskValue; + JumpTableSize = it->second.Size; + } + + rewriteFunctionPointer(M, I, CalledValue, JumpTableStart, JumpTableMask, + JumpTableSize); + } + + return; +} + +bool ForwardControlFlowIntegrity::runOnModule(Module &M) { + JumpInstrTableInfo *JITI = &getAnalysis<JumpInstrTableInfo>(); + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + + // JumpInstrTableInfo stores information about the alignment of each entry. + // The alignment returned by JumpInstrTableInfo is alignment in bytes, not + // in the exponent. + ByteAlignment = JITI->entryByteAlignment(); + LogByteAlignment = llvm::Log2_64(ByteAlignment); + + // Set up tables for control-flow integrity based on information about the + // jump-instruction tables. + CFITables CFIT; + for (const auto &KV : JITI->getTables()) { + uint64_t Size = static_cast<uint64_t>(KV.second.size()); + uint64_t TableSize = NextPowerOf2(Size); + + int64_t MaskValue = ((TableSize << LogByteAlignment) - 1) & -ByteAlignment; + Constant *JumpTableMaskValue = ConstantInt::get(Int64Ty, MaskValue); + Constant *JumpTableSize = ConstantInt::get(Int64Ty, Size); + + // The base of the table is defined to be the first jumptable function in + // the table. + Function *First = KV.second.begin()->second; + Constant *JumpTableStartValue = ConstantExpr::getBitCast(First, VoidPtrTy); + CFIT[KV.first].StartValue = JumpTableStartValue; + CFIT[KV.first].MaskValue = JumpTableMaskValue; + CFIT[KV.first].Size = JumpTableSize; + } + + if (CFIT.empty()) + return false; + + getIndirectCalls(M); + + if (!CFIEnforcing) { + addWarningFunction(M); + } + + // Update the instructions with the check and the indirect jump through our + // table. + updateIndirectCalls(M, CFIT); + + return true; +} + +void ForwardControlFlowIntegrity::addWarningFunction(Module &M) { + PointerType *CharPtrTy = Type::getInt8PtrTy(M.getContext()); + + // Get the type of the Warning Function: void (i8*, i8*), + // where the first argument is the name of the function in which the violation + // occurs, and the second is the function pointer that violates CFI. + SmallVector<Type *, 2> WarningFunArgs; + WarningFunArgs.push_back(CharPtrTy); + WarningFunArgs.push_back(CharPtrTy); + FunctionType *WarningFunTy = + FunctionType::get(Type::getVoidTy(M.getContext()), WarningFunArgs, false); + + if (!CFIFuncName.empty()) { + Constant *FailureFun = M.getOrInsertFunction(CFIFuncName, WarningFunTy); + if (!FailureFun) + report_fatal_error("Could not get or insert the function specified by" + " -cfi-func-name"); + } else { + // The default warning function swallows the warning and lets the call + // continue, since there's no generic way for it to print out this + // information. 
+ Function *WarningFun = M.getFunction(cfi_failure_func_name); + if (!WarningFun) { + WarningFun = + Function::Create(WarningFunTy, GlobalValue::LinkOnceAnyLinkage, + cfi_failure_func_name, &M); + } + + BasicBlock *Entry = + BasicBlock::Create(M.getContext(), "entry", WarningFun, 0); + ReturnInst::Create(M.getContext(), Entry); + } +} + +void ForwardControlFlowIntegrity::rewriteFunctionPointer( + Module &M, Instruction *I, Value *FunPtr, Constant *JumpTableStart, + Constant *JumpTableMask, Constant *JumpTableSize) { + IRBuilder<> TempBuilder(I); + + Type *OrigFunType = FunPtr->getType(); + + BasicBlock *CurBB = cast<BasicBlock>(I->getParent()); + Function *CurF = cast<Function>(CurBB->getParent()); + Type *Int64Ty = Type::getInt64Ty(M.getContext()); + + Value *TI = TempBuilder.CreatePtrToInt(FunPtr, Int64Ty); + Value *TStartInt = TempBuilder.CreatePtrToInt(JumpTableStart, Int64Ty); + + Value *NewFunPtr = nullptr; + Value *Check = nullptr; + switch (CFIType) { + case CFIntegrity::Sub: { + // This is the subtract, mask, and add version. + // Subtract from the base. + Value *Sub = TempBuilder.CreateSub(TI, TStartInt); + + // Mask the difference to force this to be a table offset. + Value *And = TempBuilder.CreateAnd(Sub, JumpTableMask); + + // Add it back to the base. + Value *Result = TempBuilder.CreateAdd(And, TStartInt); + + // Convert it back into a function pointer that we can call. + NewFunPtr = TempBuilder.CreateIntToPtr(Result, OrigFunType); + break; + } + case CFIntegrity::Ror: { + // This is the subtract and rotate version. + // Rotate right by the alignment value. The optimizer should recognize + // this sequence as a rotation. + + // This cast is safe, since unsigned is always a subset of uint64_t. + uint64_t LogByteAlignment64 = static_cast<uint64_t>(LogByteAlignment); + Constant *RightShift = ConstantInt::get(Int64Ty, LogByteAlignment64); + Constant *LeftShift = ConstantInt::get(Int64Ty, 64 - LogByteAlignment64); + + // Subtract from the base. + Value *Sub = TempBuilder.CreateSub(TI, TStartInt); + + // Create the equivalent of a rotate-right instruction. + Value *Shr = TempBuilder.CreateLShr(Sub, RightShift); + Value *Shl = TempBuilder.CreateShl(Sub, LeftShift); + Value *Or = TempBuilder.CreateOr(Shr, Shl); + + // Perform unsigned comparison to check for inclusion in the table. + Check = TempBuilder.CreateICmpULT(Or, JumpTableSize); + NewFunPtr = FunPtr; + break; + } + case CFIntegrity::Add: { + // This is the mask and add version. + // Mask the function pointer to turn it into an offset into the table. + Value *And = TempBuilder.CreateAnd(TI, JumpTableMask); + + // Then or this offset to the base and get the pointer value. + Value *Result = TempBuilder.CreateAdd(And, TStartInt); + + // Convert it back into a function pointer that we can call. + NewFunPtr = TempBuilder.CreateIntToPtr(Result, OrigFunType); + break; + } + } + + if (!CFIEnforcing) { + // If a check hasn't been added (in the rotation version), then check to see + // if it's the same as the original function. This check determines whether + // or not we call the CFI failure function. + if (!Check) + Check = TempBuilder.CreateICmpEQ(NewFunPtr, FunPtr); + BasicBlock *InvalidPtrBlock = + BasicBlock::Create(M.getContext(), "invalid.ptr", CurF, 0); + BasicBlock *ContinuationBB = CurBB->splitBasicBlock(I); + + // Remove the unconditional branch that connects the two blocks. + TerminatorInst *TermInst = CurBB->getTerminator(); + TermInst->eraseFromParent(); + + // Add a conditional branch that depends on the Check above. 
+ BranchInst::Create(ContinuationBB, InvalidPtrBlock, Check, CurBB); + + // Call the warning function for this pointer, then continue. + Instruction *BI = BranchInst::Create(ContinuationBB, InvalidPtrBlock); + insertWarning(M, InvalidPtrBlock, BI, FunPtr); + } else { + // Modify the instruction to call this value. + CallSite CS(I); + CS.setCalledFunction(NewFunPtr); + } +} + +void ForwardControlFlowIntegrity::insertWarning(Module &M, BasicBlock *Block, + Instruction *I, Value *FunPtr) { + Function *ParentFun = cast<Function>(Block->getParent()); + + // Get the function to call right before the instruction. + Function *WarningFun = nullptr; + if (CFIFuncName.empty()) { + WarningFun = M.getFunction(cfi_failure_func_name); + } else { + WarningFun = M.getFunction(CFIFuncName); + } + + assert(WarningFun && "Could not find the CFI failure function"); + + Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + + IRBuilder<> WarningInserter(I); + // Create a mergeable GlobalVariable containing the name of the function. + Value *ParentNameGV = + WarningInserter.CreateGlobalString(ParentFun->getName()); + Value *ParentNamePtr = WarningInserter.CreateBitCast(ParentNameGV, VoidPtrTy); + Value *FunVoidPtr = WarningInserter.CreateBitCast(FunPtr, VoidPtrTy); + WarningInserter.CreateCall2(WarningFun, ParentNamePtr, FunVoidPtr); +} diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index c3e4f3ee2fb1..6101c679a5d3 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -71,9 +71,8 @@ GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M, E = GCRegistry::end(); I != E; ++I) { if (Name == I->getName()) { std::unique_ptr<GCStrategy> S = I->instantiate(); - S->M = M; S->Name = Name; - StrategyMap.GetOrCreateValue(Name).setValue(S.get()); + StrategyMap[Name] = S.get(); StrategyList.push_back(std::move(S)); return StrategyList.back().get(); } @@ -92,12 +91,14 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) { return *I->second; GCStrategy *S = getOrCreateStrategy(F.getParent(), F.getGC()); - GCFunctionInfo *GFI = S->insertFunctionInfo(F); + Functions.push_back(make_unique<GCFunctionInfo>(F, *S)); + GCFunctionInfo *GFI = Functions.back().get(); FInfoMap[&F] = GFI; return *GFI; } void GCModuleInfo::clear() { + Functions.clear(); FInfoMap.clear(); StrategyMap.clear(); StrategyList.clear(); diff --git a/lib/CodeGen/GCMetadataPrinter.cpp b/lib/CodeGen/GCMetadataPrinter.cpp index f80e9ced0bc9..fdff4a78e079 100644 --- a/lib/CodeGen/GCMetadataPrinter.cpp +++ b/lib/CodeGen/GCMetadataPrinter.cpp @@ -17,11 +17,3 @@ using namespace llvm; GCMetadataPrinter::GCMetadataPrinter() { } GCMetadataPrinter::~GCMetadataPrinter() { } - -void GCMetadataPrinter::beginAssembly(AsmPrinter &AP) { - // Default is no action. -} - -void GCMetadataPrinter::finishAssembly(AsmPrinter &AP) { - // Default is no action. 
-} diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 1fdff6bbf106..05c36fcb00d4 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -31,6 +31,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -92,6 +93,7 @@ namespace { // ----------------------------------------------------------------------------- GCStrategy::GCStrategy() : + UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false), CustomWriteBarriers(false), @@ -101,25 +103,6 @@ GCStrategy::GCStrategy() : UsesMetadata(false) {} -bool GCStrategy::initializeCustomLowering(Module &M) { return false; } - -bool GCStrategy::performCustomLowering(Function &F) { - dbgs() << "gc " << getName() << " must override performCustomLowering.\n"; - llvm_unreachable("must override performCustomLowering"); -} - - -bool GCStrategy::findCustomSafePoints(GCFunctionInfo& FI, MachineFunction &F) { - dbgs() << "gc " << getName() << " must override findCustomSafePoints.\n"; - llvm_unreachable(nullptr); -} - - -GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) { - Functions.push_back(make_unique<GCFunctionInfo>(F, *this)); - return Functions.back().get(); -} - // ----------------------------------------------------------------------------- INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", @@ -377,7 +360,7 @@ void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) { } void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) { - const TargetFrameLowering *TFI = TM->getFrameLowering(); + const TargetFrameLowering *TFI = TM->getSubtargetImpl()->getFrameLowering(); assert(TFI && "TargetRegisterInfo not available!"); for (GCFunctionInfo::roots_iterator RI = FI->roots_begin(); @@ -403,7 +386,7 @@ bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) { TM = &MF.getTarget(); MMI = &getAnalysis<MachineModuleInfo>(); - TII = TM->getInstrInfo(); + TII = TM->getSubtargetImpl()->getInstrInfo(); // Find the size of the stack frame. 
FI->setFrameSize(MF.getFrameInfo()->getStackSize()); diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index 5572a062912f..82575671e681 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -54,6 +54,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -64,10 +65,10 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "global-merge" @@ -142,7 +143,7 @@ INITIALIZE_TM_PASS(GlobalMerge, "global-merge", "Merge global variables", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { - const TargetLowering *TLI = TM->getTargetLowering(); + const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering(); const DataLayout *DL = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users @@ -281,7 +282,7 @@ bool GlobalMerge::doInitialization(Module &M) { DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, BSSGlobals; - const TargetLowering *TLI = TM->getTargetLowering(); + const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering(); const DataLayout *DL = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 1502d5f23efe..e84d25d96961 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -30,7 +31,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -161,6 +161,7 @@ namespace { const TargetLoweringBase *TLI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + const MachineBlockFrequencyInfo *MBFI; const MachineBranchProbabilityInfo *MBPI; MachineRegisterInfo *MRI; @@ -177,6 +178,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineBranchProbabilityInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -269,15 +271,16 @@ INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { - TLI = MF.getTarget().getTargetLowering(); - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TLI = MF.getSubtarget().getTargetLowering(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); MRI = 
&MF.getRegInfo(); const TargetSubtargetInfo &ST = MF.getTarget().getSubtarget<TargetSubtargetInfo>(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); + SchedModel.init(ST.getSchedModel(), &ST, TII); if (!TII) return false; @@ -286,9 +289,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { bool BFChange = false; if (!PreRegAlloc) { // Tail merge tend to expose more if-conversion opportunities. - BranchFolder BF(true, false); - BFChange = BF.OptimizeFunction(MF, TII, - MF.getTarget().getRegisterInfo(), + BranchFolder BF(true, false, *MBFI, *MBPI); + BFChange = BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); } @@ -420,9 +422,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { BBAnalysis.clear(); if (MadeChange && IfCvtBranchFold) { - BranchFolder BF(false, false); - BF.OptimizeFunction(MF, TII, - MF.getTarget().getRegisterInfo(), + BranchFolder BF(false, false, *MBFI, *MBPI); + BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable<MachineModuleInfo>()); } @@ -940,9 +941,8 @@ static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) { /// to determine if it can be if-converted. If predecessor is already enqueued, /// dequeue it! void IfConverter::InvalidatePreds(MachineBasicBlock *BB) { - for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(), - E = BB->pred_end(); PI != E; ++PI) { - BBInfo &PBBI = BBAnalysis[(*PI)->getNumber()]; + for (const auto &Predecessor : BB->predecessors()) { + BBInfo &PBBI = BBAnalysis[Predecessor->getNumber()]; if (PBBI.IsDone || PBBI.BB == BB) continue; PBBI.IsAnalyzed = false; @@ -1184,6 +1184,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { bool HasEarlyExit = CvtBBI->FalseBB != nullptr; uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; uint32_t WeightScale = 0; + if (HasEarlyExit) { // Get weights before modifying CvtBBI->BB and BBI.BB. 
CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); @@ -1192,6 +1193,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); } + if (CvtBBI->BB->pred_size() > 1) { BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index f3c8d3dc7fba..df889f76412c 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -139,21 +138,16 @@ private: ~InlineSpiller() {} public: - InlineSpiller(MachineFunctionPass &pass, - MachineFunction &mf, - VirtRegMap &vrm) - : MF(mf), - LIS(pass.getAnalysis<LiveIntervals>()), - LSS(pass.getAnalysis<LiveStacks>()), - AA(&pass.getAnalysis<AliasAnalysis>()), - MDT(pass.getAnalysis<MachineDominatorTree>()), - Loops(pass.getAnalysis<MachineLoopInfo>()), - VRM(vrm), - MFI(*mf.getFrameInfo()), - MRI(mf.getRegInfo()), - TII(*mf.getTarget().getInstrInfo()), - TRI(*mf.getTarget().getRegisterInfo()), - MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {} + InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) + : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()), + LSS(pass.getAnalysis<LiveStacks>()), + AA(&pass.getAnalysis<AliasAnalysis>()), + MDT(pass.getAnalysis<MachineDominatorTree>()), + Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm), + MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()), + TII(*mf.getSubtarget().getInstrInfo()), + TRI(*mf.getSubtarget().getRegisterInfo()), + MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {} void spill(LiveRangeEdit &) override; @@ -190,11 +184,16 @@ private: } namespace llvm { + +Spiller::~Spiller() { } +void Spiller::anchor() { } + Spiller *createInlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) { return new InlineSpiller(pass, mf, vrm); } + } //===----------------------------------------------------------------------===// @@ -509,6 +508,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList; WorkList.push_back(std::make_pair(UseReg, UseVNI)); + LiveInterval &OrigLI = LIS.getInterval(Original); do { unsigned Reg; VNInfo *VNI; @@ -522,8 +522,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, // Trace through PHI-defs created by live range splitting. if (VNI->isPHIDef()) { - // Stop at original PHIs. We don't know the value at the predecessors. - if (VNI->def == OrigVNI->def) { + // Stop at original PHIs. We don't know the value at the + // predecessors. Look up the VNInfo for the current definition + // in OrigLI, to properly determine whether or not this phi was + // added by splitting. + if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) { DEBUG(dbgs() << "orig phi value\n"); SVI->second.DefByOrigPHI = true; SVI->second.AllDefsAreReloads = false; @@ -543,7 +546,6 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, // Separate all values dominated by OrigVNI into PHIs and non-PHIs. 
SmallVector<VNInfo*, 8> PHIs, NonPHIs; LiveInterval &LI = LIS.getInterval(Reg); - LiveInterval &OrigLI = LIS.getInterval(Original); for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); VI != VE; ++VI) { @@ -824,7 +826,7 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) { WorkList.push_back(std::make_pair(LI, VNI)); do { std::tie(LI, VNI) = WorkList.pop_back_val(); - if (!UsedValues.insert(VNI)) + if (!UsedValues.insert(VNI).second) continue; if (VNI->isPHIDef()) { @@ -853,6 +855,15 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) { /// reMaterializeFor - Attempt to rematerialize before MI instead of reloading. bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineBasicBlock::iterator MI) { + + // Analyze instruction + SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops; + MIBundleOperands::VirtRegInfo RI = + MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops); + + if (!RI.Reads) + return false; + SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true); VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex()); @@ -883,9 +894,6 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, // If the instruction also writes VirtReg.reg, it had better not require the // same register for uses and defs. - SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops; - MIBundleOperands::VirtRegInfo RI = - MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops); if (RI.Tied) { markValueUsed(&VirtReg, ParentVNI); DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI); @@ -939,10 +947,15 @@ void InlineSpiller::reMaterializeAll() { for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { unsigned Reg = RegsToSpill[i]; LiveInterval &LI = LIS.getInterval(Reg); - for (MachineRegisterInfo::use_bundle_nodbg_iterator - RI = MRI.use_bundle_nodbg_begin(Reg), E = MRI.use_bundle_nodbg_end(); - RI != E; ) { - MachineInstr *MI = &*(RI++); + for (MachineRegisterInfo::reg_bundle_iterator + RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end(); + RegI != E; ) { + MachineInstr *MI = &*(RegI++); + + // Debug values are not allowed to affect codegen. + if (MI->isDebugValue()) + continue; + anyRemat |= reMaterializeFor(LI, MI); } } @@ -1078,7 +1091,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, bool WasCopy = MI->isCopy(); unsigned ImpReg = 0; - bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT || + bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT || + MI->getOpcode() == TargetOpcode::PATCHPOINT || MI->getOpcode() == TargetOpcode::STACKMAP); // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied @@ -1218,12 +1232,16 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Modify DBG_VALUE now that the value is in a spill slot. bool IsIndirect = MI->isIndirectDebugValue(); uint64_t Offset = IsIndirect ? 
MI->getOperand(1).getImm() : 0; - const MDNode *MDPtr = MI->getOperand(2).getMetadata(); + const MDNode *Var = MI->getDebugVariable(); + const MDNode *Expr = MI->getDebugExpression(); DebugLoc DL = MI->getDebugLoc(); DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); MachineBasicBlock *MBB = MI->getParent(); BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(StackSlot).addImm(Offset).addMetadata(MDPtr); + .addFrameIndex(StackSlot) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); continue; } @@ -1363,7 +1381,7 @@ void InlineSpiller::spill(LiveRangeEdit &edit) { StackInt = nullptr; DEBUG(dbgs() << "Inline spilling " - << MRI.getRegClass(edit.getReg())->getName() + << TRI.getRegClassName(MRI.getRegClass(edit.getReg())) << ':' << edit.getParent() << "\nFrom original " << PrintReg(Original) << '\n'); assert(edit.getParent().isSpillable() && diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h index 91a1da94fbaf..1791afbe6c25 100644 --- a/lib/CodeGen/InterferenceCache.h +++ b/lib/CodeGen/InterferenceCache.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_INTERFERENCECACHE -#define LLVM_CODEGEN_INTERFERENCECACHE +#ifndef LLVM_LIB_CODEGEN_INTERFERENCECACHE_H +#define LLVM_LIB_CODEGEN_INTERFERENCECACHE_H #include "llvm/CodeGen/LiveIntervalUnion.h" diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index a8b860034f49..2c95e9e7d0d3 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -459,9 +459,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { CI->replaceAllUsesWith(CI->getOperand(0)); break; + case Intrinsic::assume: case Intrinsic::var_annotation: - break; // Strip out annotate intrinsic - + break; // Strip out these intrinsics + case Intrinsic::memcpy: { Type *IntPtr = DL.getIntPtrType(Context); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, @@ -527,6 +528,34 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { ReplaceFPIntrinsicWithCall(CI, "powf", "pow", "powl"); break; } + case Intrinsic::sin: { + ReplaceFPIntrinsicWithCall(CI, "sinf", "sin", "sinl"); + break; + } + case Intrinsic::cos: { + ReplaceFPIntrinsicWithCall(CI, "cosf", "cos", "cosl"); + break; + } + case Intrinsic::floor: { + ReplaceFPIntrinsicWithCall(CI, "floorf", "floor", "floorl"); + break; + } + case Intrinsic::ceil: { + ReplaceFPIntrinsicWithCall(CI, "ceilf", "ceil", "ceill"); + break; + } + case Intrinsic::trunc: { + ReplaceFPIntrinsicWithCall(CI, "truncf", "trunc", "truncl"); + break; + } + case Intrinsic::round: { + ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl"); + break; + } + case Intrinsic::copysign: { + ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl"); + break; + } case Intrinsic::flt_rounds: // Lower to "round to the nearest" if (!CI->getType()->isVoidTy()) diff --git a/lib/CodeGen/JITCodeEmitter.cpp b/lib/CodeGen/JITCodeEmitter.cpp deleted file mode 100644 index 96a53892f6d3..000000000000 --- a/lib/CodeGen/JITCodeEmitter.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//===-- llvm/CodeGen/JITCodeEmitter.cpp - Code emission --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/JITCodeEmitter.h" - -using namespace llvm; - -void JITCodeEmitter::anchor() { } diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp index 750f71f6022e..75fa26197398 100644 --- a/lib/CodeGen/JumpInstrTables.cpp +++ b/lib/CodeGen/JumpInstrTables.cpp @@ -13,7 +13,6 @@ #define DEBUG_TYPE "jt" #include "llvm/CodeGen/JumpInstrTables.h" - #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/JumpInstrTableInfo.h" #include "llvm/CodeGen/Passes.h" @@ -30,7 +29,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - #include <vector> using namespace llvm; @@ -117,8 +115,8 @@ bool replaceGlobalValueIndirectUse(GlobalValue *GV, Value *V, Use *U) { if (!isa<GlobalAlias>(C)) C->replaceUsesOfWithOnConstant(GV, V, U); } else { - assert(false && "The Use of a Function symbol is neither an instruction nor" - " a constant"); + llvm_unreachable("The Use of a Function symbol is neither an instruction " + "nor a constant"); } return true; @@ -163,7 +161,7 @@ void JumpInstrTables::getAnalysisUsage(AnalysisUsage &AU) const { Function *JumpInstrTables::insertEntry(Module &M, Function *Target) { FunctionType *OrigFunTy = Target->getFunctionType(); - FunctionType *FunTy = transformType(OrigFunTy); + FunctionType *FunTy = transformType(JTType, OrigFunTy); JumpMap::iterator it = Metadata.find(FunTy); if (Metadata.end() == it) { @@ -191,11 +189,12 @@ Function *JumpInstrTables::insertEntry(Module &M, Function *Target) { } bool JumpInstrTables::hasTable(FunctionType *FunTy) { - FunctionType *TransTy = transformType(FunTy); + FunctionType *TransTy = transformType(JTType, FunTy); return Metadata.end() != Metadata.find(TransTy); } -FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) { +FunctionType *JumpInstrTables::transformType(JumpTable::JumpTableType JTT, + FunctionType *FunTy) { // Returning nullptr forces all types into the same table, since all types map // to the same type Type *VoidPtrTy = Type::getInt8PtrTy(FunTy->getContext()); @@ -211,7 +210,7 @@ FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) { Type *Int32Ty = Type::getInt32Ty(FunTy->getContext()); FunctionType *VoidFnTy = FunctionType::get( Type::getVoidTy(FunTy->getContext()), EmptyParams, false); - switch (JTType) { + switch (JTT) { case JumpTable::Single: return FunctionType::get(RetTy, EmptyParams, false); @@ -253,10 +252,10 @@ FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) { bool JumpInstrTables::runOnModule(Module &M) { JITI = &getAnalysis<JumpInstrTableInfo>(); - // Get the set of jumptable-annotated functions. + // Get the set of jumptable-annotated functions that have their address taken. 
DenseMap<Function *, Function *> Functions; for (Function &F : M) { - if (F.hasFnAttribute(Attribute::JumpTable)) { + if (F.hasFnAttribute(Attribute::JumpTable) && F.hasAddressTaken()) { assert(F.hasUnnamedAddr() && "Attribute 'jumptable' requires 'unnamed_addr'"); Functions[&F] = nullptr; diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index df96b945a8d9..9018314f2132 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -12,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Target/TargetMachine.h" - +#include "llvm/Analysis/JumpInstrTableInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/ForwardControlFlowIntegrity.h" #include "llvm/CodeGen/JumpInstrTables.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -48,8 +49,8 @@ EnableFastISelOption("fast-isel", cl::Hidden, cl::desc("Enable the \"fast\" instruction selector")); void LLVMTargetMachine::initAsmInfo() { - MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(*getRegisterInfo(), - TargetTriple); + MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo( + *getSubtargetImpl()->getRegisterInfo(), getTargetTriple()); // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0, // and if the old one gets included then MCAsmInfo will be NULL and // we'll crash later. @@ -110,9 +111,9 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM, // Install a MachineModuleInfo class, which is an immutable pass that holds // all the per-module stuff we're generating, including MCContext. - MachineModuleInfo *MMI = - new MachineModuleInfo(*TM->getMCAsmInfo(), *TM->getRegisterInfo(), - &TM->getTargetLowering()->getObjFileLowering()); + MachineModuleInfo *MMI = new MachineModuleInfo( + *TM->getMCAsmInfo(), *TM->getSubtargetImpl()->getRegisterInfo(), + &TM->getSubtargetImpl()->getTargetLowering()->getObjFileLowering()); PM.add(MMI); // Set up a MachineFunction for the rest of CodeGen to work on. @@ -143,8 +144,13 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, AnalysisID StopAfter) { // Passes to handle jumptable function annotations. These can't be handled at // JIT time, so we don't add them directly to addPassesToGenerateCode. - PM.add(createJumpInstrTableInfoPass()); + PM.add(createJumpInstrTableInfoPass( + getSubtargetImpl()->getInstrInfo()->getJumpInstrTableEntryBound())); PM.add(createJumpInstrTablesPass(Options.JTType)); + if (Options.FCFI) + PM.add(createForwardControlFlowIntegrityPass( + Options.JTType, Options.CFIType, Options.CFIEnforcing, + Options.getCFIFuncName())); // Add common CodeGen passes. 
MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, @@ -165,10 +171,10 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, if (Options.MCOptions.MCSaveTempLabels) Context->setAllowTemporaryLabels(false); - const MCAsmInfo &MAI = *getMCAsmInfo(); - const MCRegisterInfo &MRI = *getRegisterInfo(); - const MCInstrInfo &MII = *getInstrInfo(); const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>(); + const MCAsmInfo &MAI = *getMCAsmInfo(); + const MCRegisterInfo &MRI = *getSubtargetImpl()->getRegisterInfo(); + const MCInstrInfo &MII = *getSubtargetImpl()->getInstrInfo(); std::unique_ptr<MCStreamer> AsmStreamer; switch (FileType) { @@ -201,9 +207,10 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, if (!MCE || !MAB) return true; - AsmStreamer.reset(getTarget().createMCObjectStreamer( - getTargetTriple(), *Context, *MAB, Out, MCE, STI, - Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack)); + AsmStreamer.reset( + getTarget() + .createMCObjectStreamer(getTargetTriple(), *Context, *MAB, Out, MCE, + STI, Options.MCOptions.MCRelaxAll)); break; } case CGFT_Null: @@ -226,26 +233,6 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, return false; } -/// addPassesToEmitMachineCode - Add passes to the specified pass manager to -/// get machine code emitted. This uses a JITCodeEmitter object to handle -/// actually outputting the machine code and resolving things like the address -/// of functions. This method should return true if machine code emission is -/// not supported. -/// -bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, - JITCodeEmitter &JCE, - bool DisableVerify) { - // Add common CodeGen passes. - MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, - nullptr); - if (!Context) - return true; - - addCodeEmitter(PM, JCE); - - return false; // success! -} - /// addPassesToEmitMC - Add passes to the specified pass manager to get /// machine code emitted with the MCJIT. This method returns true if machine /// code is not supported. It fills the MCContext Ctx pointer which can be @@ -265,19 +252,20 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, // Create the code emitter for the target if it exists. If not, .o file // emission fails. - const MCRegisterInfo &MRI = *getRegisterInfo(); + const MCRegisterInfo &MRI = *getSubtargetImpl()->getRegisterInfo(); const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>(); - MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, - STI, *Ctx); + MCCodeEmitter *MCE = getTarget().createMCCodeEmitter( + *getSubtargetImpl()->getInstrInfo(), MRI, STI, *Ctx); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(), TargetCPU); if (!MCE || !MAB) return true; std::unique_ptr<MCStreamer> AsmStreamer; - AsmStreamer.reset(getTarget().createMCObjectStreamer( - getTargetTriple(), *Ctx, *MAB, Out, MCE, STI, - Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack)); + AsmStreamer.reset(getTarget() + .createMCObjectStreamer(getTargetTriple(), *Ctx, *MAB, + Out, MCE, STI, + Options.MCOptions.MCRelaxAll)); // Create the AsmPrinter, which takes ownership of AsmStreamer if successful. 
FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer); diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index d12c234bf3b2..b621e3baee51 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -137,6 +137,8 @@ LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) { /// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If /// not available then create new lexical scope. LexicalScope *LexicalScopes::getOrCreateLexicalScope(DebugLoc DL) { + if (DL.isUnknown()) + return nullptr; MDNode *Scope = nullptr; MDNode *InlinedAt = nullptr; DL.getScopeAndInlinedAt(Scope, InlinedAt, MF->getFunction()->getContext()); @@ -172,9 +174,12 @@ LexicalScope *LexicalScopes::getOrCreateRegularScope(MDNode *Scope) { std::make_tuple(Parent, DIDescriptor(Scope), nullptr, false)).first; - if (!Parent && DIDescriptor(Scope).isSubprogram() && - DISubprogram(Scope).describes(MF->getFunction())) + if (!Parent) { + assert(DIDescriptor(Scope).isSubprogram()); + assert(DISubprogram(Scope).describes(MF->getFunction())); + assert(!CurrentFnLexicalScope); CurrentFnLexicalScope = &I->second; + } return &I->second; } @@ -285,7 +290,7 @@ void LexicalScopes::assignInstructionRanges( /// have machine instructions that belong to lexical scope identified by /// DebugLoc. void LexicalScopes::getMachineBasicBlocks( - DebugLoc DL, SmallPtrSet<const MachineBasicBlock *, 4> &MBBs) { + DebugLoc DL, SmallPtrSetImpl<const MachineBasicBlock *> &MBBs) { MBBs.clear(); LexicalScope *Scope = getOrCreateLexicalScope(DL); if (!Scope) diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 388f58fde2ab..dc936a3f5b85 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -39,7 +39,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" - +#include "llvm/Target/TargetSubtargetInfo.h" #include <memory> using namespace llvm; @@ -109,7 +109,8 @@ public: namespace { class LDVImpl; class UserValue { - const MDNode *variable; ///< The debug info variable we are part of. + const MDNode *Variable; ///< The debug info variable we are part of. + const MDNode *Expression; ///< Any complex address expression. unsigned offset; ///< Byte offset into variable. bool IsIndirect; ///< true if this is a register-indirect+offset value. DebugLoc dl; ///< The debug location for the variable. This is @@ -139,11 +140,10 @@ class UserValue { public: /// UserValue - Create a new UserValue. - UserValue(const MDNode *var, unsigned o, bool i, DebugLoc L, - LocMap::Allocator &alloc) - : variable(var), offset(o), IsIndirect(i), dl(L), leader(this), - next(nullptr), locInts(alloc) - {} + UserValue(const MDNode *var, const MDNode *expr, unsigned o, bool i, + DebugLoc L, LocMap::Allocator &alloc) + : Variable(var), Expression(expr), offset(o), IsIndirect(i), dl(L), + leader(this), next(nullptr), locInts(alloc) {} /// getLeader - Get the leader of this value's equivalence class. UserValue *getLeader() { @@ -157,8 +157,10 @@ public: UserValue *getNext() const { return next; } /// match - Does this UserValue match the parameters? 
- bool match(const MDNode *Var, unsigned Offset, bool indirect) const { - return Var == variable && Offset == offset && indirect == IsIndirect; + bool match(const MDNode *Var, const MDNode *Expr, unsigned Offset, + bool indirect) const { + return Var == Variable && Expr == Expression && Offset == offset && + indirect == IsIndirect; } /// merge - Merge equivalence classes. @@ -267,7 +269,7 @@ public: LiveIntervals &LIS, const TargetInstrInfo &TRI); /// findDebugLoc - Return DebugLoc used for this DBG_VALUE instruction. A - /// variable may have more than one corresponding DBG_VALUE instructions. + /// variable may have more than one corresponding DBG_VALUE instructions. /// Only first one needs DebugLoc to identify variable's lexical scope /// in source file. DebugLoc findDebugLoc(); @@ -306,8 +308,8 @@ class LDVImpl { UVMap userVarMap; /// getUserValue - Find or create a UserValue. - UserValue *getUserValue(const MDNode *Var, unsigned Offset, - bool IsIndirect, DebugLoc DL); + UserValue *getUserValue(const MDNode *Var, const MDNode *Expr, + unsigned Offset, bool IsIndirect, DebugLoc DL); /// lookupVirtReg - Find the EC leader for VirtReg or null. UserValue *lookupVirtReg(unsigned VirtReg); @@ -329,12 +331,13 @@ class LDVImpl { void computeIntervals(); public: - LDVImpl(LiveDebugVariables *ps) : pass(*ps), EmitDone(false), - ModifiedMF(false) {} + LDVImpl(LiveDebugVariables *ps) + : pass(*ps), MF(nullptr), EmitDone(false), ModifiedMF(false) {} bool runOnMachineFunction(MachineFunction &mf); /// clear - Release all memory. void clear() { + MF = nullptr; userValues.clear(); virtRegToEqClass.clear(); userVarMap.clear(); @@ -343,6 +346,7 @@ public: "Dbg values are not emitted in LDV"); EmitDone = false; ModifiedMF = false; + LS.reset(); } /// mapVirtReg - Map virtual register to an equivalence class. @@ -359,8 +363,8 @@ public: } // namespace void UserValue::print(raw_ostream &OS, const TargetMachine *TM) { - DIVariable DV(variable); - OS << "!\""; + DIVariable DV(Variable); + OS << "!\""; DV.printExtendedName(OS); OS << "\"\t"; if (offset) @@ -420,19 +424,20 @@ void UserValue::mapVirtRegs(LDVImpl *LDV) { LDV->mapVirtReg(locations[i].getReg(), this); } -UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset, - bool IsIndirect, DebugLoc DL) { +UserValue *LDVImpl::getUserValue(const MDNode *Var, const MDNode *Expr, + unsigned Offset, bool IsIndirect, + DebugLoc DL) { UserValue *&Leader = userVarMap[Var]; if (Leader) { UserValue *UV = Leader->getLeader(); Leader = UV; for (; UV; UV = UV->getNext()) - if (UV->match(Var, Offset, IsIndirect)) + if (UV->match(Var, Expr, Offset, IsIndirect)) return UV; } userValues.push_back( - make_unique<UserValue>(Var, Offset, IsIndirect, DL, allocator)); + make_unique<UserValue>(Var, Expr, Offset, IsIndirect, DL, allocator)); UserValue *UV = userValues.back().get(); Leader = UserValue::merge(Leader, UV); return UV; @@ -452,7 +457,7 @@ UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) { bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) { // DBG_VALUE loc, offset, variable - if (MI->getNumOperands() != 3 || + if (MI->getNumOperands() != 4 || !(MI->getOperand(1).isReg() || MI->getOperand(1).isImm()) || !MI->getOperand(2).isMetadata()) { DEBUG(dbgs() << "Can't handle " << *MI); @@ -462,9 +467,11 @@ bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) { // Get or create the UserValue for (variable,offset). bool IsIndirect = MI->isIndirectDebugValue(); unsigned Offset = IsIndirect ? 
MI->getOperand(1).getImm() : 0;
-  const MDNode *Var = MI->getOperand(2).getMetadata();
+  const MDNode *Var = MI->getDebugVariable();
+  const MDNode *Expr = MI->getDebugExpression();
 
-  UserValue *UV = getUserValue(Var, Offset, IsIndirect, MI->getDebugLoc());
+  UserValue *UV =
+      getUserValue(Var, Expr, Offset, IsIndirect, MI->getDebugLoc());
   UV->addDef(Idx, MI->getOperand(0));
   return true;
 }
@@ -693,11 +700,11 @@ void LDVImpl::computeIntervals() {
 }
 
 bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
+  clear();
   MF = &mf;
   LIS = &pass.getAnalysis<LiveIntervals>();
   MDT = &pass.getAnalysis<MachineDominatorTree>();
-  TRI = mf.getTarget().getRegisterInfo();
-  clear();
+  TRI = mf.getSubtarget().getRegisterInfo();
   LS.initialize(mf);
   DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
                << mf.getName() << " **********\n");
@@ -709,9 +716,25 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
   return Changed;
 }
 
+static void removeDebugValues(MachineFunction &mf) {
+  for (MachineBasicBlock &MBB : mf) {
+    for (auto MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ) {
+      if (!MBBI->isDebugValue()) {
+        ++MBBI;
+        continue;
+      }
+      MBBI = MBB.erase(MBBI);
+    }
+  }
+}
+
 bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
   if (!EnableLDV)
     return false;
+  if (!FunctionDIs.count(mf.getFunction())) {
+    removeDebugValues(mf);
+    return false;
+  }
   if (!pImpl)
     pImpl = new LDVImpl(this);
   return static_cast<LDVImpl*>(pImpl)->runOnMachineFunction(mf);
@@ -933,10 +956,13 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
 
   if (Loc.isReg())
     BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
-            IsIndirect, Loc.getReg(), offset, variable);
+            IsIndirect, Loc.getReg(), offset, Variable, Expression);
   else
     BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
-      .addOperand(Loc).addImm(offset).addMetadata(variable);
+        .addOperand(Loc)
+        .addImm(offset)
+        .addMetadata(Variable)
+        .addMetadata(Expression);
 }
 
 void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
@@ -974,7 +1000,9 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
 
 void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
   DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  if (!MF)
+    return;
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
     DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
     userValues[i]->rewriteLocations(*VRM, *TRI);
@@ -988,6 +1016,10 @@ void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) {
     static_cast<LDVImpl*>(pImpl)->emitDebugValues(VRM);
 }
 
+bool LiveDebugVariables::doInitialization(Module &M) {
+  FunctionDIs = makeSubprogramMap(M);
+  return Pass::doInitialization(M);
+}
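The leader/next fields threaded through UserValue (see getUserValue and UserValue::merge above) form a small intrusive union-find: each value points at a class leader, getLeader() flattens chains, and merging redirects one leader at the other. A standalone sketch of just that pattern, with invented names and without the next-chain splicing the real class also performs:

#include <cassert>

struct Node {
  Node *leader = this; // a class leader points at itself

  // Follow leader pointers to the class representative, compressing the
  // path so the next query is a single hop (compare UserValue::getLeader).
  Node *getLeader() {
    Node *l = leader;
    while (l != l->leader)
      l = l->leader;
    for (Node *n = this; n != l;) {
      Node *next = n->leader;
      n->leader = l;
      n = next;
    }
    return l;
  }

  // Merge two equivalence classes by pointing one leader at the other.
  static Node *merge(Node *a, Node *b) {
    Node *la = a->getLeader();
    Node *lb = b->getLeader();
    if (lb != la)
      lb->leader = la;
    return la;
  }
};

int main() {
  Node a, b, c;
  Node::merge(&a, &b);
  Node::merge(&b, &c);
  assert(a.getLeader() == c.getLeader()); // all three share one leader
  return 0;
}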
 
 #ifndef NDEBUG
 void LiveDebugVariables::dump() {
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index bb6743547e3d..9748329314d6 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -18,11 +18,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
-#define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/DebugInfo.h"
 
 namespace llvm {
 
@@ -32,6 +33,7 @@ class VirtRegMap;
 class LiveDebugVariables : public MachineFunctionPass {
   void *pImpl;
+  DenseMap<const Function*, DISubprogram> FunctionDIs;
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -64,9 +66,10 @@ private:
   bool runOnMachineFunction(MachineFunction &) override;
   void releaseMemory() override;
   void getAnalysisUsage(AnalysisUsage &) const override;
+  bool doInitialization(Module &) override;
 };
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+#endif
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index ce8ce962417b..9423edc30910 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
@@ -185,6 +186,27 @@ bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
   return I != begin() && (--I)->end > Start;
 }
 
+bool LiveRange::covers(const LiveRange &Other) const {
+  if (empty())
+    return Other.empty();
+
+  const_iterator I = begin();
+  for (const Segment &O : Other.segments) {
+    I = advanceTo(I, O.start);
+    if (I == end() || I->start > O.start)
+      return false;
+
+    // Walk across adjacent live segments and see if we can get past O.end.
+    while (I->end < O.end) {
+      const_iterator Last = I;
+      // Get next segment and abort if it was not adjacent.
+      ++I;
+      if (I == end() || Last->end != I->start)
+        return false;
+    }
+  }
+  return true;
+}
 
 /// ValNo is dead, remove it. If it is the largest value number, just nuke it
 /// (and any other deleted values neighboring it), otherwise mark it as ~1U so
@@ -204,9 +226,9 @@ void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
 void LiveRange::RenumberValues() {
   SmallPtrSet<VNInfo*, 8> Seen;
   valnos.clear();
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    VNInfo *VNI = I->valno;
-    if (!Seen.insert(VNI))
+  for (const Segment &S : segments) {
+    VNInfo *VNI = S.valno;
+    if (!Seen.insert(VNI).second)
       continue;
     assert(!VNI->isUnused() && "Unused valno used by live segment");
     VNI->id = (unsigned)valnos.size();
@@ -278,6 +300,12 @@ LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
   return MergeTo;
 }
 
+void LiveRange::append(const Segment S) {
+  // Check that the segment belongs to the back of the list.
+  assert(segments.empty() || segments.back().end <= S.start);
+  segments.push_back(S);
+}
+
 LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
   SlotIndex Start = S.start, End = S.end;
   iterator it = std::upper_bound(From, end(), Start);
@@ -461,8 +489,8 @@ void LiveRange::join(LiveRange &Other,
   // This can leave Other in an invalid state because we're not coalescing
   // touching segments that now have identical values. That's OK since Other is
   // not supposed to be valid after calling join();
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
+  for (Segment &S : Other.segments)
+    S.valno = NewVNInfo[RHSValNoAssignments[S.valno->id]];
 
   // Update val# info. Renumber them and make sure they all belong to this
   // LiveRange now. Also remove dead val#'s.
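LiveRange::covers() above answers "does this range contain every segment of Other?", stitching across segments that touch end-to-start. A self-contained model of the same walk over plain integer intervals (names invented for the example):

#include <cassert>
#include <utility>
#include <vector>

using Seg = std::pair<int, int>; // half-open [start, end), sorted, disjoint

// True if every segment of Other lies inside the union of Range's segments,
// hopping across exactly adjacent segments like LiveRange::covers() does.
static bool covers(const std::vector<Seg> &Range,
                   const std::vector<Seg> &Other) {
  size_t i = 0;
  for (const Seg &o : Other) {
    while (i < Range.size() && Range[i].second <= o.first)
      ++i; // advance to the first segment ending after o.start
    if (i == Range.size() || Range[i].first > o.first)
      return false; // o.start is not covered
    for (size_t j = i; Range[j].second < o.second; ++j) {
      // Need the next segment to start exactly where this one ends.
      if (j + 1 == Range.size() || Range[j].second != Range[j + 1].first)
        return false;
    }
  }
  return true;
}

int main() {
  std::vector<Seg> Range = {{0, 4}, {4, 8}, {10, 12}};
  assert(covers(Range, {{1, 6}}));   // spans the two adjacent segments
  assert(!covers(Range, {{7, 11}})); // falls into the [8,10) gap
  assert(covers(Range, {}));         // the empty range is trivially covered
  return 0;
}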
@@ -482,8 +510,8 @@
   // Okay, now insert the RHS live segments into the LHS.
   LiveRangeUpdater Updater(this);
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    Updater.add(*I);
+  for (Segment &S : Other.segments)
+    Updater.add(S);
 }
 
 /// Merge all of the segments in RHS into this live range as the specified
@@ -493,8 +521,8 @@ void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
                                        VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueInAsValue - Merge all of the live segments of a specific val#
@@ -506,9 +534,9 @@ void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
                                     const VNInfo *RHSValNo, VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    if (I->valno == RHSValNo)
-      Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    if (S.valno == RHSValNo)
+      Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueNumberInto - This method is called when two value numbers
@@ -570,10 +598,232 @@ VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
   return V2;
 }
 
+void LiveInterval::removeEmptySubRanges() {
+  SubRange **NextPtr = &SubRanges;
+  SubRange *I = *NextPtr;
+  while (I != nullptr) {
+    if (!I->empty()) {
+      NextPtr = &I->Next;
+      I = *NextPtr;
+      continue;
+    }
+    // Skip empty subranges until we find the first nonempty one.
+    do {
+      I = I->Next;
+    } while (I != nullptr && I->empty());
+    *NextPtr = I;
+  }
+}
+
+/// Helper function for constructMainRangeFromSubranges(): Search the CFG
+/// backwards until we find a place covered by a LiveRange segment that
+/// actually has a valno set.
+static VNInfo *searchForVNI(const SlotIndexes &Indexes, LiveRange &LR,
+                            const MachineBasicBlock *MBB,
+                            SmallPtrSetImpl<const MachineBasicBlock*> &Visited) {
+  // We start the search at the end of MBB.
+  SlotIndex EndIdx = Indexes.getMBBEndIdx(MBB);
+  // In our use case we can't leave the area covered by the live segments
+  // without finding an actual VNI def.
+  LiveRange::iterator I = LR.find(EndIdx.getPrevSlot());
+  assert(I != LR.end());
+  LiveRange::Segment &S = *I;
+  if (S.valno != nullptr)
+    return S.valno;
+
+  VNInfo *VNI = nullptr;
+  // Continue at predecessors (we could even go to idom with domtree available).
+  for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+    // Avoid going in circles.
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    VNI = searchForVNI(Indexes, LR, Pred, Visited);
+    if (VNI != nullptr) {
+      S.valno = VNI;
+      break;
+    }
+  }
+
+  return VNI;
+}
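Stripped of the SlotIndex and LiveRange machinery, searchForVNI() is a depth-first walk over predecessor edges with a visited set to cut cycles. A minimal standalone model, using integer block numbers and -1 for "no value known yet" (all names invented, and without the caching into S.valno the real helper performs):

#include <cassert>
#include <set>
#include <vector>

// Toy CFG: Preds[b] lists the predecessors of block b; ValueAtEnd[b] is the
// value number live out of b, or -1 if it is not known yet.
static int searchPreds(int Block, const std::vector<std::vector<int>> &Preds,
                       const std::vector<int> &ValueAtEnd,
                       std::set<int> &Visited) {
  if (ValueAtEnd[Block] != -1)
    return ValueAtEnd[Block];
  for (int Pred : Preds[Block]) {
    if (!Visited.insert(Pred).second)
      continue; // already seen: avoid cycling through loops
    int VNI = searchPreds(Pred, Preds, ValueAtEnd, Visited);
    if (VNI != -1)
      return VNI;
  }
  return -1;
}

int main() {
  // 0 -> 1 -> 2 -> 3, with a back edge 3 -> 1; only block 0 defines a value.
  std::vector<std::vector<int>> Preds = {{}, {0, 3}, {1}, {2}};
  std::vector<int> ValueAtEnd = {7, -1, -1, -1};
  std::set<int> Visited;
  assert(searchPreds(3, Preds, ValueAtEnd, Visited) == 7);
  return 0;
}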
+
+static void determineMissingVNIs(const SlotIndexes &Indexes, LiveInterval &LI) {
+  SmallPtrSet<const MachineBasicBlock*, 5> Visited;
+  for (LiveRange::Segment &S : LI.segments) {
+    if (S.valno != nullptr)
+      continue;
+    // This can only happen at the beginning of a basic block.
+    assert(S.start.isBlock() && "valno should only be missing at block begin");
+
+    Visited.clear();
+    const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(S.start);
+    for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+      VNInfo *VNI = searchForVNI(Indexes, LI, Pred, Visited);
+      if (VNI != nullptr) {
+        S.valno = VNI;
+        break;
+      }
+    }
+    assert(S.valno != nullptr && "could not determine valno");
+  }
+}
+
+void LiveInterval::constructMainRangeFromSubranges(
+    const SlotIndexes &Indexes, VNInfo::Allocator &VNIAllocator) {
+  // The basic observations on which this algorithm is based:
+  // - Each Def/ValNo in a subrange must have a corresponding def on the main
+  //   range, but no further defs/valnos are necessary.
+  // - If any of the subranges is live at a point the main live range has to be
+  //   live too, conversely if no subrange is live the main range mustn't be
+  //   live either.
+  // We do this by scanning through all the subranges simultaneously, creating
+  // new segments in the main range as segment starts/ends come up in the
+  // subranges.
+  assert(hasSubRanges() && "expected subranges to be present");
+  assert(segments.empty() && valnos.empty() && "expected empty main range");
+
+  // Collect subrange, iterator pairs for the walk and determine first and last
+  // SlotIndex involved.
+  SmallVector<std::pair<const SubRange*, const_iterator>, 4> SRs;
+  SlotIndex First;
+  SlotIndex Last;
+  for (const SubRange &SR : subranges()) {
+    if (SR.empty())
+      continue;
+    SRs.push_back(std::make_pair(&SR, SR.begin()));
+    if (!First.isValid() || SR.segments.front().start < First)
+      First = SR.segments.front().start;
+    if (!Last.isValid() || SR.segments.back().end > Last)
+      Last = SR.segments.back().end;
+  }
+
+  // Walk over all subranges simultaneously.
+  Segment CurrentSegment;
+  bool ConstructingSegment = false;
+  bool NeedVNIFixup = false;
+  unsigned ActiveMask = 0;
+  SlotIndex Pos = First;
+  while (true) {
+    SlotIndex NextPos = Last;
+    enum {
+      NOTHING,
+      BEGIN_SEGMENT,
+      END_SEGMENT,
+    } Event = NOTHING;
+    // Which subregister lanes are affected by the current event.
+    unsigned EventMask = 0;
+    // Whether a BEGIN_SEGMENT is also a valno definition point.
+    bool IsDef = false;
+    // Find the next begin or end of a subrange segment. Combine masks if we
+    // have multiple begins/ends at the same position. Ends take precedence
+    // over Begins.
+    for (auto &SRP : SRs) {
+      const SubRange &SR = *SRP.first;
+      const_iterator &I = SRP.second;
+      // Advance iterator of subrange to a segment involving Pos; the earlier
+      // segments are already merged at this point.
+      while (I != SR.end() &&
+             (I->end < Pos ||
+              (I->end == Pos && (ActiveMask & SR.LaneMask) == 0)))
+        ++I;
+      if (I == SR.end())
+        continue;
+      if ((ActiveMask & SR.LaneMask) == 0 &&
+          Pos <= I->start && I->start <= NextPos) {
+        // Merge multiple begins at the same position.
+        if (I->start == NextPos && Event == BEGIN_SEGMENT) {
+          EventMask |= SR.LaneMask;
+          IsDef |= I->valno->def == I->start;
+        } else if (I->start < NextPos || Event != END_SEGMENT) {
+          Event = BEGIN_SEGMENT;
+          NextPos = I->start;
+          EventMask = SR.LaneMask;
+          IsDef = I->valno->def == I->start;
+        }
+      }
+      if ((ActiveMask & SR.LaneMask) != 0 &&
+          Pos <= I->end && I->end <= NextPos) {
+        // Merge multiple ends at the same position.
+        if (I->end == NextPos && Event == END_SEGMENT)
+          EventMask |= SR.LaneMask;
+        else {
+          Event = END_SEGMENT;
+          NextPos = I->end;
+          EventMask = SR.LaneMask;
+        }
+      }
+    }
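The effect of this event scan, minus the valno bookkeeping, is an interval union: the main range must be live wherever at least one lane's subrange is live. A compact standalone model of that merge, using an event map over plain integer positions instead of parallel iterators (all names invented):

#include <cassert>
#include <map>
#include <utility>
#include <vector>

using Seg = std::pair<int, int>; // half-open [start, end)

// Merge several per-lane segment lists into one "main range": a position is
// covered iff at least one lane is live there.
static std::vector<Seg> mergeLanes(const std::vector<std::vector<Seg>> &Lanes) {
  std::map<int, int> Delta; // position -> change in number of live lanes
  for (const auto &Lane : Lanes)
    for (const Seg &S : Lane) {
      ++Delta[S.first];
      --Delta[S.second];
    }
  std::vector<Seg> Out;
  int Active = 0, Start = 0;
  for (const auto &[Pos, D] : Delta) {
    if (Active == 0 && Active + D > 0)
      Start = Pos; // some lane went live: a merged segment begins
    if (Active > 0 && Active + D == 0)
      Out.push_back({Start, Pos}); // all lanes dead again: close the segment
    Active += D;
  }
  return Out;
}

int main() {
  // Lane A live on [0,4) and [10,12), lane B on [2,6).
  std::vector<Seg> Merged = mergeLanes({{{0, 4}, {10, 12}}, {{2, 6}}});
  assert((Merged == std::vector<Seg>{{0, 6}, {10, 12}}));
  return 0;
}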
+    // Advance scan position.
+    Pos = NextPos;
+    if (Event == BEGIN_SEGMENT) {
+      if (ConstructingSegment && IsDef) {
+        // Finish previous segment because we have to start a new one.
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+
+      // Start a new segment if necessary.
+      if (!ConstructingSegment) {
+        // Determine value number for the segment.
+        VNInfo *VNI = nullptr;
+        if (IsDef) {
+          VNI = getNextValue(Pos, VNIAllocator);
+        } else {
+          // We have to reuse an existing value number. If we are lucky, we
+          // already passed one of the predecessor blocks and determined its
+          // value number (with blocks in reverse postorder this would always
+          // be true, but we have no such guarantee).
+          assert(Pos.isBlock());
+          const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Pos);
+          // See if any of the predecessor blocks has a lower number and a VNI.
+          for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+            SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+            VNI = getVNInfoBefore(PredEnd);
+            if (VNI != nullptr)
+              break;
+          }
+          // Def will come later: We have to do an extra fixup pass.
+          if (VNI == nullptr)
+            NeedVNIFixup = true;
+        }
+
+        CurrentSegment.start = Pos;
+        CurrentSegment.valno = VNI;
+        ConstructingSegment = true;
+      }
+      ActiveMask |= EventMask;
+    } else if (Event == END_SEGMENT) {
+      assert(ConstructingSegment);
+      // Finish segment if no lane is active anymore.
+      ActiveMask &= ~EventMask;
+      if (ActiveMask == 0) {
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+    } else {
+      // We reached the end of the last subrange and can stop.
+      assert(Event == NOTHING);
+      break;
+    }
+  }
+
+  // We might not be able to assign new valnos for all segments if the basic
+  // block containing the definition comes after a segment using the valno.
+  // Do a fixup pass for this uncommon case.
+  if (NeedVNIFixup)
+    determineMissingVNIs(Indexes, *this);
+
+  assert(ActiveMask == 0 && !ConstructingSegment && "all segments ended");
+  verify();
+}
+
 unsigned LiveInterval::getSize() const {
   unsigned Sum = 0;
-  for (const_iterator I = begin(), E = end(); I != E; ++I)
-    Sum += I->start.distance(I->end);
+  for (const Segment &S : segments)
+    Sum += S.start.distance(S.end);
   return Sum;
 }

@@ -591,9 +841,9 @@ void LiveRange::print(raw_ostream &OS) const {
   if (empty())
     OS << "EMPTY";
   else {
-    for (const_iterator I = begin(), E = end(); I != E; ++I) {
-      OS << *I;
-      assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
+    for (const Segment &S : segments) {
+      OS << S;
+      assert(S.valno == getValNumInfo(S.valno->id) && "Bad VNInfo");
     }
   }

@@ -620,6 +870,10 @@ void LiveRange::print(raw_ostream &OS) const {
 void LiveInterval::print(raw_ostream &OS) const {
   OS << PrintReg(reg) << ' ';
   super::print(OS);
+  // Print subranges.
+  for (const SubRange &SR : subranges()) {
+    OS << format(" L%04X ", SR.LaneMask) << SR;
+  }
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -648,6 +902,26 @@ void LiveRange::verify() const {
     }
   }
 }
+
+void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
+  super::verify();
+
+  // Make sure SubRanges are fine and LaneMasks are disjoint.
+  unsigned Mask = 0;
+  unsigned MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u;
+  for (const SubRange &SR : subranges()) {
+    // Subrange lanemask should be disjoint from any previous subrange masks.
+    assert((Mask & SR.LaneMask) == 0);
+    Mask |= SR.LaneMask;
+
+    // The subrange mask should be contained in the maximum lane mask for the
+    // vreg.
+    assert((Mask & ~MaxMask) == 0);
+
+    SR.verify();
+    // Main live range should cover subrange.
+    assert(covers(SR));
+  }
+}
 #endif

@@ -692,14 +966,14 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
   OS << " updater with gap = " << (ReadI - WriteI)
      << ", last start = " << LastStart
      << ":\n  Area 1:";
-  for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(LR->begin(), WriteI))
+    OS << ' ' << S;
   OS << "\n  Spills:";
   for (unsigned I = 0, E = Spills.size(); I != E; ++I)
     OS << ' ' << Spills[I];
   OS << "\n  Area 2:";
-  for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(ReadI, LR->end()))
+    OS << ' ' << S;
   OS << '\n';
 }

@@ -860,9 +1134,7 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
   const VNInfo *used = nullptr, *unused = nullptr;

   // Determine connections.
-  for (LiveInterval::const_vni_iterator I = LI->vni_begin(), E = LI->vni_end();
-       I != E; ++I) {
-    const VNInfo *VNI = *I;
+  for (const VNInfo *VNI : LI->valnos) {
     // Group all unused values into one class.
     if (VNI->isUnused()) {
       if (unused)
@@ -938,6 +1210,8 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
     } else
       *J++ = *I;
   }
+  // TODO: do not cheat anymore by simply clearing all subranges.
+  LI.clearSubRanges();
   LI.segments.erase(J, E);

   // Transfer VNInfos to their new owners and renumber them.
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 15595609e904..56f38b6407d9 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -32,10 +32,11 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cmath>
 #include <limits>
@@ -62,6 +63,10 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
 static bool EnablePrecomputePhysRegs = false;
 #endif // NDEBUG

+static cl::opt<bool> EnableSubRegLiveness(
+  "enable-subreg-liveness", cl::Hidden, cl::init(true),
+  cl::desc("Enable subregister liveness tracking."));
+
 void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addRequired<AliasAnalysis>();
@@ -110,12 +115,15 @@ void LiveIntervals::releaseMemory() {
 bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
   MRI = &MF->getRegInfo();
-  TM = &fn.getTarget();
-  TRI = TM->getRegisterInfo();
-  TII = TM->getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   AA = &getAnalysis<AliasAnalysis>();
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
+
+  if (EnableSubRegLiveness && MF->getSubtarget().enableSubRegLiveness())
+    MRI->enableSubRegLiveness(true);
+
   if (!LRCalc)
     LRCalc = new LiveRangeCalc();

@@ -184,9 +192,8 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
   assert(LRCalc && "LRCalc not initialized.");
   assert(LI.empty() && "Should only compute empty intervals.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
-  LRCalc->createDeadDefs(LI);
-  LRCalc->extendToUses(LI);
-  computeDeadValues(&LI, LI, nullptr, nullptr);
+  LRCalc->calculate(LI);
+  computeDeadValues(LI, nullptr);
 }

 void
LiveIntervals::computeVirtRegs() { @@ -313,6 +320,70 @@ void LiveIntervals::computeLiveInRegUnits() { } +static void createSegmentsForValues(LiveRange &LR, + iterator_range<LiveInterval::vni_iterator> VNIs) { + for (auto VNI : VNIs) { + if (VNI->isUnused()) + continue; + SlotIndex Def = VNI->def; + LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI)); + } +} + +typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList; + +static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, + ShrinkToUsesWorkList &WorkList, + const LiveRange &OldRange) { + // Keep track of the PHIs that are in use. + SmallPtrSet<VNInfo*, 8> UsedPHIs; + // Blocks that have already been added to WorkList as live-out. + SmallPtrSet<MachineBasicBlock*, 16> LiveOut; + + // Extend intervals to reach all uses in WorkList. + while (!WorkList.empty()) { + SlotIndex Idx = WorkList.back().first; + VNInfo *VNI = WorkList.back().second; + WorkList.pop_back(); + const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Idx.getPrevSlot()); + SlotIndex BlockStart = Indexes.getMBBStartIdx(MBB); + + // Extend the live range for VNI to be live at Idx. + if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) { + assert(ExtVNI == VNI && "Unexpected existing value number"); + (void)ExtVNI; + // Is this a PHIDef we haven't seen before? + if (!VNI->isPHIDef() || VNI->def != BlockStart || + !UsedPHIs.insert(VNI).second) + continue; + // The PHI is live, make sure the predecessors are live-out. + for (auto &Pred : MBB->predecessors()) { + if (!LiveOut.insert(Pred).second) + continue; + SlotIndex Stop = Indexes.getMBBEndIdx(Pred); + // A predecessor is not required to have a live-out value for a PHI. + if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop)) + WorkList.push_back(std::make_pair(Stop, PVNI)); + } + continue; + } + + // VNI is live-in to MBB. + DEBUG(dbgs() << " live-in at " << BlockStart << '\n'); + LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); + + // Make sure VNI is live-out from the predecessors. + for (auto &Pred : MBB->predecessors()) { + if (!LiveOut.insert(Pred).second) + continue; + SlotIndex Stop = Indexes.getMBBEndIdx(Pred); + assert(OldRange.getVNInfoBefore(Stop) == VNI && + "Wrong value out of predecessor"); + WorkList.push_back(std::make_pair(Stop, VNI)); + } + } +} + /// shrinkToUses - After removing some uses of a register, shrink its live /// range to just the remaining uses. This method does not compute reaching /// defs for new uses, and it doesn't remove dead defs. @@ -321,11 +392,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, DEBUG(dbgs() << "Shrink: " << *li << '\n'); assert(TargetRegisterInfo::isVirtualRegister(li->reg) && "Can only shrink virtual registers"); - // Find all the values used, including PHI kills. - SmallVector<std::pair<SlotIndex, VNInfo*>, 16> WorkList; - // Blocks that have already been added to WorkList as live-out. - SmallPtrSet<MachineBasicBlock*, 16> LiveOut; + // Shrink subregister live ranges. + for (LiveInterval::SubRange &S : li->subranges()) { + shrinkToUses(S, li->reg); + } + + // Find all the values used, including PHI kills. + ShrinkToUsesWorkList WorkList; // Visit all instructions reading li->reg. for (MachineRegisterInfo::reg_instr_iterator @@ -356,102 +430,115 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, // Create new live ranges with only minimal live segments per def. 
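+  // (Sketch of the two phases, added commentary with assumed indexes: a value
+  // defined at 16r and used at 48r first gets the minimal segment [16r,16d)
+  // from createSegmentsForValues(); extendSegmentsToUses() then grows it to
+  // [16r,48r) by walking the use worklist back towards the def.)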
LiveRange NewLR; - for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end(); - I != E; ++I) { - VNInfo *VNI = *I; - if (VNI->isUnused()) - continue; - NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI)); - } - - // Keep track of the PHIs that are in use. - SmallPtrSet<VNInfo*, 8> UsedPHIs; - - // Extend intervals to reach all uses in WorkList. - while (!WorkList.empty()) { - SlotIndex Idx = WorkList.back().first; - VNInfo *VNI = WorkList.back().second; - WorkList.pop_back(); - const MachineBasicBlock *MBB = getMBBFromIndex(Idx.getPrevSlot()); - SlotIndex BlockStart = getMBBStartIdx(MBB); - - // Extend the live range for VNI to be live at Idx. - if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) { - (void)ExtVNI; - assert(ExtVNI == VNI && "Unexpected existing value number"); - // Is this a PHIDef we haven't seen before? - if (!VNI->isPHIDef() || VNI->def != BlockStart || !UsedPHIs.insert(VNI)) - continue; - // The PHI is live, make sure the predecessors are live-out. - for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - if (!LiveOut.insert(*PI)) - continue; - SlotIndex Stop = getMBBEndIdx(*PI); - // A predecessor is not required to have a live-out value for a PHI. - if (VNInfo *PVNI = li->getVNInfoBefore(Stop)) - WorkList.push_back(std::make_pair(Stop, PVNI)); - } - continue; - } - - // VNI is live-in to MBB. - DEBUG(dbgs() << " live-in at " << BlockStart << '\n'); - NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI)); - - // Make sure VNI is live-out from the predecessors. - for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - if (!LiveOut.insert(*PI)) - continue; - SlotIndex Stop = getMBBEndIdx(*PI); - assert(li->getVNInfoBefore(Stop) == VNI && - "Wrong value out of predecessor"); - WorkList.push_back(std::make_pair(Stop, VNI)); - } - } - - // Handle dead values. - bool CanSeparate = false; - computeDeadValues(li, NewLR, &CanSeparate, dead); + createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end())); + extendSegmentsToUses(NewLR, *Indexes, WorkList, *li); // Move the trimmed segments back. li->segments.swap(NewLR.segments); + + // Handle dead values. + bool CanSeparate = computeDeadValues(*li, dead); DEBUG(dbgs() << "Shrunk: " << *li << '\n'); return CanSeparate; } -void LiveIntervals::computeDeadValues(LiveInterval *li, - LiveRange &LR, - bool *CanSeparate, +bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { - for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end(); - I != E; ++I) { - VNInfo *VNI = *I; + bool PHIRemoved = false; + for (auto VNI : LI.valnos) { if (VNI->isUnused()) continue; - LiveRange::iterator LRI = LR.FindSegmentContaining(VNI->def); - assert(LRI != LR.end() && "Missing segment for PHI"); - if (LRI->end != VNI->def.getDeadSlot()) + LiveRange::iterator I = LI.FindSegmentContaining(VNI->def); + assert(I != LI.end() && "Missing segment for VNI"); + if (I->end != VNI->def.getDeadSlot()) continue; if (VNI->isPHIDef()) { // This is a dead PHI. Remove it. VNI->markUnused(); - LR.removeSegment(LRI->start, LRI->end); + LI.removeSegment(I); DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n"); - if (CanSeparate) - *CanSeparate = true; + PHIRemoved = true; } else { // This is a dead def. Make sure the instruction knows. 
 MachineInstr *MI = getInstructionFromIndex(VNI->def);
       assert(MI && "No instruction defining live value");
-      MI->addRegisterDead(li->reg, TRI);
+      MI->addRegisterDead(LI.reg, TRI);
       if (dead && MI->allDefsAreDead()) {
         DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
         dead->push_back(MI);
       }
     }
   }
+  return PHIRemoved;
+}
+
+void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
+  DEBUG(dbgs() << "Shrink: " << SR << '\n');
+  assert(TargetRegisterInfo::isVirtualRegister(Reg)
+         && "Can only shrink virtual registers");
+  // Find all the values used, including PHI kills.
+  ShrinkToUsesWorkList WorkList;
+
+  // Visit all instructions reading Reg.
+  SlotIndex LastIdx;
+  for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+    MachineInstr *UseMI = MO.getParent();
+    if (UseMI->isDebugValue())
+      continue;
+    // Maybe the operand is for a subregister we don't care about.
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg != 0) {
+      unsigned SubRegMask = TRI->getSubRegIndexLaneMask(SubReg);
+      if ((SubRegMask & SR.LaneMask) == 0)
+        continue;
+    }
+    // We only need to visit each instruction once.
+    SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+    if (Idx == LastIdx)
+      continue;
+    LastIdx = Idx;
+
+    LiveQueryResult LRQ = SR.Query(Idx);
+    VNInfo *VNI = LRQ.valueIn();
+    // For subranges it is possible that only undef values are left in that
+    // part of the subregister, so there is no real live range at the use.
+    if (!VNI)
+      continue;
+
+    // Special case: An early-clobber tied operand reads and writes the
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
+    WorkList.push_back(std::make_pair(Idx, VNI));
+  }
+
+  // Create a new live range with only minimal live segments per def.
+  LiveRange NewLR;
+  createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end()));
+  extendSegmentsToUses(NewLR, *Indexes, WorkList, SR);
+
+  // Move the trimmed ranges back.
+  SR.segments.swap(NewLR.segments);
+
+  // Remove dead PHI value numbers.
+  for (auto VNI : SR.valnos) {
+    if (VNI->isUnused())
+      continue;
+    const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
+    assert(Segment != nullptr && "Missing segment for VNI");
+    if (Segment->end != VNI->def.getDeadSlot())
+      continue;
+    if (VNI->isPHIDef()) {
+      // This is a dead PHI. Remove it.
+      VNI->markUnused();
+      SR.removeSegment(*Segment);
+      DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
+    }
+  }
+
+  DEBUG(dbgs() << "Shrunk: " << SR << '\n');
 }

 void LiveIntervals::extendToIndices(LiveRange &LR,
@@ -462,26 +549,25 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
     LRCalc->extend(LR, Indices[i]);
 }

-void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
-  LiveQueryResult LRQ = LI->Query(Kill);
-  VNInfo *VNI = LRQ.valueOut();
+  LiveQueryResult LRQ = LR.Query(Kill);
+  VNInfo *VNI = LRQ.valueOutOrDead();
   if (!VNI)
     return;

   MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
-  SlotIndex MBBStart, MBBEnd;
-  std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+  SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB);

   // If VNI isn't live out from KillMBB, the value is trivially pruned.
   if (LRQ.endPoint() < MBBEnd) {
-    LI->removeSegment(Kill, LRQ.endPoint());
+    LR.removeSegment(Kill, LRQ.endPoint());
     if (EndPoints) EndPoints->push_back(LRQ.endPoint());
     return;
   }

   // VNI is live out of KillMBB.
- LI->removeSegment(Kill, MBBEnd); + LR.removeSegment(Kill, MBBEnd); if (EndPoints) EndPoints->push_back(MBBEnd); // Find all blocks that are reachable from KillMBB without leaving VNI's live @@ -498,8 +584,9 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill, MachineBasicBlock *MBB = *I; // Check if VNI is live in to MBB. + SlotIndex MBBStart, MBBEnd; std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB); - LiveQueryResult LRQ = LI->Query(MBBStart); + LiveQueryResult LRQ = LR.Query(MBBStart); if (LRQ.valueIn() != VNI) { // This block isn't part of the VNI segment. Prune the search. I.skipChildren(); @@ -508,34 +595,46 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill, // Prune the search if VNI is killed in MBB. if (LRQ.endPoint() < MBBEnd) { - LI->removeSegment(MBBStart, LRQ.endPoint()); + LR.removeSegment(MBBStart, LRQ.endPoint()); if (EndPoints) EndPoints->push_back(LRQ.endPoint()); I.skipChildren(); continue; } // VNI is live through MBB. - LI->removeSegment(MBBStart, MBBEnd); + LR.removeSegment(MBBStart, MBBEnd); if (EndPoints) EndPoints->push_back(MBBEnd); ++I; } } } +void LiveIntervals::pruneValue(LiveInterval &LI, SlotIndex Kill, + SmallVectorImpl<SlotIndex> *EndPoints) { + pruneValue((LiveRange&)LI, Kill, EndPoints); + + for (LiveInterval::SubRange &SR : LI.subranges()) { + pruneValue(SR, Kill, nullptr); + } +} + //===----------------------------------------------------------------------===// // Register allocator hooks. // void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Keep track of regunit ranges. - SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU; + SmallVector<std::pair<const LiveRange*, LiveRange::const_iterator>, 8> RU; + // Keep track of subregister ranges. + SmallVector<std::pair<const LiveInterval::SubRange*, + LiveRange::const_iterator>, 4> SRs; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; - LiveInterval *LI = &getInterval(Reg); - if (LI->empty()) + const LiveInterval &LI = getInterval(Reg); + if (LI.empty()) continue; // Find the regunit intervals for the assigned register. They may overlap @@ -543,15 +642,22 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { RU.clear(); for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid(); ++Units) { - LiveRange &RURanges = getRegUnit(*Units); - if (RURanges.empty()) + const LiveRange &RURange = getRegUnit(*Units); + if (RURange.empty()) continue; - RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end))); + RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); + } + + if (MRI->tracksSubRegLiveness()) { + SRs.clear(); + for (const LiveInterval::SubRange &SR : LI.subranges()) { + SRs.push_back(std::make_pair(&SR, SR.find(LI.begin()->end))); + } } // Every instruction that kills Reg corresponds to a segment range end // point. - for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE; + for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE; ++RI) { // A block index indicates an MBB edge. if (RI->end.isBlock()) @@ -568,23 +674,80 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // BAR %EAX<kill> // // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX. 
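+      // (Added commentary: the loop below advances one cached iterator per
+      // regunit range; if any regunit live range is still live at the point
+      // where this vreg's segment ends, the physical register survives the
+      // kill point and the kill flag must be suppressed, exactly as in the
+      // FOO/BAR example above.)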
-      bool CancelKill = false;
-      for (unsigned u = 0, e = RU.size(); u != e; ++u) {
-        LiveRange &RRanges = *RU[u].first;
-        LiveRange::iterator &I = RU[u].second;
-        if (I == RRanges.end())
+      for (auto &RUP : RU) {
+        const LiveRange &RURange = *RUP.first;
+        LiveRange::const_iterator &I = RUP.second;
+        if (I == RURange.end())
           continue;
-        I = RRanges.advanceTo(I, RI->end);
-        if (I == RRanges.end() || I->start >= RI->end)
+        I = RURange.advanceTo(I, RI->end);
+        if (I == RURange.end() || I->start >= RI->end)
           continue;
         // I is overlapping RI.
-        CancelKill = true;
-        break;
+        goto CancelKill;
+      }
+
+      if (MRI->tracksSubRegLiveness()) {
+        // When reading a partially undefined value we must not add a kill
+        // flag. The regalloc might have used the undef lane for something
+        // else. Example:
+        //     %vreg1 = ...                  ; R32: %vreg1
+        //     %vreg2:high16 = ...           ; R64: %vreg2
+        //        = read %vreg2<kill>        ; R64: %vreg2
+        //        = read %vreg1              ; R32: %vreg1
+        // The <kill> flag is correct for %vreg2, but the register allocator
+        // may assign R0L to %vreg1, and R0 to %vreg2 because the low 32 bits
+        // of R0 are actually never written by %vreg2. After assignment the
+        // <kill> flag at the read instruction is invalid.
+        unsigned DefinedLanesMask;
+        if (!SRs.empty()) {
+          // Compute a mask of lanes that are defined.
+          DefinedLanesMask = 0;
+          for (auto &SRP : SRs) {
+            const LiveInterval::SubRange &SR = *SRP.first;
+            LiveRange::const_iterator &I = SRP.second;
+            if (I == SR.end())
+              continue;
+            I = SR.advanceTo(I, RI->end);
+            if (I == SR.end() || I->start >= RI->end)
+              continue;
+            // I is overlapping RI.
+            DefinedLanesMask |= SR.LaneMask;
+          }
+        } else
+          DefinedLanesMask = ~0u;
+
+        bool IsFullWrite = false;
+        for (const MachineOperand &MO : MI->operands()) {
+          if (!MO.isReg() || MO.getReg() != Reg)
+            continue;
+          if (MO.isUse()) {
+            // Reading any undefined lanes?
+            unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+            if ((UseMask & ~DefinedLanesMask) != 0)
+              goto CancelKill;
+          } else if (MO.getSubReg() == 0) {
+            // Writing to the full register?
+            assert(MO.isDef());
+            IsFullWrite = true;
+          }
+        }
+
+        // If an instruction writes to a subregister, a new segment starts in
+        // the LiveInterval. But as this only overrides part of the register,
+        // adding kill flags is not correct here after registers have been
+        // assigned.
+        if (!IsFullWrite) {
+          // Next segment has to be adjacent in the subregister write case.
+ LiveRange::const_iterator N = std::next(RI); + if (N != LI.end() && N->start == RI->end) + goto CancelKill; + } } - if (CancelKill) - MI->clearRegisterKills(Reg, nullptr); - else - MI->addRegisterKilled(Reg, nullptr); + + MI->addRegisterKilled(Reg, nullptr); + continue; +CancelKill: + MI->clearRegisterKills(Reg, nullptr); } } } @@ -615,9 +778,7 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { bool LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { - for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); - I != E; ++I) { - const VNInfo *PHI = *I; + for (const VNInfo *PHI : LI.valnos) { if (PHI->isUnused() || !PHI->isPHIDef()) continue; const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def); @@ -767,7 +928,16 @@ public: continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) { LiveInterval &LI = LIS.getInterval(Reg); - updateRange(LI, Reg); + if (LI.hasSubRanges()) { + unsigned SubReg = MO->getSubReg(); + unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg); + for (LiveInterval::SubRange &S : LI.subranges()) { + if ((S.LaneMask & LaneMask) == 0) + continue; + updateRange(S, Reg, S.LaneMask); + } + } + updateRange(LI, Reg, 0); continue; } @@ -775,7 +945,7 @@ public: // precomputed live range. for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) if (LiveRange *LR = getRegUnitLI(*Units)) - updateRange(*LR, *Units); + updateRange(*LR, *Units, 0); } if (hasRegMask) updateRegMaskSlots(); @@ -784,21 +954,24 @@ public: private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. - void updateRange(LiveRange &LR, unsigned Reg) { - if (!Updated.insert(&LR)) + void updateRange(LiveRange &LR, unsigned Reg, unsigned LaneMask) { + if (!Updated.insert(&LR).second) return; DEBUG({ dbgs() << " "; - if (TargetRegisterInfo::isVirtualRegister(Reg)) + if (TargetRegisterInfo::isVirtualRegister(Reg)) { dbgs() << PrintReg(Reg); - else + if (LaneMask != 0) + dbgs() << format(" L%04X", LaneMask); + } else { dbgs() << PrintRegUnit(Reg, &TRI); + } dbgs() << ":\t" << LR << '\n'; }); if (SlotIndex::isEarlierInstr(OldIdx, NewIdx)) handleMoveDown(LR); else - handleMoveUp(LR, Reg); + handleMoveUp(LR, Reg, LaneMask); DEBUG(dbgs() << " -->\t" << LR << '\n'); LR.verify(); } @@ -911,7 +1084,7 @@ private: /// Hoist kill to NewIdx, then scan for last kill between NewIdx and /// OldIdx. /// - void handleMoveUp(LiveRange &LR, unsigned Reg) { + void handleMoveUp(LiveRange &LR, unsigned Reg, unsigned LaneMask) { // First look for a kill at OldIdx. LiveRange::iterator I = LR.find(OldIdx.getBaseIndex()); LiveRange::iterator E = LR.end(); @@ -932,7 +1105,7 @@ private: if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) { // No def, search for the new kill. // This can never be an early clobber kill since there is no def. - std::prev(I)->end = findLastUseBefore(Reg).getRegSlot(); + std::prev(I)->end = findLastUseBefore(Reg, LaneMask).getRegSlot(); return; } } @@ -988,15 +1161,17 @@ private: } // Return the last use of reg between NewIdx and OldIdx. 
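+  // (Added commentary on the LaneMask filter below, assumed masks: when a
+  // subrange with mask 0001 is updated, a use operand that reads only lane
+  // 0002 of the register is skipped and cannot become the last use; operands
+  // without a subregister index always participate.)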
- SlotIndex findLastUseBefore(unsigned Reg) { + SlotIndex findLastUseBefore(unsigned Reg, unsigned LaneMask) { if (TargetRegisterInfo::isVirtualRegister(Reg)) { SlotIndex LastUse = NewIdx; - for (MachineRegisterInfo::use_instr_nodbg_iterator - UI = MRI.use_instr_nodbg_begin(Reg), - UE = MRI.use_instr_nodbg_end(); - UI != UE; ++UI) { - const MachineInstr* MI = &*UI; + for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + unsigned SubReg = MO.getSubReg(); + if (SubReg != 0 && LaneMask != 0 + && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0) + continue; + + const MachineInstr *MI = MO.getParent(); SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI); if (InstSlot > LastUse && InstSlot < OldIdx) LastUse = InstSlot; @@ -1062,6 +1237,94 @@ void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, HME.updateAllRanges(MI); } +void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, + const MachineBasicBlock::iterator End, + const SlotIndex endIdx, + LiveRange &LR, const unsigned Reg, + const unsigned LaneMask) { + LiveInterval::iterator LII = LR.find(endIdx); + SlotIndex lastUseIdx; + if (LII != LR.end() && LII->start < endIdx) + lastUseIdx = LII->end; + else + --LII; + + for (MachineBasicBlock::iterator I = End; I != Begin;) { + --I; + MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; + + SlotIndex instrIdx = getInstructionIndex(MI); + bool isStartValid = getInstructionFromIndex(LII->start); + bool isEndValid = getInstructionFromIndex(LII->end); + + // FIXME: This doesn't currently handle early-clobber or multiple removed + // defs inside of the region to repair. + for (MachineInstr::mop_iterator OI = MI->operands_begin(), + OE = MI->operands_end(); OI != OE; ++OI) { + const MachineOperand &MO = *OI; + if (!MO.isReg() || MO.getReg() != Reg) + continue; + + unsigned SubReg = MO.getSubReg(); + unsigned Mask = TRI->getSubRegIndexLaneMask(SubReg); + if ((Mask & LaneMask) == 0) + continue; + + if (MO.isDef()) { + if (!isStartValid) { + if (LII->end.isDead()) { + SlotIndex prevStart; + if (LII != LR.begin()) + prevStart = std::prev(LII)->start; + + // FIXME: This could be more efficient if there was a + // removeSegment method that returned an iterator. + LR.removeSegment(*LII, true); + if (prevStart.isValid()) + LII = LR.find(prevStart); + else + LII = LR.begin(); + } else { + LII->start = instrIdx.getRegSlot(); + LII->valno->def = instrIdx.getRegSlot(); + if (MO.getSubReg() && !MO.isUndef()) + lastUseIdx = instrIdx.getRegSlot(); + else + lastUseIdx = SlotIndex(); + continue; + } + } + + if (!lastUseIdx.isValid()) { + VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator); + LiveRange::Segment S(instrIdx.getRegSlot(), + instrIdx.getDeadSlot(), VNI); + LII = LR.addSegment(S); + } else if (LII->start != instrIdx.getRegSlot()) { + VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator); + LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI); + LII = LR.addSegment(S); + } + + if (MO.getSubReg() && !MO.isUndef()) + lastUseIdx = instrIdx.getRegSlot(); + else + lastUseIdx = SlotIndex(); + } else if (MO.isUse()) { + // FIXME: This should probably be handled outside of this branch, + // either as part of the def case (for defs inside of the region) or + // after the loop over the region. 
+ if (!isEndValid && !LII->end.isBlock()) + LII->end = instrIdx.getRegSlot(); + if (!lastUseIdx.isValid()) + lastUseIdx = instrIdx.getRegSlot(); + } + } + } +} + void LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, MachineBasicBlock::iterator Begin, @@ -1107,83 +1370,9 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, if (!LI.hasAtLeastOneValue()) continue; - LiveInterval::iterator LII = LI.find(endIdx); - SlotIndex lastUseIdx; - if (LII != LI.end() && LII->start < endIdx) - lastUseIdx = LII->end; - else - --LII; - - for (MachineBasicBlock::iterator I = End; I != Begin;) { - --I; - MachineInstr *MI = I; - if (MI->isDebugValue()) - continue; - - SlotIndex instrIdx = getInstructionIndex(MI); - bool isStartValid = getInstructionFromIndex(LII->start); - bool isEndValid = getInstructionFromIndex(LII->end); - - // FIXME: This doesn't currently handle early-clobber or multiple removed - // defs inside of the region to repair. - for (MachineInstr::mop_iterator OI = MI->operands_begin(), - OE = MI->operands_end(); OI != OE; ++OI) { - const MachineOperand &MO = *OI; - if (!MO.isReg() || MO.getReg() != Reg) - continue; - - if (MO.isDef()) { - if (!isStartValid) { - if (LII->end.isDead()) { - SlotIndex prevStart; - if (LII != LI.begin()) - prevStart = std::prev(LII)->start; - - // FIXME: This could be more efficient if there was a - // removeSegment method that returned an iterator. - LI.removeSegment(*LII, true); - if (prevStart.isValid()) - LII = LI.find(prevStart); - else - LII = LI.begin(); - } else { - LII->start = instrIdx.getRegSlot(); - LII->valno->def = instrIdx.getRegSlot(); - if (MO.getSubReg() && !MO.isUndef()) - lastUseIdx = instrIdx.getRegSlot(); - else - lastUseIdx = SlotIndex(); - continue; - } - } - - if (!lastUseIdx.isValid()) { - VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(), - VNInfoAllocator); - LiveRange::Segment S(instrIdx.getRegSlot(), - instrIdx.getDeadSlot(), VNI); - LII = LI.addSegment(S); - } else if (LII->start != instrIdx.getRegSlot()) { - VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(), - VNInfoAllocator); - LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI); - LII = LI.addSegment(S); - } - - if (MO.getSubReg() && !MO.isUndef()) - lastUseIdx = instrIdx.getRegSlot(); - else - lastUseIdx = SlotIndex(); - } else if (MO.isUse()) { - // FIXME: This should probably be handled outside of this branch, - // either as part of the def case (for defs inside of the region) or - // after the loop over the region. - if (!isEndValid && !LII->end.isBlock()) - LII->end = instrIdx.getRegSlot(); - if (!lastUseIdx.isValid()) - lastUseIdx = instrIdx.getRegSlot(); - } - } + for (LiveInterval::SubRange &S : LI.subranges()) { + repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask); } + repairOldRegInRange(Begin, End, endIdx, LI, Reg); } } diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index d81221b6a526..025d99ce7881 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -26,14 +26,14 @@ using namespace llvm; // Merge a LiveInterval's segments. Guarantee no overlaps. -void LiveIntervalUnion::unify(LiveInterval &VirtReg) { - if (VirtReg.empty()) +void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) { + if (Range.empty()) return; ++Tag; // Insert each of the virtual register's live segments into the map. 
- LiveInterval::iterator RegPos = VirtReg.begin(); - LiveInterval::iterator RegEnd = VirtReg.end(); + LiveRange::const_iterator RegPos = Range.begin(); + LiveRange::const_iterator RegEnd = Range.end(); SegmentIter SegPos = Segments.find(RegPos->start); while (SegPos.valid()) { @@ -53,14 +53,14 @@ void LiveIntervalUnion::unify(LiveInterval &VirtReg) { } // Remove a live virtual register's segments from this union. -void LiveIntervalUnion::extract(LiveInterval &VirtReg) { - if (VirtReg.empty()) +void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) { + if (Range.empty()) return; ++Tag; // Remove each of the virtual register's live segments from the map. - LiveInterval::iterator RegPos = VirtReg.begin(); - LiveInterval::iterator RegEnd = VirtReg.end(); + LiveRange::const_iterator RegPos = Range.begin(); + LiveRange::const_iterator RegEnd = Range.end(); SegmentIter SegPos = Segments.find(RegPos->start); for (;;) { @@ -70,7 +70,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg) { return; // Skip all segments that may have been coalesced. - RegPos = VirtReg.advanceTo(RegPos, SegPos.start()); + RegPos = Range.advanceTo(RegPos, SegPos.start()); if (RegPos == RegEnd) return; diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index a558e142e013..1d46161ad711 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -19,6 +19,13 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" +void LiveRangeCalc::resetLiveOutMap() { + unsigned NumBlocks = MF->getNumBlockIDs(); + Seen.clear(); + Seen.resize(NumBlocks); + Map.resize(NumBlocks); +} + void LiveRangeCalc::reset(const MachineFunction *mf, SlotIndexes *SI, MachineDominatorTree *MDT, @@ -28,104 +35,188 @@ void LiveRangeCalc::reset(const MachineFunction *mf, Indexes = SI; DomTree = MDT; Alloc = VNIA; - - unsigned N = MF->getNumBlockIDs(); - Seen.clear(); - Seen.resize(N); - LiveOut.resize(N); + resetLiveOutMap(); LiveIn.clear(); } -void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) { - assert(MRI && Indexes && "call reset() first"); - - // Visit all def operands. If the same instruction has multiple defs of Reg, - // LR.createDeadDef() will deduplicate. - for (MachineOperand &MO : MRI->def_operands(Reg)) { +static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc, + LiveRange &LR, const MachineOperand &MO) { const MachineInstr *MI = MO.getParent(); - // Find the corresponding slot index. - SlotIndex Idx; + SlotIndex DefIdx; if (MI->isPHI()) - // PHI defs begin at the basic block start index. - Idx = Indexes->getMBBStartIdx(MI->getParent()); + DefIdx = Indexes.getMBBStartIdx(MI->getParent()); else - // Instructions are either normal 'r', or early clobber 'e'. - Idx = Indexes->getInstructionIndex(MI) - .getRegSlot(MO.isEarlyClobber()); + DefIdx = Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber()); // Create the def in LR. This may find an existing def. - LR.createDeadDef(Idx, *Alloc); + LR.createDeadDef(DefIdx, Alloc); +} + +void LiveRangeCalc::calculate(LiveInterval &LI) { + assert(MRI && Indexes && "call reset() first"); + + // Step 1: Create minimal live segments for every definition of Reg. + // Visit all def operands. If the same instruction has multiple defs of Reg, + // createDeadDef() will deduplicate. 
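+  // (Example under assumed lane masks, added commentary: if an existing
+  // subrange has mask 0003 and the def writes only lane 0001, the loop below
+  // splits it into a 0001 range, which receives the dead def, and a 0002
+  // range that keeps the previous liveness untouched.)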
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+  unsigned Reg = LI.reg;
+  for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+    if (!MO.isDef() && !MO.readsReg())
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (LI.hasSubRanges() || (SubReg != 0 && MRI->tracksSubRegLiveness())) {
+      unsigned Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+      // If this is the first time we see a subregister def, initialize
+      // subranges by creating a copy of the main range.
+      if (!LI.hasSubRanges() && !LI.empty()) {
+        unsigned ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+        LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+      }
+
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        // A Mask for subregs common to the existing subrange and the current
+        // def.
+        unsigned Common = S.LaneMask & Mask;
+        if (Common == 0)
+          continue;
+        // A Mask for subregs covered by the subrange but not the current def.
+        unsigned LRest = S.LaneMask & ~Mask;
+        LiveInterval::SubRange *CommonRange;
+        if (LRest != 0) {
+          // Split current subrange into Common and LRest ranges.
+          S.LaneMask = LRest;
+          CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
+        } else {
+          assert(Common == S.LaneMask);
+          CommonRange = &S;
+        }
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
+        Mask &= ~Common;
+      }
+      // Create a new SubRange for subregs we did not cover yet.
+      if (Mask != 0) {
+        LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *NewRange, MO);
+      }
+    }
+
+    // Create the def in the main live range. We do not have to do this if
+    // subranges are tracked, as we recreate the main range later in this
+    // case.
+    if (MO.isDef() && !LI.hasSubRanges())
+      createDeadDef(*Indexes, *Alloc, LI, MO);
+  }
+
+  // We may have created empty live ranges for partially undefined uses; we
+  // cannot keep them because we will not find defs in them later.
+  LI.removeEmptySubRanges();
+
+  // Step 2: Extend live segments to all uses, constructing SSA form as
+  // necessary.
+  if (LI.hasSubRanges()) {
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      resetLiveOutMap();
+      extendToUses(S, Reg, S.LaneMask);
+    }
+    LI.clear();
+    LI.constructMainRangeFromSubranges(*Indexes, *Alloc);
+  } else {
+    resetLiveOutMap();
+    extendToUses(LI, Reg, ~0u);
+  }
 }

-void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");

+  // Visit all def operands. If the same instruction has multiple defs of Reg,
+  // LR.createDeadDef() will deduplicate.
+  for (MachineOperand &MO : MRI->def_operands(Reg))
+    createDeadDef(*Indexes, *Alloc, LR, MO);
+}
+
+
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) {
   // Visit all operands that read Reg. This may include partial defs.
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
     // Clear all kill flags. They will be reinserted after register allocation
     // by LiveIntervalAnalysis::addKillFlags().
     if (MO.isUse())
       MO.setIsKill(false);
+    else {
+      // We only care about uses, but on the main range (mask ~0u) this
+      // includes the "virtual" reads happening for subregister defs.
+      if (Mask != ~0u)
+        continue;
+    }
+
     if (!MO.readsReg())
       continue;
-    // MI is reading Reg. We may have visited MI before if it happens to be
-    // reading Reg multiple times. That is OK, extend() is idempotent.
+ unsigned SubReg = MO.getSubReg(); + if (SubReg != 0) { + unsigned SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); + // Ignore uses not covering the current subrange. + if ((SubRegMask & Mask) == 0) + continue; + } + + // Determine the actual place of the use. const MachineInstr *MI = MO.getParent(); unsigned OpNo = (&MO - &MI->getOperand(0)); - - // Find the SlotIndex being read. - SlotIndex Idx; + SlotIndex UseIdx; if (MI->isPHI()) { assert(!MO.isDef() && "Cannot handle PHI def of partial register."); - // PHI operands are paired: (Reg, PredMBB). - // Extend the live range to be live-out from PredMBB. - Idx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB()); + // The actual place where a phi operand is used is the end of the pred + // MBB. PHI operands are paired: (Reg, PredMBB). + UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB()); } else { - // This is a normal instruction. - Idx = Indexes->getInstructionIndex(MI).getRegSlot(); // Check for early-clobber redefs. + bool isEarlyClobber = false; unsigned DefIdx; - if (MO.isDef()) { - if (MO.isEarlyClobber()) - Idx = Idx.getRegSlot(true); - } else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) { + if (MO.isDef()) + isEarlyClobber = MO.isEarlyClobber(); + else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) { // FIXME: This would be a lot easier if tied early-clobber uses also // had an early-clobber flag. - if (MI->getOperand(DefIdx).isEarlyClobber()) - Idx = Idx.getRegSlot(true); + isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber(); } + UseIdx = Indexes->getInstructionIndex(MI).getRegSlot(isEarlyClobber); } - extend(LR, Idx, Reg); + + // MI is reading Reg. We may have visited MI before if it happens to be + // reading Reg multiple times. That is OK, extend() is idempotent. + extend(LR, UseIdx, Reg); } } -// Transfer information from the LiveIn vector to the live ranges. -void LiveRangeCalc::updateLiveIns() { +void LiveRangeCalc::updateFromLiveIns() { LiveRangeUpdater Updater; - for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(), - E = LiveIn.end(); I != E; ++I) { - if (!I->DomNode) + for (const LiveInBlock &I : LiveIn) { + if (!I.DomNode) continue; - MachineBasicBlock *MBB = I->DomNode->getBlock(); - assert(I->Value && "No live-in value found"); + MachineBasicBlock *MBB = I.DomNode->getBlock(); + assert(I.Value && "No live-in value found"); SlotIndex Start, End; std::tie(Start, End) = Indexes->getMBBRange(MBB); - if (I->Kill.isValid()) + if (I.Kill.isValid()) // Value is killed inside this block. - End = I->Kill; + End = I.Kill; else { // The value is live-through, update LiveOut as well. // Defer the Domtree lookup until it is needed. assert(Seen.test(MBB->getNumber())); - LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)nullptr); + Map[MBB] = LiveOutPair(I.Value, nullptr); } - Updater.setDest(&I->LR); - Updater.add(Start, End, I->Value); + Updater.setDest(&I.LR); + Updater.add(Start, End, I.Value); } LiveIn.clear(); } @@ -162,7 +253,7 @@ void LiveRangeCalc::calculateValues() { assert(Indexes && "Missing SlotIndexes"); assert(DomTree && "Missing dominator tree"); updateSSA(); - updateLiveIns(); + updateFromLiveIns(); } @@ -202,7 +293,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB, // Is this a known live-out block? 
if (Seen.test(Pred->getNumber())) { - if (VNInfo *VNI = LiveOut[Pred].first) { + if (VNInfo *VNI = Map[Pred].first) { if (TheVNI && TheVNI != VNI) UniqueVNI = false; TheVNI = VNI; @@ -251,8 +342,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB, if (*I == KillMBBNum && Kill.isValid()) End = Kill; else - LiveOut[MF->getBlockNumbered(*I)] = - LiveOutPair(TheVNI, nullptr); + Map[MF->getBlockNumbered(*I)] = LiveOutPair(TheVNI, nullptr); Updater.add(Start, End, TheVNI); } return true; @@ -285,9 +375,8 @@ void LiveRangeCalc::updateSSA() { Changes = 0; // Propagate live-out values down the dominator tree, inserting phi-defs // when necessary. - for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(), - E = LiveIn.end(); I != E; ++I) { - MachineDomTreeNode *Node = I->DomNode; + for (LiveInBlock &I : LiveIn) { + MachineDomTreeNode *Node = I.DomNode; // Skip block if the live-in value has already been determined. if (!Node) continue; @@ -303,16 +392,16 @@ void LiveRangeCalc::updateSSA() { // immediate dominator. Check if any of them have live-out values that are // properly dominated by IDom. If so, we need a phi-def here. if (!needPHI) { - IDomValue = LiveOut[IDom->getBlock()]; + IDomValue = Map[IDom->getBlock()]; // Cache the DomTree node that defined the value. if (IDomValue.first && !IDomValue.second) - LiveOut[IDom->getBlock()].second = IDomValue.second = + Map[IDom->getBlock()].second = IDomValue.second = DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def)); for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) { - LiveOutPair &Value = LiveOut[*PI]; + LiveOutPair &Value = Map[*PI]; if (!Value.first || Value.first == IDomValue.first) continue; @@ -334,7 +423,7 @@ void LiveRangeCalc::updateSSA() { // The value may be live-through even if Kill is set, as can happen when // we are called from extendRange. In that case LiveOutSeen is true, and // LiveOut indicates a foreign or missing value. - LiveOutPair &LOP = LiveOut[MBB]; + LiveOutPair &LOP = Map[MBB]; // Create a phi-def if required. if (needPHI) { @@ -342,25 +431,25 @@ void LiveRangeCalc::updateSSA() { assert(Alloc && "Need VNInfo allocator to create PHI-defs"); SlotIndex Start, End; std::tie(Start, End) = Indexes->getMBBRange(MBB); - LiveRange &LR = I->LR; + LiveRange &LR = I.LR; VNInfo *VNI = LR.getNextValue(Start, *Alloc); - I->Value = VNI; + I.Value = VNI; // This block is done, we know the final value. - I->DomNode = nullptr; + I.DomNode = nullptr; - // Add liveness since updateLiveIns now skips this node. - if (I->Kill.isValid()) - LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI)); + // Add liveness since updateFromLiveIns now skips this node. + if (I.Kill.isValid()) + LR.addSegment(LiveInterval::Segment(Start, I.Kill, VNI)); else { LR.addSegment(LiveInterval::Segment(Start, End, VNI)); LOP = LiveOutPair(VNI, Node); } } else if (IDomValue.first) { // No phi-def here. Remember incoming value. - I->Value = IDomValue.first; + I.Value = IDomValue.first; // If the IDomValue is killed in the block, don't propagate through. 
- if (I->Kill.isValid()) + if (I.Kill.isValid()) continue; // Propagate IDomValue if it isn't killed: diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index 67ab5596d299..1b9099b5fc7e 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -19,8 +19,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_LIVERANGECALC_H -#define LLVM_CODEGEN_LIVERANGECALC_H +#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H +#define LLVM_LIB_CODEGEN_LIVERANGECALC_H #include "llvm/ADT/BitVector.h" #include "llvm/ADT/IndexedMap.h" @@ -40,12 +40,6 @@ class LiveRangeCalc { MachineDominatorTree *DomTree; VNInfo::Allocator *Alloc; - /// Seen - Bit vector of active entries in LiveOut, also used as a visited - /// set by findReachingDefs. One entry per basic block, indexed by block - /// number. This is kept as a separate bit vector because it can be cleared - /// quickly when switching live ranges. - BitVector Seen; - /// LiveOutPair - A value and the block that defined it. The domtree node is /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)]. typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair; @@ -53,8 +47,14 @@ class LiveRangeCalc { /// LiveOutMap - Map basic blocks to the value leaving the block. typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap; - /// LiveOut - Map each basic block where a live range is live out to the - /// live-out value and its defining block. + /// Bit vector of active entries in LiveOut, also used as a visited set by + /// findReachingDefs. One entry per basic block, indexed by block number. + /// This is kept as a separate bit vector because it can be cleared quickly + /// when switching live ranges. + BitVector Seen; + + /// Map each basic block where a live range is live out to the live-out value + /// and its defining block. /// /// For every basic block, MBB, one of these conditions shall be true: /// @@ -70,7 +70,7 @@ class LiveRangeCalc { /// /// The map can be shared by multiple live ranges as long as no two are /// live-out of the same block. - LiveOutMap LiveOut; + LiveOutMap Map; /// LiveInBlock - Information about a basic block where a live range is known /// to be live-in, but the value has not yet been determined. @@ -121,8 +121,18 @@ class LiveRangeCalc { /// blocks. No values are read from the live ranges. void updateSSA(); - /// Add liveness as specified in the LiveIn vector. - void updateLiveIns(); + /// Transfer information from the LiveIn vector to the live ranges and update + /// the given @p LiveOuts. + void updateFromLiveIns(); + + /// Extend the live range of @p LR to reach all uses of Reg. + /// + /// All uses must be jointly dominated by existing liveness. PHI-defs are + /// inserted as needed to preserve SSA form. + void extendToUses(LiveRange &LR, unsigned Reg, unsigned LaneMask); + + /// Reset Map and Seen fields. + void resetLiveOutMap(); public: LiveRangeCalc() : MF(nullptr), MRI(nullptr), Indexes(nullptr), @@ -167,22 +177,19 @@ public: /// minimal live range. void createDeadDefs(LiveRange &LR, unsigned Reg); - /// createDeadDefs - Create a dead def in LI for every def of LI->reg. - void createDeadDefs(LiveInterval &LI) { - createDeadDefs(LI, LI.reg); - } - - /// extendToUses - Extend the live range of LI to reach all uses of Reg. + /// Extend the live range of @p LR to reach all uses of Reg. /// /// All uses must be jointly dominated by existing liveness. PHI-defs are /// inserted as needed to preserve SSA form. 
- void extendToUses(LiveRange &LR, unsigned Reg); - - /// extendToUses - Extend the live range of LI to reach all uses of LI->reg. - void extendToUses(LiveInterval &LI) { - extendToUses(LI, LI.reg); + void extendToUses(LiveRange &LR, unsigned PhysReg) { + extendToUses(LR, PhysReg, ~0u); } + /// Calculates liveness for the register specified in live interval @p LI. + /// Creates subregister live ranges as needed if subreg liveness tracking is + /// enabled. + void calculate(LiveInterval &LI); + //===--------------------------------------------------------------------===// // Low-level interface. //===--------------------------------------------------------------------===// @@ -204,7 +211,7 @@ public: /// addLiveInBlock(). void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) { Seen.set(MBB->getNumber()); - LiveOut[MBB] = LiveOutPair(VNI, nullptr); + Map[MBB] = LiveOutPair(VNI, nullptr); } /// addLiveInBlock - Add a block with an unknown live-in value. This diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 431241fbfb63..47da205df694 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -60,9 +60,7 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI, } void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) { - for (LiveInterval::vni_iterator I = getParent().vni_begin(), - E = getParent().vni_end(); I != E; ++I) { - VNInfo *VNI = *I; + for (VNInfo *VNI : getParent().valnos) { if (VNI->isUnused()) continue; MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); @@ -135,7 +133,7 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, } // If only cheap remats were requested, bail out early. - if (cheapAsAMove && !RM.OrigMI->isAsCheapAsAMove()) + if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI)) return false; // Verify that all used registers are available with the same values. @@ -286,8 +284,16 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { if (TheDelegate) TheDelegate->LRE_WillShrinkVirtReg(LI.reg); LI.removeValNo(VNI); - if (LI.empty()) + if (LI.empty()) { RegsToErase.push_back(Reg); + } else { + // Also remove the value in subranges. 
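+      // (Added commentary: a value removed from the main range at Idx must
+      // also disappear from every subrange with a VNInfo at Idx; otherwise
+      // the subranges would claim lanes live past the main range and violate
+      // the covers() invariant checked in LiveInterval::verify().)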
+ for (LiveInterval::SubRange &S : LI.subranges()) { + if (VNInfo *SVNI = S.getVNInfoAt(Idx)) + S.removeValNo(SVNI); + } + LI.removeEmptySubRanges(); + } } } } @@ -411,8 +417,11 @@ LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF, for (unsigned I = 0, Size = size(); I < Size; ++I) { LiveInterval &LI = LIS.getInterval(get(I)); if (MRI.recomputeRegClass(LI.reg, MF.getTarget())) - DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to " - << MRI.getRegClass(LI.reg)->getName() << '\n'); + DEBUG({ + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + dbgs() << "Inflated " << PrintReg(LI.reg) << " to " + << TRI->getRegClassName(MRI.getRegClass(LI.reg)) << '\n'; + }); VRAI.calculateSpillWeightAndHint(LI); } } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index de2ce22f7fbf..154ce6fc122b 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -18,8 +18,8 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -48,7 +48,7 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { } bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { - TRI = MF.getTarget().getRegisterInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); VRM = &getAnalysis<VirtRegMap>(); @@ -72,16 +72,44 @@ void LiveRegMatrix::releaseMemory() { } } +template<typename Callable> +bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval, + unsigned PhysReg, Callable Func) { + if (VRegInterval.hasSubRanges()) { + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + unsigned Unit = (*Units).first; + unsigned Mask = (*Units).second; + for (LiveInterval::SubRange &S : VRegInterval.subranges()) { + if (S.LaneMask & Mask) { + if (Func(Unit, S)) + return true; + break; + } + } + } + } else { + for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + if (Func(*Units, VRegInterval)) + return true; + } + } + return false; +} + void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI) << " to " << PrintReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); MRI->setPhysRegUsed(PhysReg); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); - Matrix[*Units].unify(VirtReg); - } + + foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, + const LiveRange &Range) { + DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << ' ' << Range); + Matrix[Unit].unify(VirtReg, Range); + return false; + }); + ++NumAssigned; DEBUG(dbgs() << '\n'); } @@ -91,10 +119,14 @@ void LiveRegMatrix::unassign(LiveInterval &VirtReg) { DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI) << " from " << PrintReg(PhysReg, TRI) << ':'); VRM->clearVirt(VirtReg.reg); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI)); - Matrix[*Units].extract(VirtReg); - } + + foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, + const LiveRange &Range) { + DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI)); + Matrix[Unit].extract(VirtReg, 
Range); + return false; + }); + ++NumUnassigned; DEBUG(dbgs() << '\n'); } @@ -122,12 +154,13 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg, if (VirtReg.empty()) return false; CoalescerPair CP(VirtReg.reg, PhysReg, *TRI); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - const LiveRange &UnitRange = LIS->getRegUnit(*Units); - if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes())) - return true; - } - return false; + + bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, + const LiveRange &Range) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes()); + }); + return Result; } LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg, diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index b3161a459707..8a6ac251ab2d 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -20,6 +20,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <limits> using namespace llvm; @@ -49,7 +50,7 @@ void LiveStacks::releaseMemory() { } bool LiveStacks::runOnMachineFunction(MachineFunction &MF) { - TRI = MF.getTarget().getRegisterInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); // FIXME: No analysis is being done right now. We are relying on the // register allocators to provide the information. return false; @@ -80,7 +81,7 @@ void LiveStacks::print(raw_ostream &OS, const Module*) const { int Slot = I->first; const TargetRegisterClass *RC = getIntervalRegClass(Slot); if (RC) - OS << " [" << RC->getName() << "]\n"; + OS << " [" << TRI->getRegClassName(RC) << "]\n"; else OS << " [Unknown]\n"; } diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 758b2163ff4f..c4bca5f3a11e 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -37,7 +37,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include <algorithm> using namespace llvm; @@ -497,17 +496,135 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI, } } +void LiveVariables::runOnInstr(MachineInstr *MI, + SmallVectorImpl<unsigned> &Defs) { + assert(!MI->isDebugValue()); + // Process all of the operands of the instruction... + unsigned NumOperandsToProcess = MI->getNumOperands(); + + // Unless it is a PHI node. In this case, ONLY process the DEF, not any + // of the uses. They will be handled in other basic blocks. + if (MI->isPHI()) + NumOperandsToProcess = 1; + + // Clear kill and dead markers. LV will recompute them. + SmallVector<unsigned, 4> UseRegs; + SmallVector<unsigned, 4> DefRegs; + SmallVector<unsigned, 1> RegMasks; + for (unsigned i = 0; i != NumOperandsToProcess; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isRegMask()) { + RegMasks.push_back(i); + continue; + } + if (!MO.isReg() || MO.getReg() == 0) + continue; + unsigned MOReg = MO.getReg(); + if (MO.isUse()) { + MO.setIsKill(false); + if (MO.readsReg()) + UseRegs.push_back(MOReg); + } else /*MO.isDef()*/ { + MO.setIsDead(false); + DefRegs.push_back(MOReg); + } + } + + MachineBasicBlock *MBB = MI->getParent(); + // Process all uses. 
+ for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) { + unsigned MOReg = UseRegs[i]; + if (TargetRegisterInfo::isVirtualRegister(MOReg)) + HandleVirtRegUse(MOReg, MBB, MI); + else if (!MRI->isReserved(MOReg)) + HandlePhysRegUse(MOReg, MI); + } + + // Process all masked registers. (Call clobbers). + for (unsigned i = 0, e = RegMasks.size(); i != e; ++i) + HandleRegMask(MI->getOperand(RegMasks[i])); + + // Process all defs. + for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) { + unsigned MOReg = DefRegs[i]; + if (TargetRegisterInfo::isVirtualRegister(MOReg)) + HandleVirtRegDef(MOReg, MI); + else if (!MRI->isReserved(MOReg)) + HandlePhysRegDef(MOReg, MI, Defs); + } + UpdatePhysRegDefs(MI, Defs); +} + +void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { + // Mark live-in registers as live-in. + SmallVector<unsigned, 4> Defs; + for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(), + EE = MBB->livein_end(); II != EE; ++II) { + assert(TargetRegisterInfo::isPhysicalRegister(*II) && + "Cannot have a live-in virtual register!"); + HandlePhysRegDef(*II, nullptr, Defs); + } + + // Loop over all of the instructions, processing them. + DistanceMap.clear(); + unsigned Dist = 0; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; + DistanceMap.insert(std::make_pair(MI, Dist++)); + + runOnInstr(MI, Defs); + } + + // Handle any virtual assignments from PHI nodes which might be at the + // bottom of this basic block. We check all of our successor blocks to see + // if they have PHI nodes, and if so, we simulate an assignment at the end + // of the current block. + if (!PHIVarInfo[MBB->getNumber()].empty()) { + SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()]; + + for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(), + E = VarInfoVec.end(); I != E; ++I) + // Mark it alive only in the block we are representing. + MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(), + MBB); + } + + // MachineCSE may CSE instructions which write to non-allocatable physical + // registers across MBBs. Remember if any reserved register is liveout. + SmallSet<unsigned, 4> LiveOuts; + for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) { + MachineBasicBlock *SuccMBB = *SI; + if (SuccMBB->isLandingPad()) + continue; + for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(), + LE = SuccMBB->livein_end(); LI != LE; ++LI) { + unsigned LReg = *LI; + if (!TRI->isInAllocatableClass(LReg)) + // Ignore other live-ins, e.g. those that are live into landing pads. + LiveOuts.insert(LReg); + } + } + + // Loop over PhysRegDef / PhysRegUse, killing any registers that are + // available at the end of the basic block. 
+ for (unsigned i = 0; i != NumRegs; ++i) + if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i)) + HandlePhysRegDef(i, nullptr, Defs); +} + bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MRI = &mf.getRegInfo(); - TRI = MF->getTarget().getRegisterInfo(); - - unsigned NumRegs = TRI->getNumRegs(); - PhysRegDef = new MachineInstr*[NumRegs]; - PhysRegUse = new MachineInstr*[NumRegs]; - PHIVarInfo = new SmallVector<unsigned, 4>[MF->getNumBlockIDs()]; - std::fill(PhysRegDef, PhysRegDef + NumRegs, nullptr); - std::fill(PhysRegUse, PhysRegUse + NumRegs, nullptr); + TRI = MF->getSubtarget().getRegisterInfo(); + + const unsigned NumRegs = TRI->getNumRegs(); + PhysRegDef.assign(NumRegs, nullptr); + PhysRegUse.assign(NumRegs, nullptr); + PHIVarInfo.resize(MF->getNumBlockIDs()); PHIJoins.clear(); // FIXME: LiveIntervals will be updated to remove its dependence on @@ -525,124 +642,11 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { MachineBasicBlock *Entry = MF->begin(); SmallPtrSet<MachineBasicBlock*,16> Visited; - for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> > - DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited); - DFI != E; ++DFI) { - MachineBasicBlock *MBB = *DFI; - - // Mark live-in registers as live-in. - SmallVector<unsigned, 4> Defs; - for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(), - EE = MBB->livein_end(); II != EE; ++II) { - assert(TargetRegisterInfo::isPhysicalRegister(*II) && - "Cannot have a live-in virtual register!"); - HandlePhysRegDef(*II, nullptr, Defs); - } - - // Loop over all of the instructions, processing them. - DistanceMap.clear(); - unsigned Dist = 0; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - MachineInstr *MI = I; - if (MI->isDebugValue()) - continue; - DistanceMap.insert(std::make_pair(MI, Dist++)); - - // Process all of the operands of the instruction... - unsigned NumOperandsToProcess = MI->getNumOperands(); - - // Unless it is a PHI node. In this case, ONLY process the DEF, not any - // of the uses. They will be handled in other basic blocks. - if (MI->isPHI()) - NumOperandsToProcess = 1; - - // Clear kill and dead markers. LV will recompute them. - SmallVector<unsigned, 4> UseRegs; - SmallVector<unsigned, 4> DefRegs; - SmallVector<unsigned, 1> RegMasks; - for (unsigned i = 0; i != NumOperandsToProcess; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isRegMask()) { - RegMasks.push_back(i); - continue; - } - if (!MO.isReg() || MO.getReg() == 0) - continue; - unsigned MOReg = MO.getReg(); - if (MO.isUse()) { - MO.setIsKill(false); - if (MO.readsReg()) - UseRegs.push_back(MOReg); - } else /*MO.isDef()*/ { - MO.setIsDead(false); - DefRegs.push_back(MOReg); - } - } - - // Process all uses. - for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) { - unsigned MOReg = UseRegs[i]; - if (TargetRegisterInfo::isVirtualRegister(MOReg)) - HandleVirtRegUse(MOReg, MBB, MI); - else if (!MRI->isReserved(MOReg)) - HandlePhysRegUse(MOReg, MI); - } - - // Process all masked registers. (Call clobbers). - for (unsigned i = 0, e = RegMasks.size(); i != e; ++i) - HandleRegMask(MI->getOperand(RegMasks[i])); - - // Process all defs. 
- for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) { - unsigned MOReg = DefRegs[i]; - if (TargetRegisterInfo::isVirtualRegister(MOReg)) - HandleVirtRegDef(MOReg, MI); - else if (!MRI->isReserved(MOReg)) - HandlePhysRegDef(MOReg, MI, Defs); - } - UpdatePhysRegDefs(MI, Defs); - } - - // Handle any virtual assignments from PHI nodes which might be at the - // bottom of this basic block. We check all of our successor blocks to see - // if they have PHI nodes, and if so, we simulate an assignment at the end - // of the current block. - if (!PHIVarInfo[MBB->getNumber()].empty()) { - SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()]; - - for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(), - E = VarInfoVec.end(); I != E; ++I) - // Mark it alive only in the block we are representing. - MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(), - MBB); - } - - // MachineCSE may CSE instructions which write to non-allocatable physical - // registers across MBBs. Remember if any reserved register is liveout. - SmallSet<unsigned, 4> LiveOuts; - for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) { - MachineBasicBlock *SuccMBB = *SI; - if (SuccMBB->isLandingPad()) - continue; - for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(), - LE = SuccMBB->livein_end(); LI != LE; ++LI) { - unsigned LReg = *LI; - if (!TRI->isInAllocatableClass(LReg)) - // Ignore other live-ins, e.g. those that are live into landing pads. - LiveOuts.insert(LReg); - } - } - - // Loop over PhysRegDef / PhysRegUse, killing any registers that are - // available at the end of the basic block. - for (unsigned i = 0; i != NumRegs; ++i) - if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i)) - HandlePhysRegDef(i, nullptr, Defs); + for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) { + runOnBlock(MBB, NumRegs); - std::fill(PhysRegDef, PhysRegDef + NumRegs, nullptr); - std::fill(PhysRegUse, PhysRegUse + NumRegs, nullptr); + PhysRegDef.assign(NumRegs, nullptr); + PhysRegUse.assign(NumRegs, nullptr); } // Convert and transfer the dead / killed information we have gathered into @@ -664,9 +668,9 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { assert(Visited.count(&*i) != 0 && "unreachable basic block found"); #endif - delete[] PhysRegDef; - delete[] PhysRegUse; - delete[] PHIVarInfo; + PhysRegDef.clear(); + PhysRegUse.clear(); + PHIVarInfo.clear(); return false; } diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 36885e8890d5..e8bf687a626e 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -102,7 +103,7 @@ INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc", bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned LocalObjectCount = MFI->getObjectIndexEnd(); // If the target doesn't want/need this pass, or if there are no locals @@ -183,7 +184,7 @@ void LocalStackSlotPass::AssignProtectedObjSet(const StackObjSet &UnassignedObjs void 
LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // Loop over all of the stack objects, assigning sequential addresses... MachineFrameInfo *MFI = Fn.getFrameInfo(); - const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); bool StackGrowsDown = TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; int64_t Offset = 0; @@ -272,8 +273,8 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { bool UsedBaseReg = false; MachineFrameInfo *MFI = Fn.getFrameInfo(); - const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo(); - const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); bool StackGrowsDown = TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; @@ -290,6 +291,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // Debug value, stackmap and patchpoint instructions can't be out of // range, so they don't need any updates. if (MI->isDebugValue() || + MI->getOpcode() == TargetOpcode::STATEPOINT || MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT) continue; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 08fef5ffaf44..3c7390527e5f 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -24,7 +24,6 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/LeakDetector.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Support/Debug.h" @@ -32,6 +31,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -44,7 +44,6 @@ MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb) } MachineBasicBlock::~MachineBasicBlock() { - LeakDetector::removeGarbageObject(this); } /// getSymbol - Return the MCSymbol for this basic block. @@ -53,8 +52,7 @@ MCSymbol *MachineBasicBlock::getSymbol() const { if (!CachedMCSymbol) { const MachineFunction *MF = getParent(); MCContext &Ctx = MF->getContext(); - const TargetMachine &TM = MF->getTarget(); - const char *Prefix = TM.getDataLayout()->getPrivateGlobalPrefix(); + const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix(); CachedMCSymbol = Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(getNumber())); @@ -85,14 +83,11 @@ void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock *N) { for (MachineBasicBlock::instr_iterator I = N->instr_begin(), E = N->instr_end(); I != E; ++I) I->AddRegOperandsToUseLists(RegInfo); - - LeakDetector::removeGarbageObject(N); } void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) { N->getParent()->removeFromMBBNumbering(N->Number); N->Number = -1; - LeakDetector::addGarbageObject(N); } @@ -107,8 +102,6 @@ void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { // use/def lists. 
MachineFunction *MF = Parent->getParent(); N->AddRegOperandsToUseLists(MF->getRegInfo()); - - LeakDetector::removeGarbageObject(N); } /// removeNodeFromList (MI) - When we remove an instruction from a basic block @@ -122,8 +115,6 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { N->RemoveRegOperandsFromUseLists(MF->getRegInfo()); N->setParent(nullptr); - - LeakDetector::addGarbageObject(N); } /// transferNodesFromList (MI) - When moving a range of instructions from one @@ -290,7 +281,7 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << '\n'; - const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; @@ -359,7 +350,7 @@ MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) { bool LiveIn = isLiveIn(PhysReg); iterator I = SkipPHIsAndLabels(begin()), E = end(); MachineRegisterInfo &MRI = getParent()->getRegInfo(); - const TargetInstrInfo &TII = *getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo(); // Look for an existing copy. if (LiveIn) @@ -390,7 +381,7 @@ void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) { } void MachineBasicBlock::updateTerminator() { - const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); // A block with no successors has no concerns with fall-through edges. if (this->succ_empty()) return; @@ -645,7 +636,7 @@ bool MachineBasicBlock::canFallThrough() { // Analyze the branches, if any, at the end of the block. MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo(); if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) { // If we couldn't analyze the branch, examine the last instruction. // If the block doesn't end in a known control barrier, assume fallthrough @@ -690,7 +681,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // We may need to update this's terminator, but we can't do that if // AnalyzeBranch fails. If this uses a jump table, we won't touch it. - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) @@ -795,7 +786,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { NMBB->addSuccessor(Succ); if (!NMBB->isLayoutSuccessor(Succ)) { Cond.clear(); - MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, dl); + MF->getSubtarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, + dl); if (Indexes) { for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end(); @@ -823,7 +815,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { NMBB->addLiveIn(*I); // Update LiveVariables. - const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); if (LV) { // Restore kills of virtual registers that were killed by the terminators. 
while (!KilledRegs.empty()) { @@ -903,31 +895,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { } if (MachineDominatorTree *MDT = - P->getAnalysisIfAvailable<MachineDominatorTree>()) { - // Update dominator information. - MachineDomTreeNode *SucccDTNode = MDT->getNode(Succ); - - bool IsNewIDom = true; - for (const_pred_iterator PI = Succ->pred_begin(), E = Succ->pred_end(); - PI != E; ++PI) { - MachineBasicBlock *PredBB = *PI; - if (PredBB == NMBB) - continue; - if (!MDT->dominates(SucccDTNode, MDT->getNode(PredBB))) { - IsNewIDom = false; - break; - } - } - - // We know "this" dominates the newly created basic block. - MachineDomTreeNode *NewDTNode = MDT->addNewBlock(NMBB, this); - - // If all the other predecessors of "Succ" are dominated by "Succ" itself - // then the new block is the new immediate dominator of "Succ". Otherwise, - // the new block doesn't dominate anything. - if (IsNewIDom) - MDT->changeImmediateDominator(SucccDTNode, NewDTNode); - } + P->getAnalysisIfAvailable<MachineDominatorTree>()) + MDT->recordSplitCriticalEdge(this, Succ, NMBB); if (MachineLoopInfo *MLI = P->getAnalysisIfAvailable<MachineLoopInfo>()) if (MachineLoop *TIL = MLI->getLoopFor(this)) { @@ -1086,7 +1055,7 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, MachineBasicBlock::succ_iterator SI = succ_begin(); while (SI != succ_end()) { const MachineBasicBlock *MBB = *SI; - if (!SeenMBBs.insert(MBB) || + if (!SeenMBBs.insert(MBB).second || (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) { // This is a superfluous edge, remove it. SI = removeSuccessor(SI); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 74af1e2d64b0..aaa7d9156976 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -812,7 +813,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, BE = L.block_end(); BI != BE; ++BI) { BlockChain &Chain = *BlockToChain[*BI]; - if (!UpdatedPreds.insert(&Chain)) + if (!UpdatedPreds.insert(&Chain).second) continue; assert(Chain.LoopPredecessors == 0); @@ -913,7 +914,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { MachineBasicBlock *BB = &*FI; BlockChain &Chain = *BlockToChain[BB]; - if (!UpdatedPreds.insert(&Chain)) + if (!UpdatedPreds.insert(&Chain).second) continue; assert(Chain.LoopPredecessors == 0); @@ -1045,9 +1046,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (F.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) return; - unsigned Align = TLI->getPrefLoopAlignment(); - if (!Align) - return; // Don't care about loop alignment. if (FunctionChain.begin() == FunctionChain.end()) return; // Empty chain. @@ -1065,6 +1063,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!L) continue; + unsigned Align = TLI->getPrefLoopAlignment(L); + if (!Align) + continue; // Don't care about loop alignment. + // If the block is cold relative to the function entry don't waste space // aligning it. 
BlockFrequency Freq = MBFI->getBlockFreq(*BI); @@ -1111,8 +1113,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); MLI = &getAnalysis<MachineLoopInfo>(); - TII = F.getTarget().getInstrInfo(); - TLI = F.getTarget().getTargetLowering(); + TII = F.getSubtarget().getInstrInfo(); + TLI = F.getSubtarget().getTargetLowering(); assert(BlockToChain.empty()); buildCFGChains(F); diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 7da439caded4..29604089ad80 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "machine-cse" @@ -78,7 +79,8 @@ namespace { SmallVector<MachineInstr*, 64> Exps; unsigned CurrVN; - bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB); + bool PerformTrivialCopyPropagation(MachineInstr *MI, + MachineBasicBlock *MBB); bool isPhysDefTriviallyDead(unsigned Reg, MachineBasicBlock::const_iterator I, MachineBasicBlock::const_iterator E) const; @@ -112,8 +114,12 @@ INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(MachineCSE, "machine-cse", "Machine Common Subexpression Elimination", false, false) -bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI, - MachineBasicBlock *MBB) { +/// The source register of a COPY machine instruction can be propagated to all +/// its users, and this propagation could increase the probability of finding +/// common subexpressions. If the COPY has only one user, the COPY itself can +/// be removed. +bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, + MachineBasicBlock *MBB) { bool Changed = false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); @@ -122,10 +128,7 @@ bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI, unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - if (!MRI->hasOneNonDBGUse(Reg)) - // Only coalesce single use copies. This ensure the copy will be - // deleted. - continue; + bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg); MachineInstr *DefMI = MRI->getVRegDef(Reg); if (!DefMI->isCopy()) continue; @@ -153,10 +156,14 @@ bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI, continue; DEBUG(dbgs() << "Coalescing: " << *DefMI); DEBUG(dbgs() << "*** to: " << *MI); + // Propagate SrcReg of copies to MI. MO.setReg(SrcReg); MRI->clearKillFlags(SrcReg); - DefMI->eraseFromParent(); - ++NumCoalesces; + // Coalesce single use copies. + if (OnlyOneUse) { + DefMI->eraseFromParent(); + ++NumCoalesces; + } Changed = true; } @@ -380,7 +387,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in // an immediate predecessor. We don't want to increase register pressure and // end up causing other computation to be spilled. 
- if (MI->isAsCheapAsAMove()) { + if (TII->isAsCheapAsAMove(MI)) { MachineBasicBlock *CSBB = CSMI->getParent(); MachineBasicBlock *BB = MI->getParent(); if (CSBB != BB && !CSBB->isSuccessor(BB)) @@ -444,6 +451,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs; SmallVector<unsigned, 2> ImplicitDefsToUpdate; + SmallVector<unsigned, 2> ImplicitDefs; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) { MachineInstr *MI = &*I; ++I; @@ -453,13 +461,15 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { bool FoundCSE = VNT.count(MI); if (!FoundCSE) { - // Look for trivial copy coalescing opportunities. - if (PerformTrivialCoalescing(MI, MBB)) { + // Use trivial copy propagation to find more CSE opportunities. + if (PerformTrivialCopyPropagation(MI, MBB)) { Changed = true; // After coalescing MI itself may become a copy. if (MI->isCopyLike()) continue; + + // Try again to see if CSE is possible. FoundCSE = VNT.count(MI); } } @@ -533,6 +543,12 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // we should make sure it is not dead at CSMI. if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead()) ImplicitDefsToUpdate.push_back(i); + + // Keep track of implicit defs of CSMI and MI, to clear possibly + // made-redundant kill flags. + if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg) + ImplicitDefs.push_back(OldReg); + if (OldReg == NewReg) { --NumDefs; continue; } @@ -573,6 +589,29 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i) CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false); + // Go through implicit defs of CSMI and MI, and clear the kill flags on + // their uses in all the instructions between CSMI and MI. + // We might have made some of the kill flags redundant, consider: + // subs ... %NZCV<imp-def> <- CSMI + // csinc ... %NZCV<imp-use,kill> <- this kill flag isn't valid anymore + // subs ... %NZCV<imp-def> <- MI, to be eliminated + // csinc ... %NZCV<imp-use,kill> + // Since we eliminated MI, and reused a register imp-def'd by CSMI + // (here %NZCV), that register, if it was killed before MI, should have + // that kill flag removed, because its lifetime was extended. + if (CSMI->getParent() == MI->getParent()) { + for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II) + for (auto ImplicitDef : ImplicitDefs) + if (MachineOperand *MO = II->findRegisterUseOperand( + ImplicitDef, /*isKill=*/true, TRI)) + MO->setIsKill(false); + } else { + // If the instructions aren't in the same BB, bail out and clear the + // kill flag on all uses of the imp-def'd register. + for (auto ImplicitDef : ImplicitDefs) + MRI->clearKillFlags(ImplicitDef); + } + if (CrossMBBPhysDef) { // Add physical register defs now coming in from a predecessor to MBB // livein list. 
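[Editor's note: an illustrative sketch of the PerformTrivialCopyPropagation change above; it is not part of the patch and the virtual register names are hypothetical.] Given %vreg1 = COPY %vreg0 followed by %vreg2 = ADD %vreg1, %vreg1, the ADD's uses of %vreg1 are now rewritten to %vreg0 even when the COPY has several other non-debug uses; that can make the ADD value-number equal to an earlier ADD of %vreg0, so VNT.count(MI) finds a CSE candidate it previously missed. The COPY itself is erased only in the single-use case (OnlyOneUse), since otherwise it is still needed by its remaining users.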
@@ -597,6 +636,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { } CSEPairs.clear(); ImplicitDefsToUpdate.clear(); + ImplicitDefs.clear(); } return Changed; @@ -663,8 +703,8 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<MachineDominatorTree>(); diff --git a/lib/CodeGen/MachineCodeEmitter.cpp b/lib/CodeGen/MachineCodeEmitter.cpp deleted file mode 100644 index 81b49784c052..000000000000 --- a/lib/CodeGen/MachineCodeEmitter.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//===-- llvm/CodeGen/MachineCodeEmitter.cpp - Code emission -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/MachineCodeEmitter.h" - -using namespace llvm; - -void MachineCodeEmitter::anchor() { } diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp new file mode 100644 index 000000000000..2931258172bf --- /dev/null +++ b/lib/CodeGen/MachineCombiner.cpp @@ -0,0 +1,435 @@ +//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The machine combiner pass uses machine trace metrics to ensure the combined +// instructions do not lengthen the critical path or the resource depth. +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "machine-combiner" + +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +STATISTIC(NumInstCombined, "Number of machineinst combined"); + +namespace { +class MachineCombiner : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MCSchedModel SchedModel; + MachineRegisterInfo *MRI; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + + TargetSchedModel TSchedModel; + + /// OptSize - True if optimizing for code size. 
+ bool OptSize; + +public: + static char ID; + MachineCombiner() : MachineFunctionPass(ID) { + initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { return "Machine InstCombiner"; } + +private: + bool doSubstitute(unsigned NewSize, unsigned OldSize); + bool combineInstructions(MachineBasicBlock *); + MachineInstr *getOperandDef(const MachineOperand &MO); + unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, + MachineTraceMetrics::Trace BlockTrace); + unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, + MachineTraceMetrics::Trace BlockTrace); + bool + preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg); + bool preservesResourceLen(MachineBasicBlock *MBB, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs); + void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs, + SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC); +}; +} + +char MachineCombiner::ID = 0; +char &llvm::MachineCombinerID = MachineCombiner::ID; + +INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", + "Machine InstCombiner", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", + false, false) + +void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addRequired<MachineTraceMetrics>(); + AU.addPreserved<MachineTraceMetrics>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { + MachineInstr *DefInstr = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + DefInstr = MRI->getUniqueVRegDef(MO.getReg()); + // PHIs have no depth etc. + if (DefInstr && DefInstr->isPHI()) + DefInstr = nullptr; + return DefInstr; +} + +/// getDepth - Computes depth of instructions in vector \p InsInstrs. +/// +/// \param InsInstrs is a vector of machine instructions +/// \param InstrIdxForVirtReg is a dense map of virtual register to index +/// of defining machine instruction in \p InsInstrs +/// \param BlockTrace is a trace of machine instructions +/// +/// \returns Depth of last instruction in \p InsInstrs ("NewRoot") +unsigned +MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, +                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, +                          MachineTraceMetrics::Trace BlockTrace) { + + SmallVector<unsigned, 16> InstrDepth; + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + + // For each instruction in the new sequence compute the depth based on the + // operands. Use the trace information when possible. 
For new operands, which + // are tracked in the InstrIdxForVirtReg map, the depth is looked up in InstrDepth. + for (auto *InstrPtr : InsInstrs) { // for each Use + unsigned IDepth = 0; + DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";); + for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = InstrPtr->getOperand(i); + // Check for virtual register operand. + if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isUse()) + continue; + unsigned DepthOp = 0; + unsigned LatencyOp = 0; + DenseMap<unsigned, unsigned>::iterator II = + InstrIdxForVirtReg.find(MO.getReg()); + if (II != InstrIdxForVirtReg.end()) { + // Operand is a new virtual register not in the trace. + assert(II->second < InstrDepth.size() && "Bad Index"); + MachineInstr *DefInstr = InsInstrs[II->second]; + assert(DefInstr && + "There must be a definition for a new virtual register"); + DepthOp = InstrDepth[II->second]; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } else { + MachineInstr *DefInstr = getOperandDef(MO); + if (DefInstr) { + DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } + } + IDepth = std::max(IDepth, DepthOp + LatencyOp); + } + InstrDepth.push_back(IDepth); + } + unsigned NewRootIdx = InsInstrs.size() - 1; + return InstrDepth[NewRootIdx]; +} + +/// getLatency - Computes instruction latency as max of latency of defined +/// operands. +/// +/// \param Root is a machine instruction that could be replaced by NewRoot. +/// It is used to compute a more accurate latency information for NewRoot in +/// case there is a dependent instruction in the same trace (\p BlockTrace) +/// \param NewRoot is the instruction for which the latency is computed +/// \param BlockTrace is a trace of machine instructions +/// +/// \returns Latency of \p NewRoot +unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, +                                     MachineTraceMetrics::Trace BlockTrace) { + + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + + // Check each definition in NewRoot and compute the latency. + unsigned NewRootLatency = 0; + + for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = NewRoot->getOperand(i); + // Check for virtual register operand. + if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isDef()) + continue; + // Get the first instruction that uses MO. + MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); + RI++; + MachineInstr *UseMO = RI->getParent(); + unsigned LatencyOp = 0; + if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) { + LatencyOp = TSchedModel.computeOperandLatency( + NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, + UseMO->findRegisterUseOperandIdx(MO.getReg())); + } else { + LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); + } + NewRootLatency = std::max(NewRootLatency, LatencyOp); + } + return NewRootLatency; +} + +/// preservesCriticalPathLen - True when the new instruction sequence does not +/// lengthen the critical path. The DAGCombine code sequence ends in MI +/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. 
A +/// necessary condition for the new sequence to replace the old sequence is that +/// it cannot lengthen the critical path. This is decided by the formula +/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack). +/// The slack is the number of cycles Root can be delayed before the critical +/// path becomes longer. +bool MachineCombiner::preservesCriticalPathLen( + MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { + + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + // NewRoot is the last instruction in the \p InsInstrs vector + // Get depth and latency of NewRoot + unsigned NewRootIdx = InsInstrs.size() - 1; + MachineInstr *NewRoot = InsInstrs[NewRootIdx]; + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); + unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); + + // Get depth, latency and slack of Root + unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + unsigned RootLatency = TSchedModel.computeInstrLatency(Root); + unsigned RootSlack = BlockTrace.getInstrSlack(Root); + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth + << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency + << " RootSlack: " << RootSlack << "\n"; + dbgs() << " NewRootDepth + NewRootLatency " + << NewRootDepth + NewRootLatency << "\n"; + dbgs() << " RootDepth + RootLatency + RootSlack " + << RootDepth + RootLatency + RootSlack << "\n";); + + /// True when the new sequence does not lengthen the critical path. + return ((NewRootDepth + NewRootLatency) <= + (RootDepth + RootLatency + RootSlack)); +} + +/// Helper routine to convert instructions into SC. +void MachineCombiner::instr2instrSC( + SmallVectorImpl<MachineInstr *> &Instrs, + SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) { + for (auto *InstrPtr : Instrs) { + unsigned Opc = InstrPtr->getOpcode(); + unsigned Idx = TII->get(Opc).getSchedClass(); + const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx); + InstrsSC.push_back(SC); + } +} +/// preservesResourceLen - True when the new instructions do not increase +/// resource length +bool MachineCombiner::preservesResourceLen( + MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs) { + + // Compute current resource length + + //ArrayRef<const MachineBasicBlock *> MBBarr(MBB); + SmallVector <const MachineBasicBlock *, 1> MBBarr; + MBBarr.push_back(MBB); + unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr); + + // Deal with SC rather than Instructions. 
+ SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC; + SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC; + + instr2instrSC(InsInstrs, InsInstrsSC); + instr2instrSC(DelInstrs, DelInstrsSC); + + ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC); + ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC); + + // Compute new resource length + unsigned ResLenAfterCombine = + BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr); + + DEBUG(dbgs() << "RESOURCE DATA: \n"; + dbgs() << " resource len before: " << ResLenBeforeCombine + << " after: " << ResLenAfterCombine << "\n";); + + return ResLenAfterCombine <= ResLenBeforeCombine; +} + +/// \returns true when the new instruction sequence should be generated +/// regardless of whether it lengthens the critical path or not +bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { + if (OptSize && (NewSize < OldSize)) + return true; + if (!TSchedModel.hasInstrSchedModel()) + return true; + return false; +} + +/// combineInstructions - substitute a slow code sequence with a faster one by +/// evaluating instruction combining patterns. +/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction +/// combining based on machine trace metrics. Only combine a sequence of +/// instructions when this neither lengthens the critical path nor increases +/// resource pressure. When optimizing for code size always combine when the new +/// sequence is shorter. +bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { + bool Changed = false; + DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); + + auto BlockIter = MBB->begin(); + + while (BlockIter != MBB->end()) { + auto &MI = *BlockIter++; + + DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); + SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Pattern; + // The motivating example is: + // + // MUL Other MUL_op1 MUL_op2 Other + // \ / \ | / + // ADD/SUB => MADD/MSUB + // (=Root) (=NewRoot) + + // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is + // usually beneficial for code size, it unfortunately can hurt performance + // when the ADD is on the critical path, but the MUL is not. With the + // substitution the MUL becomes part of the critical path (in form of the + // MADD) and can lengthen it on architectures where the MADD latency is + // longer than the ADD latency. + // + // For each instruction we check if it can be the root of a combiner + // pattern. Then for each pattern the new code sequence in form of MI is + // generated and evaluated. When the efficiency criteria (don't lengthen + // critical path, don't use more resources) are met the new sequence gets + // hooked up into the basic block before the old sequence is removed. + // + // The algorithm does not try to evaluate all patterns and pick the best. + // This is only an artificial restriction though. In practice there is + // mostly one pattern and hasPattern() can order patterns based on an + // internal cost heuristic. 
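// [Editor's note: a worked instance of the preservesCriticalPathLen check above, with hypothetical latencies; it is not part of the patch.] Suppose MUL has latency 4, ADD latency 1, and the combined MADD latency 5. If every MADD operand is ready at cycle 0, then NewRootDepth + NewRootLatency = 0 + 5 = 5 while RootDepth + RootLatency + RootSlack = 4 + 1 + 0 = 5, so the substitution is accepted (5 <= 5). If the addend only becomes available at cycle 2, the left side becomes 2 + 5 = 7 > 5 and the pattern is rejected, because the MADD would lengthen the critical path.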
+ + if (TII->hasPattern(MI, Pattern)) { + for (auto P : Pattern) { + SmallVector<MachineInstr *, 16> InsInstrs; + SmallVector<MachineInstr *, 16> DelInstrs; + DenseMap<unsigned, unsigned> InstrIdxForVirtReg; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); + TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (!InsInstrs.size()) + continue; + // Substitute when we optimize for code size and the new sequence has + // fewer instructions OR + // the new sequence neither lengthens the critical path nor increases + // resource pressure. + if (doSubstitute(InsInstrs.size(), DelInstrs.size()) || + (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, + InstrIdxForVirtReg) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator) & MI, + (MachineInstr *)InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + + Changed = true; + ++NumInstCombined; + + Traces->invalidate(MBB); + Traces->verifyAnalysis(); + // Eagerly stop after the first pattern fires. + break; + } else { + // Cleanup instructions of the alternative code sequence. There is no + // use for them. + for (auto *InstrPtr : InsInstrs) { + MachineFunction *MF = MBB->getParent(); + MF->DeleteMachineInstr((MachineInstr *)InstrPtr); + } + } + InstrIdxForVirtReg.clear(); + } + } + } + + return Changed; +} + +bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &STI = + MF.getTarget().getSubtarget<TargetSubtargetInfo>(); + TII = STI.getInstrInfo(); + TRI = STI.getRegisterInfo(); + SchedModel = STI.getSchedModel(); + TSchedModel.init(SchedModel, &STI, TII); + MRI = &MF.getRegInfo(); + Traces = &getAnalysis<MachineTraceMetrics>(); + MinInstr = 0; + + OptSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); + if (!TII->useMachineCombiner()) { + DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n"); + return false; + } + + bool Changed = false; + + // Try to combine instructions. 
+ for (auto &MBB : MF) + Changed |= combineInstructions(&MBB); + + return Changed; +} diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 3119a35c2e9f..cbd62728ace9 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "codegen-cp" @@ -335,8 +336,8 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - TRI = MF.getTarget().getRegisterInfo(); - TII = MF.getTarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp index 0bee84668f01..acb7c4810b16 100644 --- a/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/lib/CodeGen/MachineDominanceFrontier.cpp @@ -8,8 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineDominanceFrontier.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/Analysis/DominanceFrontierImpl.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/Passes.h" diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 04c8ecbf9bdc..df60cf34b610 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -35,6 +35,8 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { } bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { + CriticalEdgesToSplit.clear(); + NewBBs.clear(); DT->recalculate(F); return false; diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 7e9b7559517d..6b4cba6c6782 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -36,6 +36,7 @@ #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "codegen" @@ -52,17 +53,19 @@ void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) { } MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, - unsigned FunctionNum, MachineModuleInfo &mmi, - GCModuleInfo* gmi) - : Fn(F), Target(TM), Ctx(mmi.getContext()), MMI(mmi), GMI(gmi) { - if (TM.getRegisterInfo()) - RegInfo = new (Allocator) MachineRegisterInfo(TM); + unsigned FunctionNum, MachineModuleInfo &mmi) + : Fn(F), Target(TM), STI(TM.getSubtargetImpl()), Ctx(mmi.getContext()), + MMI(mmi) { + if (STI->getRegisterInfo()) + RegInfo = new (Allocator) MachineRegisterInfo(this); else RegInfo = nullptr; MFInfo = nullptr; - FrameInfo = - new (Allocator) MachineFrameInfo(TM,!F->hasFnAttribute("no-realign-stack")); + FrameInfo = new (Allocator) + MachineFrameInfo(STI->getFrameLowering()->getStackAlignment(), + STI->getFrameLowering()->isStackRealignable(), + !F->hasFnAttribute("no-realign-stack")); if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::StackAlignment)) @@ -70,13 +73,13 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, getStackAlignment(AttributeSet::FunctionIndex)); ConstantPool = 
new (Allocator) MachineConstantPool(TM); - Alignment = TM.getTargetLowering()->getMinFunctionAlignment(); + Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn. if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) Alignment = std::max(Alignment, - TM.getTargetLowering()->getPrefFunctionAlignment()); + STI->getTargetLowering()->getPrefFunctionAlignment()); FunctionNumber = FunctionNum; JumpTableInfo = nullptr; @@ -229,10 +232,10 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { MachineMemOperand * MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, - const MDNode *TBAAInfo, + const AAMDNodes &AAInfo, const MDNode *Ranges) { return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment, - TBAAInfo, Ranges); + AAInfo, Ranges); } MachineMemOperand * @@ -243,12 +246,12 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, MachineMemOperand(MachinePointerInfo(MMO->getValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, - MMO->getBaseAlignment(), nullptr); + MMO->getBaseAlignment()); return new (Allocator) MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, - MMO->getBaseAlignment(), nullptr); + MMO->getBaseAlignment()); } MachineInstr::mmo_iterator @@ -279,7 +282,7 @@ MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin, getMachineMemOperand((*I)->getPointerInfo(), (*I)->getFlags() & ~MachineMemOperand::MOStore, (*I)->getSize(), (*I)->getBaseAlignment(), - (*I)->getTBAAInfo()); + (*I)->getAAInfo()); Result[Index] = JustLoad; } ++Index; @@ -311,7 +314,7 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin, getMachineMemOperand((*I)->getPointerInfo(), (*I)->getFlags() & ~MachineMemOperand::MOLoad, (*I)->getSize(), (*I)->getBaseAlignment(), - (*I)->getTBAAInfo()); + (*I)->getAAInfo()); Result[Index] = JustStore; } ++Index; @@ -350,7 +353,7 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { // Print Constant Pool ConstantPool->print(OS); - const TargetRegisterInfo *TRI = getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo(); if (RegInfo && !RegInfo->livein_empty()) { OS << "Function Live Ins: "; @@ -459,7 +462,7 @@ unsigned MachineFunction::addLiveIn(unsigned PReg, /// normal 'L' label is returned. MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, bool isLinkerPrivate) const { - const DataLayout *DL = getTarget().getDataLayout(); + const DataLayout *DL = getSubtarget().getDataLayout(); assert(JumpTableInfo && "No jump tables"); assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!"); @@ -474,7 +477,7 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, /// getPICBaseSymbol - Return a function-local symbol to represent the PIC /// base. 
MCSymbol *MachineFunction::getPICBaseSymbol() const { - const DataLayout *DL = getTarget().getDataLayout(); + const DataLayout *DL = getSubtarget().getDataLayout(); return Ctx.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+ Twine(getFunctionNumber())+"$pb"); } @@ -483,15 +486,11 @@ MCSymbol *MachineFunction::getPICBaseSymbol() const { // MachineFrameInfo implementation //===----------------------------------------------------------------------===// -const TargetFrameLowering *MachineFrameInfo::getFrameLowering() const { - return TM.getFrameLowering(); -} - /// ensureMaxAlignment - Make sure the function is at least Align bytes /// aligned. void MachineFrameInfo::ensureMaxAlignment(unsigned Align) { - if (!getFrameLowering()->isStackRealignable() || !RealignOption) - assert(Align <= getFrameLowering()->getStackAlignment() && + if (!StackRealignable || !RealignOption) + assert(Align <= StackAlignment && "For targets without stack realignment, Align is out of limit!"); if (MaxAlignment < Align) MaxAlignment = Align; } @@ -513,11 +512,10 @@ static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align, int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, const AllocaInst *Alloca) { assert(Size != 0 && "Cannot allocate zero size stack objects!"); - Alignment = - clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Alignment, getFrameLowering()->getStackAlignment()); - Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca)); + Alignment = clampStackAlignment(!StackRealignable || !RealignOption, + Alignment, StackAlignment); + Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca, + !isSS)); int Index = (int)Objects.size() - NumFixedObjects - 1; assert(Index >= 0 && "Bad frame index!"); ensureMaxAlignment(Alignment); @@ -530,9 +528,8 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment, /// int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, unsigned Alignment) { - Alignment = clampStackAlignment( - !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment, - getFrameLowering()->getStackAlignment()); + Alignment = clampStackAlignment(!StackRealignable || !RealignOption, + Alignment, StackAlignment); CreateStackObject(Size, Alignment, true); int Index = (int)Objects.size() - NumFixedObjects - 1; ensureMaxAlignment(Alignment); @@ -547,10 +544,9 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, const AllocaInst *Alloca) { HasVarSizedObjects = true; - Alignment = clampStackAlignment( - !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment, - getFrameLowering()->getStackAlignment()); - Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca)); + Alignment = clampStackAlignment(!StackRealignable || !RealignOption, + Alignment, StackAlignment); + Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true)); ensureMaxAlignment(Alignment); return (int)Objects.size()-NumFixedObjects-1; } @@ -561,20 +557,18 @@ int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment, /// index with a negative value. /// int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable) { + bool Immutable, bool isAliased) { assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); // The alignment of the frame index can be determined from its offset from // the incoming frame position. 
If the frame object is at offset 32 and // the stack is guaranteed to be 16-byte aligned, then we know that the // object is 16-byte aligned. - unsigned StackAlign = getFrameLowering()->getStackAlignment(); - unsigned Align = MinAlign(SPOffset, StackAlign); - Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Align, getFrameLowering()->getStackAlignment()); + unsigned Align = MinAlign(SPOffset, StackAlignment); + Align = clampStackAlignment(!StackRealignable || !RealignOption, Align, + StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, /*isSS*/ false, - /*Alloca*/ nullptr)); + /*Alloca*/ nullptr, isAliased)); return -++NumFixedObjects; } @@ -582,25 +576,32 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, /// on the stack. Returns an index with a negative value. int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset) { - unsigned StackAlign = getFrameLowering()->getStackAlignment(); - unsigned Align = MinAlign(SPOffset, StackAlign); - Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() || - !RealignOption, - Align, getFrameLowering()->getStackAlignment()); + unsigned Align = MinAlign(SPOffset, StackAlignment); + Align = clampStackAlignment(!StackRealignable || !RealignOption, Align, + StackAlignment); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, /*Immutable*/ true, /*isSS*/ true, - /*Alloca*/ nullptr)); + /*Alloca*/ nullptr, + /*isAliased*/ false)); return -++NumFixedObjects; } +int MachineFrameInfo::CreateFrameAllocation(uint64_t Size) { + // Force the use of a frame pointer. The intention is that this intrinsic be + // used in conjunction with unwind mechanisms that leak the frame pointer. + setFrameAddressIsTaken(true); + Size = RoundUpToAlignment(Size, StackAlignment); + return CreateStackObject(Size, StackAlignment, false); +} + BitVector MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const { assert(MBB && "MBB must be valid"); const MachineFunction *MF = MBB->getParent(); assert(MF && "MBB must be part of a MachineFunction"); const TargetMachine &TM = MF->getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); BitVector BV(TRI->getNumRegs()); // Before CSI is calculated, no registers are considered pristine. They can be @@ -625,8 +626,8 @@ MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const { } unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); unsigned MaxAlign = getMaxAlignment(); int Offset = 0; @@ -676,7 +677,7 @@ unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const { void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ if (Objects.empty()) return; - const TargetFrameLowering *FI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering(); int ValOffset = (FI ? 
FI->getOffsetOfLocalArea() : 0); OS << "Frame Objects:\n"; @@ -820,7 +821,7 @@ void MachineJumpTableInfo::dump() const { print(dbgs()); } void MachineConstantPoolValue::anchor() { } const DataLayout *MachineConstantPool::getDataLayout() const { - return TM.getDataLayout(); + return TM.getSubtargetImpl()->getDataLayout(); } Type *MachineConstantPoolEntry::getType() const { diff --git a/lib/CodeGen/MachineFunctionAnalysis.cpp b/lib/CodeGen/MachineFunctionAnalysis.cpp index 46cd60ac613e..f6f34ba9d927 100644 --- a/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -46,8 +46,7 @@ bool MachineFunctionAnalysis::doInitialization(Module &M) { bool MachineFunctionAnalysis::runOnFunction(Function &F) { assert(!MF && "MachineFunctionAnalysis already initialized!"); MF = new MachineFunction(&F, TM, NextFnNum++, - getAnalysis<MachineModuleInfo>(), - getAnalysisIfAvailable<GCModuleInfo>()); + getAnalysis<MachineModuleInfo>()); return false; } diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp index dee3977cd080..790f5accdb26 100644 --- a/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -52,7 +52,7 @@ char MachineFunctionPrinterPass::ID = 0; } char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID; -INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs", +INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer", "Machine Function Printer", false, false) namespace llvm { diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 5122165e3659..968ec2c68e8c 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -39,6 +39,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -105,23 +106,41 @@ void MachineOperand::setIsDef(bool Val) { IsDef = Val; } +// If this operand is currently a register operand, and if this is in a +// function, deregister the operand from the register's use/def list. +void MachineOperand::removeRegFromUses() { + if (!isReg() || !isOnRegUseList()) + return; + + if (MachineInstr *MI = getParent()) { + if (MachineBasicBlock *MBB = MI->getParent()) { + if (MachineFunction *MF = MBB->getParent()) + MF->getRegInfo().removeRegOperandFromUseList(this); + } + } +} + /// ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. If an operand is known to be an immediate already, /// the setImm method should be used. void MachineOperand::ChangeToImmediate(int64_t ImmVal) { assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm"); - // If this operand is currently a register operand, and if this is in a - // function, deregister the operand from the register's use/def list. 
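// The hunk just below hoists a four-deep if-ladder into the guarded
// removeRegFromUses helper. The same guard-clause shape, reduced to a
// standalone sketch (toy types, not the LLVM classes):

#include <cassert>

struct Func  { int RemovedCount; };
struct Block { Func *Parent; };
struct Instr { Block *Parent; };

struct Operand {
  Instr *Parent;
  bool IsReg;

  // Early returns replace the nested ifs: bail out at the first missing
  // link, otherwise reach the owning function in a straight line.
  void removeRegFromUses() {
    if (!IsReg || !Parent || !Parent->Parent || !Parent->Parent->Parent)
      return;
    ++Parent->Parent->Parent->RemovedCount;
  }
};

int main() {
  Func F = {0};
  Block B = {&F};
  Instr I = {&B};
  Operand Op = {&I, true};    // fully parented register operand
  Op.removeRegFromUses();
  Operand Orphan = {nullptr, true};
  Orphan.removeRegFromUses(); // orphaned operand: no-op via the guards
  assert(F.RemovedCount == 1);
}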
- if (isReg() && isOnRegUseList()) - if (MachineInstr *MI = getParent()) - if (MachineBasicBlock *MBB = MI->getParent()) - if (MachineFunction *MF = MBB->getParent()) - MF->getRegInfo().removeRegOperandFromUseList(this); + + removeRegFromUses(); OpKind = MO_Immediate; Contents.ImmVal = ImmVal; } +void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) { + assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm"); + + removeRegFromUses(); + + OpKind = MO_FPImmediate; + Contents.CFP = FPImm; +} + /// ChangeToRegister - Replace this operand with a new register operand of /// the specified value. If an operand is known to be a register already, /// the setReg method should be used. @@ -265,7 +284,8 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { if (const MachineBasicBlock *MBB = MI->getParent()) if (const MachineFunction *MF = MBB->getParent()) TM = &MF->getTarget(); - const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr; + const TargetRegisterInfo *TRI = + TM ? TM->getSubtargetImpl()->getRegisterInfo() : nullptr; switch (getType()) { case MachineOperand::MO_Register: @@ -377,7 +397,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { break; case MachineOperand::MO_Metadata: OS << '<'; - getMetadata()->printAsOperand(OS, /*PrintType=*/false); + getMetadata()->printAsOperand(OS); OS << '>'; break; case MachineOperand::MO_MCSymbol: @@ -429,11 +449,11 @@ MachinePointerInfo MachinePointerInfo::getStack(int64_t Offset) { MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f, uint64_t s, unsigned int a, - const MDNode *TBAAInfo, + const AAMDNodes &AAInfo, const MDNode *Ranges) : PtrInfo(ptrinfo), Size(s), Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)), - TBAAInfo(TBAAInfo), Ranges(Ranges) { + AAInfo(AAInfo), Ranges(Ranges) { assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue*>() || isa<PointerType>(PtrInfo.V.get<const Value*>()->getType())) && "invalid pointer value"); @@ -514,10 +534,38 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) { OS << "(align=" << MMO.getAlignment() << ")"; // Print TBAA info. - if (const MDNode *TBAAInfo = MMO.getTBAAInfo()) { + if (const MDNode *TBAAInfo = MMO.getAAInfo().TBAA) { OS << "(tbaa="; if (TBAAInfo->getNumOperands() > 0) - TBAAInfo->getOperand(0)->printAsOperand(OS, /*PrintType=*/false); + TBAAInfo->getOperand(0)->printAsOperand(OS); else OS << "<unknown>"; OS << ")"; } + + // Print AA scope info. + if (const MDNode *ScopeInfo = MMO.getAAInfo().Scope) { + OS << "(alias.scope="; + if (ScopeInfo->getNumOperands() > 0) + for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) { + ScopeInfo->getOperand(i)->printAsOperand(OS); + if (i != ie-1) + OS << ","; + } + else + OS << "<unknown>"; + OS << ")"; + } + + // Print AA noalias scope info.
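// The MachineMemOperand constructor above packs the alignment into the
// high bits of Flags as log2(align) + 1. A self-contained model of that
// encoding (the MOMaxBits width is an assumed value, not the real one;
// the noalias scope printing continues below):

#include <cassert>
#include <cstdint>

static const unsigned MOMaxBits = 5; // assumed width of the flag field

static uint32_t packAlignIntoFlags(uint32_t Flags, uint32_t Align) {
  uint32_t Log2 = 0;
  while ((1u << (Log2 + 1)) <= Align) // integer log2 of a power of two
    ++Log2;
  return (Flags & ((1u << MOMaxBits) - 1)) | ((Log2 + 1) << MOMaxBits);
}

static uint32_t alignFromFlags(uint32_t Packed) {
  // Decoding (1 << (log2 + 1)) >> 1 undoes the +1 bias in the encoding.
  return (1u << (Packed >> MOMaxBits)) >> 1;
}

int main() {
  uint32_t P = packAlignIntoFlags(/*Flags=*/0x3, /*Align=*/16);
  assert(alignFromFlags(P) == 16);
  assert((P & ((1u << MOMaxBits) - 1)) == 0x3); // flag bits are intact
}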
+ if (const MDNode *NoAliasInfo = MMO.getAAInfo().NoAlias) { + OS << "(noalias="; + if (NoAliasInfo->getNumOperands() > 0) + for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) { + NoAliasInfo->getOperand(i)->printAsOperand(OS); + if (i != ie-1) + OS << ","; + } else OS << "<unknown>"; OS << ")"; @@ -551,6 +599,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0), AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr), debugLoc(dl) { + assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); + // Reserve space for the expected number of operands. if (unsigned NumOps = MCID->getNumOperands() + MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) { @@ -569,6 +619,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc()) { + assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); + CapOperands = OperandCapacity::get(MI.getNumOperands()); Operands = MF.allocateOperandArray(CapOperands); @@ -865,6 +917,27 @@ void MachineInstr::eraseFromParent() { getParent()->erase(this); } +void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() { + assert(getParent() && "Not embedded in a basic block!"); + MachineBasicBlock *MBB = getParent(); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Not embedded in a function!"); + + MachineInstr *MI = (MachineInstr *)this; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + MRI.markUsesInDebugValueAsUndef(Reg); + } + MI->eraseFromParent(); +} + void MachineInstr::eraseFromBundle() { assert(getParent() && "Not embedded in a basic block!"); getParent()->erase_instr(this); @@ -1379,7 +1452,7 @@ bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const { // If we have an AliasAnalysis, ask it whether the memory is constant. if (AA && AA->pointsToConstantMemory( AliasAnalysis::Location(V, (*I)->getSize(), - (*I)->getTBAAInfo()))) + (*I)->getAAInfo()))) continue; } @@ -1489,8 +1562,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, OS << " = "; // Print the opcode name. - if (TM && TM->getInstrInfo()) - OS << TM->getInstrInfo()->getName(getOpcode()); + if (TM && TM->getSubtargetImpl()->getInstrInfo()) + OS << TM->getSubtargetImpl()->getInstrInfo()->getName(getOpcode()); else OS << "UNKNOWN"; @@ -1538,17 +1611,17 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, // call instructions much less noisy on targets where calls clobber lots // of registers. Don't rely on MO.isDead() because we may be called before // LiveVariables is run, or we may be looking at a non-allocatable reg. 
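// eraseFromParentAndMarkDBGValuesForRemoval above neutralizes DBG_VALUE
// users before their defining instruction disappears. A toy model of that
// ordering (invented types; only the sequencing mirrors the real code;
// the call-operand printing described above continues below):

#include <cassert>
#include <vector>

struct DbgValue { unsigned Reg; bool Undef; };

// Stand-in for MRI.markUsesInDebugValueAsUndef(Reg).
static void markUndef(std::vector<DbgValue> &DVs, unsigned Reg) {
  for (DbgValue &DV : DVs)
    if (DV.Reg == Reg)
      DV.Undef = true;
}

int main() {
  std::vector<DbgValue> DVs = {{1, false}, {2, false}};
  const std::vector<unsigned> Defs = {1}; // vreg defs of the dying instr
  for (unsigned Reg : Defs) // step 1: mark debug users undef...
    markUndef(DVs, Reg);
  // step 2: ...only then erase the instruction itself
  assert(DVs[0].Undef && !DVs[1].Undef);
}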
- if (MF && isCall() && + if (MRI && isCall() && MO.isReg() && MO.isImplicit() && MO.isDef()) { unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - const MachineRegisterInfo &MRI = MF->getRegInfo(); - if (MRI.use_empty(Reg)) { + if (MRI->use_empty(Reg)) { bool HasAliasLive = false; - for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true); + for (MCRegAliasIterator AI( + Reg, TM->getSubtargetImpl()->getRegisterInfo(), true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; - if (!MRI.use_empty(AliasReg)) { + if (!MRI->use_empty(AliasReg)) { HasAliasLive = true; break; } @@ -1573,12 +1646,16 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, if (isDebugValue() && MO.isMetadata()) { // Pretty print DBG_VALUE instructions. const MDNode *MD = MO.getMetadata(); - if (const MDString *MDS = dyn_cast<MDString>(MD->getOperand(2))) - OS << "!\"" << MDS->getString() << '\"'; + DIDescriptor DI(MD); + DIVariable DIV(MD); + + if (DI.isVariable() && !DIV.getName().empty()) + OS << "!\"" << DIV.getName() << '\"'; else MO.print(OS, TM); } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) { - OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm()); + OS << TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIndexName( + MO.getImm()); } else if (i == AsmDescOp && MO.isImm()) { // Pretty print the inline asm operand descriptor. OS << '$' << AsmOpCount++; @@ -1595,9 +1672,12 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, unsigned RCID = 0; if (InlineAsm::hasRegClassConstraint(Flag, RCID)) { - if (TM) - OS << ':' << TM->getRegisterInfo()->getRegClass(RCID)->getName(); - else + if (TM) { + const TargetRegisterInfo *TRI = + TM->getSubtargetImpl()->getRegisterInfo(); + OS << ':' + << TRI->getRegClassName(TRI->getRegClass(RCID)); + } else OS << ":RC" << RCID; } @@ -1646,7 +1726,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, if (!HaveSemi) OS << ";"; HaveSemi = true; for (unsigned i = 0; i != VirtRegs.size(); ++i) { const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]); - OS << " " << RC->getName() << ':' << PrintReg(VirtRegs[i]); + OS << " " << MRI->getTargetRegisterInfo()->getRegClassName(RC) + << ':' << PrintReg(VirtRegs[i]); for (unsigned j = i+1; j != VirtRegs.size();) { if (MRI->getRegClass(VirtRegs[j]) != RC) { ++j; @@ -1672,6 +1753,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, OS << " ]"; } } + if (isIndirectDebugValue()) + OS << " indirect"; } else if (!debugLoc.isUnknown() && MF) { if (!HaveSemi) OS << ";"; OS << " dbg:"; @@ -1881,7 +1964,8 @@ void MachineInstr::emitError(StringRef Msg) const { if (getOperand(i-1).isMetadata() && (LocMD = getOperand(i-1).getMetadata()) && LocMD->getNumOperands() != 0) { - if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) { + if (const ConstantInt *CI = + mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) { LocCookie = CI->getZExtValue(); break; } diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index 962169e1b15c..0690f08d495b 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -16,6 +16,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; namespace { @@ -103,12 +104,12 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(FirstMI != LastMI && "Empty bundle?"); 
MIBundleBuilder Bundle(MBB, FirstMI, LastMI); - const TargetMachine &TM = MBB.getParent()->getTarget(); - const TargetInstrInfo *TII = TM.getInstrInfo(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - MachineInstrBuilder MIB = BuildMI(*MBB.getParent(), FirstMI->getDebugLoc(), - TII->get(TargetOpcode::BUNDLE)); + MachineInstrBuilder MIB = + BuildMI(MF, FirstMI->getDebugLoc(), TII->get(TargetOpcode::BUNDLE)); Bundle.prepend(MIB); SmallVector<unsigned, 32> LocalDefs; @@ -140,7 +141,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // Internal def is now killed. KilledDefSet.insert(Reg); } else { - if (ExternUseSet.insert(Reg)) { + if (ExternUseSet.insert(Reg).second) { ExternUses.push_back(Reg); if (MO.isUndef()) UndefUseSet.insert(Reg); @@ -157,7 +158,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, if (!Reg) continue; - if (LocalDefSet.insert(Reg)) { + if (LocalDefSet.insert(Reg).second) { LocalDefs.push_back(Reg); if (MO.isDead()) { DeadDefSet.insert(Reg); @@ -173,7 +174,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, if (!MO.isDead()) { for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { unsigned SubReg = *SubRegs; - if (LocalDefSet.insert(SubReg)) + if (LocalDefSet.insert(SubReg).second) LocalDefs.push_back(SubReg); } } @@ -185,7 +186,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<unsigned, 32> Added; for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { unsigned Reg = LocalDefs[i]; - if (Added.insert(Reg)) { + if (Added.insert(Reg).second) { // If it's not live beyond end of the bundle, mark it dead. bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg); MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) | diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 68d2efdb1d0d..cb14a5cab4b4 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -39,6 +39,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "machine-licm" @@ -48,6 +49,11 @@ AvoidSpeculation("avoid-speculation", cl::desc("MachineLICM should avoid speculation"), cl::init(true), cl::Hidden); +static cl::opt<bool> +HoistCheapInsts("hoist-cheap-insts", + cl::desc("MachineLICM should hoist even cheap instructions"), + cl::init(false), cl::Hidden); + STATISTIC(NumHoisted, "Number of machine instructions hoisted out of loops"); STATISTIC(NumLowRP, @@ -61,7 +67,6 @@ STATISTIC(NumPostRAHoisted, namespace { class MachineLICM : public MachineFunctionPass { - const TargetMachine *TM; const TargetInstrInfo *TII; const TargetLoweringBase *TLI; const TargetRegisterInfo *TRI; @@ -142,9 +147,6 @@ namespace { RegPressure.clear(); RegLimit.clear(); BackTrace.clear(); - for (DenseMap<unsigned,std::vector<const MachineInstr*> >::iterator - CI = CSEMap.begin(), CE = CSEMap.end(); CI != CE; ++CI) - CI->second.clear(); CSEMap.clear(); } @@ -324,13 +326,12 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { return false; Changed = FirstInLoop = false; - TM = &MF.getTarget(); - TII = TM->getInstrInfo(); - TLI = TM->getTargetLowering(); - TRI = TM->getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TLI = MF.getSubtarget().getTargetLowering(); + TRI = 
MF.getSubtarget().getRegisterInfo(); MFI = MF.getFrameInfo(); MRI = &MF.getRegInfo(); - InstrItins = TM->getInstrItineraryData(); + InstrItins = MF.getSubtarget().getInstrItineraryData(); PreRegAlloc = MRI->isSSA(); @@ -822,7 +823,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - bool isNew = RegSeen.insert(Reg); + bool isNew = RegSeen.insert(Reg).second; unsigned RCId, RCCost; getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost); if (MO.isDef()) @@ -854,7 +855,7 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - bool isNew = RegSeen.insert(Reg); + bool isNew = RegSeen.insert(Reg).second; if (MO.isDef()) Defs.push_back(Reg); else if (!isNew && isOperandKill(MO, MRI)) { @@ -1039,7 +1040,7 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, /// IsCheapInstruction - Return true if the instruction is marked "cheap" or /// the operand latency between its def and a use is one or less. bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { - if (MI.isAsCheapAsAMove() || MI.isCopyLike()) + if (TII->isAsCheapAsAMove(&MI) || MI.isCopyLike()) return true; if (!InstrItins || InstrItins->isEmpty()) return false; @@ -1079,7 +1080,7 @@ bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost, // Don't hoist cheap instructions if they would increase register pressure, // even if we're under the limit. - if (CheapInstr) + if (CheapInstr && !HoistCheapInsts) return true; for (unsigned i = BackTrace.size(); i != 0; --i) { @@ -1299,15 +1300,7 @@ void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) { const MachineInstr *MI = &*I; unsigned Opcode = MI->getOpcode(); - DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator - CI = CSEMap.find(Opcode); - if (CI != CSEMap.end()) - CI->second.push_back(MI); - else { - std::vector<const MachineInstr*> CSEMIs; - CSEMIs.push_back(MI); - CSEMap.insert(std::make_pair(Opcode, CSEMIs)); - } + CSEMap[Opcode].push_back(MI); } } @@ -1447,11 +1440,8 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // Add to the CSE map. if (CI != CSEMap.end()) CI->second.push_back(MI); - else { - std::vector<const MachineInstr*> CSEMIs; - CSEMIs.push_back(MI); - CSEMap.insert(std::make_pair(Opcode, CSEMIs)); - } + else + CSEMap[Opcode].push_back(MI); } ++NumHoisted; diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 4976e35dba82..32d511285eb2 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -270,11 +270,10 @@ MachineModuleInfo::~MachineModuleInfo() { bool MachineModuleInfo::doInitialization(Module &M) { ObjFileMMI = nullptr; - CompactUnwindEncoding = 0; CurCallSite = 0; CallsEHReturn = 0; CallsUnwindInit = 0; - DbgInfoAvailable = UsesVAFloatArgument = false; + DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; // Always emit some info, by default "no personality" info. 
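// The MachineLICM hunks above collapse a find-or-insert dance into
// CSEMap[Opcode].push_back(MI): map operator[] value-initializes the
// vector on first touch. The same idiom with the standard library
// (the personality setup resumes below):

#include <cassert>
#include <map>
#include <vector>

int main() {
  std::map<unsigned, std::vector<int>> CSEMap;
  // The first touch of key 42 default-constructs an empty vector, so both
  // branches of the old "find, else insert" code become one statement.
  CSEMap[42].push_back(7);
  CSEMap[42].push_back(8);
  assert(CSEMap.size() == 1 && CSEMap[42].size() == 2);
}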
Personalities.push_back(nullptr); AddrLabelSymbols = nullptr; @@ -312,7 +311,6 @@ void MachineModuleInfo::EndFunction() { FilterEnds.clear(); CallsEHReturn = 0; CallsUnwindInit = 0; - CompactUnwindEncoding = 0; VariableDbgInfos.clear(); } @@ -429,7 +427,7 @@ void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad, /// void MachineModuleInfo:: addCatchTypeInfo(MachineBasicBlock *LandingPad, - ArrayRef<const GlobalVariable *> TyInfo) { + ArrayRef<const GlobalValue *> TyInfo) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); for (unsigned N = TyInfo.size(); N; --N) LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1])); @@ -439,7 +437,7 @@ addCatchTypeInfo(MachineBasicBlock *LandingPad, /// void MachineModuleInfo:: addFilterTypeInfo(MachineBasicBlock *LandingPad, - ArrayRef<const GlobalVariable *> TyInfo) { + ArrayRef<const GlobalValue *> TyInfo) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); std::vector<unsigned> IdsInFilter(TyInfo.size()); for (unsigned I = 0, E = TyInfo.size(); I != E; ++I) @@ -454,6 +452,14 @@ void MachineModuleInfo::addCleanup(MachineBasicBlock *LandingPad) { LP.TypeIds.push_back(0); } +MCSymbol * +MachineModuleInfo::addClauseForLandingPad(MachineBasicBlock *LandingPad) { + MCSymbol *ClauseLabel = Context.CreateTempSymbol(); + LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); + LP.ClauseLabels.push_back(ClauseLabel); + return ClauseLabel; +} + /// TidyLandingPads - Remap landing pad labels and remove any deleted landing /// pads. void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) { @@ -508,7 +514,7 @@ void MachineModuleInfo::setCallSiteLandingPad(MCSymbol *Sym, /// getTypeIDFor - Return the type id for the specified typeinfo. This is /// function wide. -unsigned MachineModuleInfo::getTypeIDFor(const GlobalVariable *TI) { +unsigned MachineModuleInfo::getTypeIDFor(const GlobalValue *TI) { for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i) if (TypeInfos[i] == TI) return i + 1; diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp index c6b6802b0eb1..01d2c2eb56fe 100644 --- a/lib/CodeGen/MachineRegionInfo.cpp +++ b/lib/CodeGen/MachineRegionInfo.cpp @@ -1,8 +1,10 @@ #include "llvm/CodeGen/MachineRegionInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" +#include "llvm/CodeGen/MachinePostDominators.h" + +#define DEBUG_TYPE "region" using namespace llvm; diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index f560259a98d9..86bb34be3fe8 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -16,28 +16,23 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; // Pin the vtable to this file. void MachineRegisterInfo::Delegate::anchor() {} -MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM) - : TM(TM), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) { +MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) + : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true), + TracksSubRegLiveness(false) { VRegInfo.reserve(256); RegAllocHints.reserve(256); UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); // Create the physreg use/def lists. 
- PhysRegUseDefLists = - new MachineOperand*[getTargetRegisterInfo()->getNumRegs()]; - memset(PhysRegUseDefLists, 0, - sizeof(MachineOperand*)*getTargetRegisterInfo()->getNumRegs()); -} - -MachineRegisterInfo::~MachineRegisterInfo() { - delete [] PhysRegUseDefLists; + PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr); } /// setRegClass - Set the register class of the specified virtual register. @@ -67,7 +62,7 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg, bool MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) { - const TargetInstrInfo *TII = TM.getInstrInfo(); + const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo(); const TargetRegisterClass *OldRC = getRegClass(Reg); const TargetRegisterClass *NewRC = getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC); @@ -134,6 +129,7 @@ void MachineRegisterInfo::verifyUseList(unsigned Reg) const { << " use list MachineOperand " << MO << " has no parent instruction.\n"; Valid = false; + continue; } MachineOperand *MO0 = &MI->getOperand(0); unsigned NumOps = MI->getNumOperands(); @@ -283,18 +279,25 @@ void MachineRegisterInfo::moveOperands(MachineOperand *Dst, /// replaceRegWith - Replace all instances of FromReg with ToReg in the /// machine function. This is like llvm-level X->replaceAllUsesWith(Y), /// except that it also changes any definitions of the register as well. +/// If ToReg is a physical register we apply the sub register to obtain the +/// final/proper physical register. void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) { assert(FromReg != ToReg && "Cannot replace a reg with itself"); + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + // TODO: This could be more efficient by bulk changing the operands. for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) { MachineOperand &O = *I; ++I; - O.setReg(ToReg); + if (TargetRegisterInfo::isPhysicalRegister(ToReg)) { + O.substPhysReg(ToReg, *TRI); + } else { + O.setReg(ToReg); + } } } - /// getVRegDef - Return the machine instr that defines the specified virtual /// register or null if none is found. This assumes that the code is in SSA /// form, so there should only be one definition. @@ -390,6 +393,14 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, } } +unsigned MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const +{ + // Lane masks are only defined for vregs. 
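// The MachineRegisterInfo hunks above drop a manual new[]/memset pair and
// the matching delete[] destructor in favor of a single resize. A minimal
// illustration of why the container version needs no cleanup code
// (the lane-mask helper continues below):

#include <cassert>
#include <vector>

struct OperandStub; // stand-in for MachineOperand

int main() {
  const unsigned NumRegs = 128;
  std::vector<OperandStub *> UseDefLists;
  // One call zero-fills every slot, and the vector frees itself on scope
  // exit: new[], memset, and delete[] all disappear.
  UseDefLists.resize(NumRegs, nullptr);
  assert(UseDefLists.size() == NumRegs && UseDefLists[0] == nullptr);
}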
+ assert(TargetRegisterInfo::isVirtualRegister(Reg)); + const TargetRegisterClass &TRC = *getRegClass(Reg); + return TRC.getLaneMask(); +} + #ifndef NDEBUG void MachineRegisterInfo::dumpUses(unsigned Reg) const { for (MachineInstr &I : use_instructions(Reg)) diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp index d9173a27dc41..71a6ebaba243 100644 --- a/lib/CodeGen/MachineSSAUpdater.cpp +++ b/lib/CodeGen/MachineSSAUpdater.cpp @@ -24,8 +24,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/SSAUpdaterImpl.h" using namespace llvm; @@ -39,7 +39,7 @@ static AvailableValsTy &getAvailableVals(void *AV) { MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF, SmallVectorImpl<MachineInstr*> *NewPHI) : AV(nullptr), InsertedPHIs(NewPHI) { - TII = MF.getTarget().getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); } diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 44191f785386..9fe23c5b2272 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -40,6 +40,9 @@ cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, cl::desc("Force top-down list scheduling")); cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, cl::desc("Force bottom-up list scheduling")); +cl::opt<bool> +DumpCriticalPathLength("misched-dcpl", cl::Hidden, + cl::desc("Print critical path length to stdout")); } #ifndef NDEBUG @@ -141,12 +144,12 @@ char MachineScheduler::ID = 0; char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, "misched", +INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", "Machine Instruction Scheduler", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachineScheduler, "misched", +INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() @@ -378,7 +381,7 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI, /// Main driver for both MachineScheduler and PostMachineScheduler. void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); bool IsPostRA = Scheduler.isPostRA(); // Visit all machine basic blocks. @@ -427,9 +430,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { // instruction stream until we find the nearest boundary. unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) { + for(;I != MBB->begin(); --I, --RemainingInstrs) { if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA)) break; + if (!I->isDebugValue()) + ++NumRegionInstrs; } // Notify the scheduler of the region, even if we may skip scheduling // it. Perhaps it still needs to be bundled. 
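// The scheduleRegions change above stops counting DBG_VALUEs toward
// NumRegionInstrs, so builds with and without debug info carve identical
// scheduling regions. A toy version of the counting rule:

#include <cassert>
#include <vector>

struct Instr { bool IsDebugValue; };

static unsigned countRegionInstrs(const std::vector<Instr> &Region) {
  unsigned N = 0;
  for (const Instr &I : Region)
    if (!I.IsDebugValue) // debug values must not change region size
      ++N;
  return N;
}

int main() {
  const std::vector<Instr> Region = {{false}, {true}, {false}, {true}};
  assert(countRegionInstrs(Region) == 2);
}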
@@ -451,6 +456,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { else dbgs() << "End"; dbgs() << " RegionInstrs: " << NumRegionInstrs << " Remaining: " << RemainingInstrs << "\n"); + if (DumpCriticalPathLength) { + errs() << MF->getName(); + errs() << ":BB# " << MBB->getNumber(); + errs() << " " << MBB->getName() << " \n"; + } // Schedule a region: possibly reorder instructions. // This invalidates 'RegionEnd' and 'I'. @@ -2355,14 +2365,15 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) { // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or // are disabled, then these HazardRecs will be disabled. const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); - const TargetMachine &TM = DAG->MF.getTarget(); if (!Top.HazardRec) { Top.HazardRec = - TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); + DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( + Itin, DAG); } if (!Bot.HazardRec) { Bot.HazardRec = - TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); + DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( + Itin, DAG); } } @@ -2370,8 +2381,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) { void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) { - const TargetMachine &TM = Context->MF->getTarget(); - const TargetLowering *TLI = TM.getTargetLowering(); + const MachineFunction &MF = *Begin->getParent()->getParent(); + const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); // Avoid setting up the register pressure tracker for small regions to save // compile time. As a rough heuristic, only track pressure when the number of @@ -2391,8 +2402,8 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, RegionPolicy.OnlyBottomUp = true; // Allow the subtarget to override default policy. - const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); - ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs); + MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Begin, End, + NumRegionInstrs); // After subtarget overrides, apply command line options. if (!EnableRegPressure) @@ -2460,7 +2471,10 @@ void GenericScheduler::registerRoots() { if ((*I)->getDepth() > Rem.CriticalPath) Rem.CriticalPath = (*I)->getDepth(); } - DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); + DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); + if (DumpCriticalPathLength) { + errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; + } if (EnableCyclicPath) { Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); @@ -2482,8 +2496,8 @@ static bool tryPressure(const PressureChange &TryP, } // If one candidate decreases and the other increases, go with it. // Invalid candidates have UnitInc==0. - if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, - Reason)) { + if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, + Reason)) { return true; } // If the candidates are decreasing pressure, reverse priority. @@ -2885,10 +2899,10 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { // Initialize the HazardRecognizers. If itineraries don't exist, are empty, // or are disabled, then these HazardRecs will be disabled. 
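// DumpCriticalPathLength above is the standard cl::opt recipe: a hidden
// boolean switch read like a plain bool at the use site. A minimal
// standalone sketch of the same recipe (assumes an LLVM tree of this
// vintage to build against; the flag name here is invented; the hazard
// recognizer setup resumes below):

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

static llvm::cl::opt<bool>
DumpStats("dump-stats", llvm::cl::Hidden,
          llvm::cl::desc("Print statistics while running"),
          llvm::cl::init(false));

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  if (DumpStats) // reads exactly like a plain bool
    llvm::errs() << "stats enabled\n";
  return 0;
}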
const InstrItineraryData *Itin = SchedModel->getInstrItineraries(); - const TargetMachine &TM = DAG->MF.getTarget(); if (!Top.HazardRec) { Top.HazardRec = - TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); + DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( + Itin, DAG); } } @@ -2902,7 +2916,10 @@ void PostGenericScheduler::registerRoots() { if ((*I)->getDepth() > Rem.CriticalPath) Rem.CriticalPath = (*I)->getDepth(); } - DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); + DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); + if (DumpCriticalPathLength) { + errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n"; + } } /// Apply a set of heuristics to a new candidate for PostRA scheduling. diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index f44e4d1eaabd..8337793d960d 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -17,18 +17,21 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "machine-sink" @@ -38,6 +41,12 @@ SplitEdges("machine-sink-split", cl::desc("Split critical edges during machine sinking"), cl::init(true), cl::Hidden); +static cl::opt<bool> +UseBlockFreqInfo("machine-sink-bfi", + cl::desc("Use block frequency info to find successors to sink"), + cl::init(true), cl::Hidden); + + STATISTIC(NumSunk, "Number of machine instructions sunk"); STATISTIC(NumSplit, "Number of critical edges split"); STATISTIC(NumCoalesces, "Number of copies coalesced"); @@ -46,14 +55,20 @@ namespace { class MachineSinking : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; // Machine register information - MachineDominatorTree *DT; // Machine dominator tree + MachineRegisterInfo *MRI; // Machine register information + MachineDominatorTree *DT; // Machine dominator tree + MachinePostDominatorTree *PDT; // Machine post dominator tree MachineLoopInfo *LI; + const MachineBlockFrequencyInfo *MBFI; AliasAnalysis *AA; // Remember which edges have been considered for breaking. SmallSet<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 8> CEBCandidates; + // Remember which edges we are about to split. + // This is different from CEBCandidates since those edges + // will be split.
+ SetVector<std::pair<MachineBasicBlock*,MachineBasicBlock*> > ToSplit; public: static char ID; // Pass identification @@ -68,9 +83,13 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<AliasAnalysis>(); AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.addPreserved<MachineLoopInfo>(); + if (UseBlockFreqInfo) + AU.addRequired<MachineBlockFrequencyInfo>(); } void releaseMemory() override { @@ -82,10 +101,22 @@ namespace { bool isWorthBreakingCriticalEdge(MachineInstr *MI, MachineBasicBlock *From, MachineBasicBlock *To); - MachineBasicBlock *SplitCriticalEdge(MachineInstr *MI, - MachineBasicBlock *From, - MachineBasicBlock *To, - bool BreakPHIEdge); + /// \brief Postpone the splitting of the given critical + /// edge (\p From, \p To). + /// + /// We do not split the edges on the fly. Indeed, this invalidates + /// the dominance information and thus triggers a lot of updates + /// of that information underneath. + /// Instead, we postpone all the splits after each iteration of + /// the main loop. That way, the information is at least valid + /// for the lifetime of an iteration. + /// + /// \return True if the edge is marked as toSplit, false otherwise. + /// False can be returned if, for instance, this is not profitable. + bool PostponeSplitCriticalEdge(MachineInstr *MI, + MachineBasicBlock *From, + MachineBasicBlock *To, + bool BreakPHIEdge); bool SinkInstruction(MachineInstr *MI, bool &SawStore); bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB, MachineBasicBlock *DefMBB, @@ -135,6 +166,11 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr *MI, DEBUG(dbgs() << "*** to: " << *MI); MRI->replaceRegWith(DstReg, SrcReg); MI->eraseFromParent(); + + // Conservatively, clear any kill flags, since it's possible that they are no + // longer correct. + MRI->clearKillFlags(SrcReg); + ++NumCoalesces; return true; } @@ -213,12 +249,13 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "******** Machine Sinking ********\n"); - const TargetMachine &TM = MF.getTarget(); - TII = TM.getInstrInfo(); - TRI = TM.getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); DT = &getAnalysis<MachineDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTree>(); LI = &getAnalysis<MachineLoopInfo>(); + MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr; AA = &getAnalysis<AliasAnalysis>(); bool EverMadeChange = false; @@ -228,10 +265,24 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { // Process all basic blocks. CEBCandidates.clear(); + ToSplit.clear(); for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); + // If we have anything we marked as toSplit, split it now. + for (auto &Pair : ToSplit) { + auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, this); + if (NewSucc != nullptr) { + DEBUG(dbgs() << " *** Splitting critical edge:" + " BB#" << Pair.first->getNumber() + << " -- BB#" << NewSucc->getNumber() + << " -- BB#" << Pair.second->getNumber() << '\n'); + MadeChange = true; + ++NumSplit; + } else + DEBUG(dbgs() << " *** Not legal to break critical edge\n"); + } // If this iteration over the code changed anything, keep iterating. 
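// PostponeSplitCriticalEdge above only records the edge; the actual CFG
// surgery runs after the whole iteration, so dominance info stays valid
// while the pass is still querying it. A standalone model of the
// collect-then-apply pattern (toy edge type; the loop exit test follows):

#include <cassert>
#include <set>
#include <utility>
#include <vector>

typedef std::pair<int, int> Edge; // (From, To) block numbers

int main() {
  // Phase 1: analysis records candidates without mutating the CFG, so
  // every dominator-tree query made meanwhile remains trustworthy.
  std::set<Edge> ToSplit;
  ToSplit.insert(Edge(1, 3));
  ToSplit.insert(Edge(1, 3)); // duplicates collapse, as with SetVector
  ToSplit.insert(Edge(2, 3));

  // Phase 2: apply all recorded splits at once, after the iteration.
  std::vector<Edge> Applied(ToSplit.begin(), ToSplit.end());
  assert(Applied.size() == 2);
}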
if (!MadeChange) break; EverMadeChange = true; } @@ -289,10 +340,10 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI, // If the pass has already considered breaking this edge (during this pass // through the function), then let's go ahead and break it. This means // sinking multiple "cheap" instructions into the same block. - if (!CEBCandidates.insert(std::make_pair(From, To))) + if (!CEBCandidates.insert(std::make_pair(From, To)).second) return true; - if (!MI->isCopy() && !MI->isAsCheapAsAMove()) + if (!MI->isCopy() && !TII->isAsCheapAsAMove(MI)) return true; // MI is cheap, we probably don't want to break the critical edge for it. @@ -328,21 +379,21 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI, return false; } -MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI, - MachineBasicBlock *FromBB, - MachineBasicBlock *ToBB, - bool BreakPHIEdge) { +bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr *MI, + MachineBasicBlock *FromBB, + MachineBasicBlock *ToBB, + bool BreakPHIEdge) { if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB)) - return nullptr; + return false; // Avoid breaking back edge. From == To means backedge for single BB loop. if (!SplitEdges || FromBB == ToBB) - return nullptr; + return false; // Check for backedges of more "complex" loops. if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) && LI->isLoopHeader(ToBB)) - return nullptr; + return false; // It's not always legal to break critical edges and sink the computation // to the edge. @@ -389,11 +440,13 @@ MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI, if (*PI == FromBB) continue; if (!DT->dominates(ToBB, *PI)) - return nullptr; + return false; } } - return FromBB->SplitCriticalEdge(ToBB, this); + ToSplit.insert(std::make_pair(FromBB, ToBB)); + + return true; } static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) { @@ -419,23 +472,6 @@ static void collectDebugValues(MachineInstr *MI, } } -/// isPostDominatedBy - Return true if A is post dominated by B. -static bool isPostDominatedBy(MachineBasicBlock *A, MachineBasicBlock *B) { - - // FIXME - Use real post dominator. - if (A->succ_size() != 2) - return false; - MachineBasicBlock::succ_iterator I = A->succ_begin(); - if (B == *I) - ++I; - MachineBasicBlock *OtherSuccBlock = *I; - if (OtherSuccBlock->succ_size() != 1 || - *(OtherSuccBlock->succ_begin()) != B) - return false; - - return true; -} - /// isProfitableToSinkTo - Return true if it is profitable to sink MI. bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI, MachineBasicBlock *MBB, @@ -447,8 +483,13 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI, return false; // It is profitable if SuccToSinkTo does not post dominate current block. - if (!isPostDominatedBy(MBB, SuccToSinkTo)) - return true; + if (!PDT->dominates(SuccToSinkTo, MBB)) + return true; + + // It is profitable to sink an instruction from a deeper loop to a shallower + // loop, even if the latter post-dominates the former (PR21115). + if (LI->getLoopDepth(MBB) > LI->getLoopDepth(SuccToSinkTo)) + return true; // Check if only use in post dominated block is PHI instruction. bool NonPHIUse = false; @@ -463,7 +504,7 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI, // If SuccToSinkTo post dominates then also it may be profitable if MI // can be further profitably sunk into another block in the next round.
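// The new PR21115 rule above in isolation: sinking out of a deeper loop
// pays off even when the target post-dominates the source. A toy decision
// function over plain loop depths (the surrounding hunk continues below):

#include <cassert>

static bool profitableByLoopDepth(unsigned FromDepth, unsigned ToDepth) {
  // Depth 2 -> depth 1 executes the instruction fewer times; equal or
  // greater target depth gains nothing by this rule alone.
  return FromDepth > ToDepth;
}

int main() {
  assert(profitableByLoopDepth(2, 1));  // inner loop -> outer loop
  assert(!profitableByLoopDepth(1, 1)); // same depth: no gain here
  assert(!profitableByLoopDepth(0, 2)); // never favors a deeper target
}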
bool BreakPHIEdge = false; - // FIXME - If finding successor is compile time expensive then catch results. + // FIXME - If finding successor is compile time expensive then cache results. if (MachineBasicBlock *MBB2 = FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge)) return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2); @@ -512,19 +553,6 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg))) return nullptr; - // FIXME: This picks a successor to sink into based on having one - // successor that dominates all the uses. However, there are cases where - // sinking can happen but where the sink point isn't a successor. For - // example: - // - // x = computation - // if () {} else {} - // use x - // - // the instruction could be sunk over the whole diamond for the - // if/then/else (or loop, etc), allowing it to be sunk into other blocks - // after that. - // Virtual register defs can only be sunk if all their uses are in blocks // dominated by one of the successors. if (SuccToSinkTo) { @@ -539,14 +567,37 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, } // Otherwise, we should look at all the successors and decide which one - // we should sink to. - // We give successors with smaller loop depth higher priority. - SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end()); - // Sort Successors according to their loop depth. + // we should sink to. If we have reliable block frequency information + // (frequency != 0) available, give successors with smaller frequencies + // higher priority, otherwise prioritize smaller loop depths. + SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), + MBB->succ_end()); + + // Handle cases where sinking can happen but where the sink point isn't a + // successor. For example: + // + // x = computation + // if () {} else {} + // use x + // + const std::vector<MachineDomTreeNode *> &Children = + DT->getNode(MBB)->getChildren(); + for (const auto &DTChild : Children) + // DomTree children of MBB that have MBB as immediate dominator are added. + if (DTChild->getIDom()->getBlock() == MI->getParent() && + // Skip MBBs already added to the Succs vector above. + !MBB->isSuccessor(DTChild->getBlock())) + Succs.push_back(DTChild->getBlock()); + + // Sort Successors according to their loop depth or block frequency info. std::stable_sort( Succs.begin(), Succs.end(), - [this](const MachineBasicBlock *LHS, const MachineBasicBlock *RHS) { - return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS); + [this](const MachineBasicBlock *L, const MachineBasicBlock *R) { + uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0; + uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0; + bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0; + return HasBlockFreq ? LHSFreq < RHSFreq + : LI->getLoopDepth(L) < LI->getLoopDepth(R); }); for (SmallVectorImpl<MachineBasicBlock *>::iterator SI = Succs.begin(), E = Succs.end(); SI != E; ++SI) { @@ -655,21 +706,16 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { if (!TryBreak) DEBUG(dbgs() << "Sinking along critical edge.\n"); else { - MachineBasicBlock *NewSucc = - SplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge); - if (!NewSucc) { + // Mark this edge as to be split. + // If the edge can actually be split, the next iteration of the main loop + // will sink MI in the newly created block. 
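// FindSuccToSinkTo above orders candidate blocks by profile frequency when
// both counts are reliable, falling back to loop depth otherwise. The
// comparator in isolation (plain structs stand in for MachineBasicBlock;
// the split bookkeeping resumes below):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct Block { uint64_t Freq; unsigned LoopDepth; int Id; };

int main() {
  std::vector<Block> Succs = {{0, 2, 0}, {0, 1, 1}, {0, 3, 2}};
  // stable_sort keeps equally-ranked successors in their original order,
  // so ties never reshuffle between runs.
  std::stable_sort(Succs.begin(), Succs.end(),
                   [](const Block &L, const Block &R) {
                     bool HasFreq = L.Freq != 0 && R.Freq != 0;
                     return HasFreq ? L.Freq < R.Freq
                                    : L.LoopDepth < R.LoopDepth;
                   });
  assert(Succs.front().Id == 1); // shallowest loop first without profile
}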
+ bool Status = + PostponeSplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge); + if (!Status) DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to " - "break critical edge\n"); - return false; - } else { - DEBUG(dbgs() << " *** Splitting critical edge:" - " BB#" << ParentBlock->getNumber() - << " -- BB#" << NewSucc->getNumber() - << " -- BB#" << SuccToSinkTo->getNumber() << '\n'); - SuccToSinkTo = NewSucc; - ++NumSplit; - BreakPHIEdge = false; - } + "break critical edge\n"); + // The instruction will not be sunk this time. + return false; } } @@ -677,20 +723,13 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // BreakPHIEdge is true if all the uses are in the successor MBB being // sunken into and they are all PHI nodes. In this case, machine-sink must // break the critical edge first. - MachineBasicBlock *NewSucc = SplitCriticalEdge(MI, ParentBlock, - SuccToSinkTo, BreakPHIEdge); - if (!NewSucc) { + bool Status = PostponeSplitCriticalEdge(MI, ParentBlock, + SuccToSinkTo, BreakPHIEdge); + if (!Status) DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to " "break critical edge\n"); - return false; - } - - DEBUG(dbgs() << " *** Splitting critical edge:" - " BB#" << ParentBlock->getNumber() - << " -- BB#" << NewSucc->getNumber() - << " -- BB#" << SuccToSinkTo->getNumber() << '\n'); - SuccToSinkTo = NewSucc; - ++NumSplit; + // The instruction will not be sunk this time. + return false; } // Determine where to insert into. Skip phi nodes. diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 1bbf0ade6a58..2cf87eb9104f 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -52,13 +52,13 @@ void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { MF = &Func; - TII = MF->getTarget().getInstrInfo(); - TRI = MF->getTarget().getRegisterInfo(); + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); Loops = &getAnalysis<MachineLoopInfo>(); const TargetSubtargetInfo &ST = MF->getTarget().getSubtarget<TargetSubtargetInfo>(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); + SchedModel.init(ST.getSchedModel(), &ST, TII); BlockInfo.resize(MF->getNumBlockIDs()); ProcResourceCycles.resize(MF->getNumBlockIDs() * SchedModel.getNumProcResourceKinds()); @@ -135,8 +135,7 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const { "getResources() must be called before getProcResourceCycles()"); unsigned PRKinds = SchedModel.getNumProcResourceKinds(); assert((MBBNum+1) * PRKinds <= ProcResourceCycles.size()); - return ArrayRef<unsigned>(ProcResourceCycles.data() + MBBNum * PRKinds, - PRKinds); + return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds); } @@ -256,8 +255,7 @@ MachineTraceMetrics::Ensemble:: getProcResourceDepths(unsigned MBBNum) const { unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size()); - return ArrayRef<unsigned>(ProcResourceDepths.data() + MBBNum * PRKinds, - PRKinds); + return makeArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds); } /// Get an array of processor resource heights for MBB. 
Indexed by processor @@ -270,8 +268,7 @@ MachineTraceMetrics::Ensemble:: getProcResourceHeights(unsigned MBBNum) const { unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size()); - return ArrayRef<unsigned>(ProcResourceHeights.data() + MBBNum * PRKinds, - PRKinds); + return makeArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds); } //===----------------------------------------------------------------------===// @@ -452,7 +449,7 @@ public: } // To is a new block. Mark the block as visited in case the CFG has cycles // that MachineLoopInfo didn't recognize as a natural loop. - return LB.Visited.insert(To); + return LB.Visited.insert(To).second; } }; } @@ -1169,6 +1166,7 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const { return DepCycle; } +/// When bottom is set include instructions in current block in estimate. unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { // Find the limiting processor resource. // Numbers have been pre-scaled to be comparable. @@ -1185,7 +1183,9 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + /// All instructions before current block unsigned Instrs = TBI.InstrDepth; + // plus instructions in current block if (Bottom) Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount; if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) @@ -1194,44 +1194,72 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { return std::max(Instrs, PRMax); } - -unsigned MachineTraceMetrics::Trace:: -getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks, - ArrayRef<const MCSchedClassDesc*> ExtraInstrs) const { +unsigned MachineTraceMetrics::Trace::getResourceLength( + ArrayRef<const MachineBasicBlock *> Extrablocks, + ArrayRef<const MCSchedClassDesc *> ExtraInstrs, + ArrayRef<const MCSchedClassDesc *> RemoveInstrs) const { // Add up resources above and below the center block. 
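// getResourceDepth above (and getResourceLength after it) return whichever
// limit dominates: instruction count divided by issue width, or the
// busiest processor resource in cycles. The core formula, extracted
// (the per-resource summation follows below):

#include <algorithm>
#include <cassert>

static unsigned resourceDepth(unsigned Instrs, unsigned IssueWidth,
                              unsigned PRMaxCycles) {
  if (IssueWidth) // width 1 is assumed when there is no schedule model
    Instrs /= IssueWidth;
  return std::max(Instrs, PRMaxCycles);
}

int main() {
  // 12 instructions on a 4-wide machine need 3 cycles, but a resource
  // that is busy for 5 cycles is the real bottleneck.
  assert(resourceDepth(12, 4, 5) == 5);
  // With the resource free sooner, issue width limits the trace instead.
  assert(resourceDepth(12, 4, 2) == 3);
}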
ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum()); ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum()); unsigned PRMax = 0; - for (unsigned K = 0; K != PRDepths.size(); ++K) { - unsigned PRCycles = PRDepths[K] + PRHeights[K]; - for (unsigned I = 0; I != Extrablocks.size(); ++I) - PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K]; - for (unsigned I = 0; I != ExtraInstrs.size(); ++I) { - const MCSchedClassDesc* SC = ExtraInstrs[I]; + + // Capture computing cycles from extra instructions + auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs, + unsigned ResourceIdx) + ->unsigned { + unsigned Cycles = 0; + for (unsigned I = 0; I != Instrs.size(); ++I) { + const MCSchedClassDesc *SC = Instrs[I]; if (!SC->isValid()) continue; for (TargetSchedModel::ProcResIter - PI = TE.MTM.SchedModel.getWriteProcResBegin(SC), - PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { - if (PI->ProcResourceIdx != K) + PI = TE.MTM.SchedModel.getWriteProcResBegin(SC), + PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); + PI != PE; ++PI) { + if (PI->ProcResourceIdx != ResourceIdx) continue; - PRCycles += (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(K)); + Cycles += + (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(ResourceIdx)); } } + return Cycles; + }; + + for (unsigned K = 0; K != PRDepths.size(); ++K) { + unsigned PRCycles = PRDepths[K] + PRHeights[K]; + for (unsigned I = 0; I != Extrablocks.size(); ++I) + PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K]; + PRCycles += extraCycles(ExtraInstrs, K); + PRCycles -= extraCycles(RemoveInstrs, K); PRMax = std::max(PRMax, PRCycles); } // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + // Instrs: #instructions in current trace outside current block. unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight; + // Add instruction count from the extra blocks. for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i) Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount; + Instrs += ExtraInstrs.size(); + Instrs -= RemoveInstrs.size(); if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) Instrs /= IW; // Assume issue width 1 without a schedule model. 
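// getResourceLength above hoists two identical per-resource loops into a
// local 'extraCycles' lambda so added and removed instructions share one
// code path. The shape of that refactor, reduced to standard C++
// (the function's return statement continues below):

#include <cassert>
#include <vector>

int main() {
  const std::vector<unsigned> Extra  = {2, 3};
  const std::vector<unsigned> Remove = {1};

  // One lambda, called twice with opposite signs, replaces two copies of
  // the summation loop.
  auto cycles = [](const std::vector<unsigned> &Instrs) {
    unsigned Sum = 0;
    for (unsigned C : Instrs)
      Sum += C;
    return Sum;
  };

  unsigned PRCycles = 10;
  PRCycles += cycles(Extra);
  PRCycles -= cycles(Remove);
  assert(PRCycles == 14);
}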
return std::max(Instrs, PRMax); } +bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr *DefMI, + const MachineInstr *UseMI) const { + if (DefMI->getParent() == UseMI->getParent()) + return true; + + const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI->getParent()->getNumber()]; + const TraceBlockInfo &TBI = TE.BlockInfo[UseMI->getParent()->getNumber()]; + + return DepTBI.isUsefulDominator(TBI); +} + void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const { OS << getName() << " ensemble:\n"; for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) { diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 8515b0f456d8..364e8e2cb3ed 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -42,10 +42,12 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; namespace { @@ -214,9 +216,9 @@ namespace { void report(const char *msg, const MachineBasicBlock *MBB, const LiveInterval &LI); void report(const char *msg, const MachineFunction *MF, - const LiveRange &LR); + const LiveRange &LR, unsigned Reg, unsigned LaneMask); void report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR); + const LiveRange &LR, unsigned Reg, unsigned LaneMask); void verifyInlineAsm(const MachineInstr *MI); @@ -229,20 +231,22 @@ namespace { void verifyLiveVariables(); void verifyLiveIntervals(); void verifyLiveInterval(const LiveInterval&); - void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned); + void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned, + unsigned); void verifyLiveRangeSegment(const LiveRange&, - const LiveRange::const_iterator I, unsigned); - void verifyLiveRange(const LiveRange&, unsigned); + const LiveRange::const_iterator I, unsigned, + unsigned); + void verifyLiveRange(const LiveRange&, unsigned, unsigned LaneMask = 0); void verifyStackFrame(); }; struct MachineVerifierPass : public MachineFunctionPass { static char ID; // Pass ID, replacement for typeid - const char *const Banner; + const std::string Banner; - MachineVerifierPass(const char *b = nullptr) - : MachineFunctionPass(ID), Banner(b) { + MachineVerifierPass(const std::string &banner = nullptr) + : MachineFunctionPass(ID), Banner(banner) { initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); } @@ -252,7 +256,7 @@ namespace { } bool runOnMachineFunction(MachineFunction &MF) override { - MF.verify(this, Banner); + MF.verify(this, Banner.c_str()); return false; } }; @@ -263,7 +267,7 @@ char MachineVerifierPass::ID = 0; INITIALIZE_PASS(MachineVerifierPass, "machineverifier", "Verify generated machine code", false, false) -FunctionPass *llvm::createMachineVerifierPass(const char *Banner) { +FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) { return new MachineVerifierPass(Banner); } @@ -275,11 +279,12 @@ void MachineFunction::verify(Pass *p, const char *Banner) const { bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { raw_ostream *OutFile = nullptr; if (OutFileName) { - std::string ErrorInfo; - OutFile = new raw_fd_ostream(OutFileName, ErrorInfo, + std::error_code EC; + OutFile = new raw_fd_ostream(OutFileName, EC, sys::fs::F_Append | sys::fs::F_Text); - 
if (!ErrorInfo.empty()) { - errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n'; + if (EC) { + errs() << "Error opening '" << OutFileName << "': " << EC.message() + << '\n'; exit(1); } @@ -292,8 +297,8 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; TM = &MF.getTarget(); - TII = TM->getInstrInfo(); - TRI = TM->getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); LiveVars = nullptr; @@ -430,15 +435,23 @@ void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, } void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR) { + const LiveRange &LR, unsigned Reg, + unsigned LaneMask) { report(msg, MBB); - *OS << "- liverange: " << LR << "\n"; + *OS << "- liverange: " << LR << '\n'; + *OS << "- register: " << PrintReg(Reg, TRI) << '\n'; + if (LaneMask != 0) + *OS << "- lanemask: " << format("%04X\n", LaneMask); } void MachineVerifier::report(const char *msg, const MachineFunction *MF, - const LiveRange &LR) { + const LiveRange &LR, unsigned Reg, + unsigned LaneMask) { report(msg, MF); - *OS << "- liverange: " << LR << "\n"; + *OS << "- liverange: " << LR << '\n'; + *OS << "- register: " << PrintReg(Reg, TRI) << '\n'; + if (LaneMask != 0) + *OS << "- lanemask: " << format("%04X\n", LaneMask); } void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { @@ -588,7 +601,11 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } } else if (TBB && !FBB && Cond.empty()) { // Block unconditionally branches somewhere. - if (MBB->succ_size() != 1+LandingPadSuccs.size()) { + // If the block has exactly one successor, that happens to be a + // landingpad, accept it as valid control flow. 
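// The verifier hunk above replaces raw_fd_ostream's out-parameter error
// string with std::error_code. A sketch of the new-style open (assumes an
// LLVM tree of this vintage; the log file name is invented; the verifier's
// branch checks continue below):

#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>

int main() {
  std::error_code EC;
  llvm::raw_fd_ostream OS("verify.log", EC,
                          llvm::sys::fs::F_Append | llvm::sys::fs::F_Text);
  if (EC) { // EC.message() replaces the old "is the string empty?" test
    llvm::errs() << "Error opening 'verify.log': " << EC.message() << '\n';
    return 1;
  }
  OS << "ok\n";
  return 0;
}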
+ if (MBB->succ_size() != 1+LandingPadSuccs.size() && + (MBB->succ_size() != 1 || LandingPadSuccs.size() != 1 || + *MBB->succ_begin() != *LandingPadSuccs.begin())) { report("MBB exits via unconditional branch but doesn't have " "exactly one CFG successor!", MBB); } else if (!MBB->isSuccessor(TBB)) { @@ -905,7 +922,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); *OS << TRI->getName(Reg) << " is not a " - << DRC->getName() << " register.\n"; + << TRI->getRegClassName(DRC) << " register.\n"; } } } else { @@ -916,13 +933,13 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TRI->getSubClassWithSubReg(RC, SubIdx); if (!SRC) { report("Invalid subregister index for virtual register", MO, MONum); - *OS << "Register class " << RC->getName() + *OS << "Register class " << TRI->getRegClassName(RC) << " does not support subreg index " << SubIdx << "\n"; return; } if (RC != SRC) { report("Invalid register class for subregister index", MO, MONum); - *OS << "Register class " << RC->getName() + *OS << "Register class " << TRI->getRegClassName(RC) << " does not fully support subreg index " << SubIdx << "\n"; return; } @@ -944,8 +961,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (!RC->hasSuperClassEq(DRC)) { report("Illegal virtual register for instruction", MO, MONum); - *OS << "Expected a " << DRC->getName() << " register, but got a " - << RC->getName() << " register\n"; + *OS << "Expected a " << TRI->getRegClassName(DRC) + << " register, but got a " << TRI->getRegClassName(RC) + << " register\n"; } } } @@ -1048,7 +1066,18 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (!regsLive.count(Reg)) { if (TargetRegisterInfo::isPhysicalRegister(Reg)) { // Reserved registers may be used even when 'dead'. - if (!isReserved(Reg)) + bool Bad = !isReserved(Reg); + // We are fine if just any subregister has a defined value. 
+ if (Bad) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); + ++SubRegs) { + if (regsLive.count(*SubRegs)) { + Bad = false; + break; + } + } + } + if (Bad) report("Using an undefined physical register", MO, MONum); } else if (MRI->def_empty(Reg)) { report("Reading virtual register without a def", MO, MONum); @@ -1349,21 +1378,22 @@ void MachineVerifier::verifyLiveIntervals() { } void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, - const VNInfo *VNI, - unsigned Reg) { + const VNInfo *VNI, unsigned Reg, + unsigned LaneMask) { if (VNI->isUnused()) return; const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def); if (!DefVNI) { - report("Valno not live at def and not marked unused", MF, LR); + report("Valno not live at def and not marked unused", MF, LR, Reg, + LaneMask); *OS << "Valno #" << VNI->id << '\n'; return; } if (DefVNI != VNI) { - report("Live segment at def has different valno", MF, LR); + report("Live segment at def has different valno", MF, LR, Reg, LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << " where valno #" << DefVNI->id << " is live\n"; return; @@ -1371,7 +1401,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); if (!MBB) { - report("Invalid definition index", MF, LR); + report("Invalid definition index", MF, LR, Reg, LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << " in " << LR << '\n'; return; @@ -1379,7 +1409,8 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (VNI->isPHIDef()) { if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { - report("PHIDef value is not defined at MBB start", MBB, LR); + report("PHIDef value is not defined at MBB start", MBB, LR, Reg, + LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << ", not at the beginning of BB#" << MBB->getNumber() << '\n'; } @@ -1389,7 +1420,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, // Non-PHI def. const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); if (!MI) { - report("No instruction at def index", MBB, LR); + report("No instruction at def index", MBB, LR, Reg, LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; return; } @@ -1408,6 +1439,9 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, !TRI->hasRegUnit(MOI->getReg(), Reg)) continue; } + if (LaneMask != 0 && + (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask) == 0) + continue; hasDef = true; if (MOI->isEarlyClobber()) isEarlyClobber = true; @@ -1422,12 +1456,13 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, // DEF slots. 
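// (Editorial aside, not part of the patch: a SlotIndex distinguishes four
// slots per instruction, queried with SlotIndex::isBlock(), isEarlyClobber(),
// isRegister() and isDead(). The two branches below verify that an
// early-clobber def sits at an early-clobber slot, while any other non-PHI
// def must sit at a register slot.)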
if (isEarlyClobber) { if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MBB, LR); + report("Early clobber def must be at an early-clobber slot", MBB, LR, + Reg, LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; } } else if (!VNI->def.isRegister()) { report("Non-PHI, non-early clobber def must be at a register slot", - MBB, LR); + MBB, LR, Reg, LaneMask); *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; } } @@ -1435,37 +1470,38 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const LiveRange::const_iterator I, - unsigned Reg) { + unsigned Reg, unsigned LaneMask) { const LiveRange::Segment &S = *I; const VNInfo *VNI = S.valno; assert(VNI && "Live segment has no valno"); if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) { - report("Foreign valno in live segment", MF, LR); + report("Foreign valno in live segment", MF, LR, Reg, LaneMask); *OS << S << " has a bad valno\n"; } if (VNI->isUnused()) { - report("Live segment valno is marked unused", MF, LR); + report("Live segment valno is marked unused", MF, LR, Reg, LaneMask); *OS << S << '\n'; } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start); if (!MBB) { - report("Bad start of live segment, no basic block", MF, LR); + report("Bad start of live segment, no basic block", MF, LR, Reg, LaneMask); *OS << S << '\n'; return; } SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); if (S.start != MBBStartIdx && S.start != VNI->def) { - report("Live segment must begin at MBB entry or valno def", MBB, LR); + report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg, + LaneMask); *OS << S << '\n'; } const MachineBasicBlock *EndMBB = LiveInts->getMBBFromIndex(S.end.getPrevSlot()); if (!EndMBB) { - report("Bad end of live segment, no basic block", MF, LR); + report("Bad end of live segment, no basic block", MF, LR, Reg, LaneMask); *OS << S << '\n'; return; } @@ -1483,14 +1519,16 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const MachineInstr *MI = LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); if (!MI) { - report("Live segment doesn't end at a valid instruction", EndMBB, LR); + report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg, + LaneMask); *OS << S << '\n'; return; } // The block slot must refer to a basic block boundary. if (S.end.isBlock()) { - report("Live segment ends at B slot of an instruction", EndMBB, LR); + report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg, + LaneMask); *OS << S << '\n'; } @@ -1498,7 +1536,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Segment ends on the dead slot. // That means there must be a dead def. 
if (!SlotIndex::isSameInstr(S.start, S.end)) { - report("Live segment ending at dead slot spans instructions", EndMBB, LR); + report("Live segment ending at dead slot spans instructions", EndMBB, LR, + Reg, LaneMask); *OS << S << '\n'; } } @@ -1508,7 +1547,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (S.end.isEarlyClobber()) { if (I+1 == LR.end() || (I+1)->start != S.end) { report("Live segment ending at early clobber slot must be " - "redefined by an EC def in the same instruction", EndMBB, LR); + "redefined by an EC def in the same instruction", EndMBB, LR, Reg, + LaneMask); *OS << S << '\n'; } } @@ -1519,16 +1559,27 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // A live segment can end with either a redefinition, a kill flag on a // use, or a dead flag on a def. bool hasRead = false; + bool hasSubRegDef = false; for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) { if (!MOI->isReg() || MOI->getReg() != Reg) continue; + if (LaneMask != 0 && + (LaneMask & TRI->getSubRegIndexLaneMask(MOI->getSubReg())) == 0) + continue; + if (MOI->isDef() && MOI->getSubReg() != 0) + hasSubRegDef = true; if (MOI->readsReg()) hasRead = true; } if (!S.end.isDead()) { if (!hasRead) { - report("Instruction ending live segment doesn't read the register", MI); - *OS << S << " in " << LR << '\n'; + // When tracking subregister liveness, the main range must start new + // values on partial register writes, even if there is no read. + if (!MRI->tracksSubRegLiveness() || LaneMask != 0 || !hasSubRegDef) { + report("Instruction ending live segment doesn't read the register", + MI); + *OS << S << " in " << LR << '\n'; + } } } } @@ -1566,7 +1617,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // All predecessors must have a live-out value. if (!PVNI) { - report("Register not marked live out of predecessor", *PI, LR); + report("Register not marked live out of predecessor", *PI, LR, Reg, + LaneMask); *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before " << PEnd << '\n'; @@ -1575,7 +1627,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Only PHI-defs can take different predecessor values. 
if (!IsPHI && PVNI != VNI) { - report("Different value live out of predecessor", *PI, LR); + report("Different value live out of predecessor", *PI, LR, Reg, + LaneMask); *OS << "Valno #" << PVNI->id << " live out of BB#" << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() @@ -1588,18 +1641,36 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, } } -void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) { - for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end(); - I != E; ++I) - verifyLiveRangeValue(LR, *I, Reg); +void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg, + unsigned LaneMask) { + for (const VNInfo *VNI : LR.valnos) + verifyLiveRangeValue(LR, VNI, Reg, LaneMask); for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I) - verifyLiveRangeSegment(LR, I, Reg); + verifyLiveRangeSegment(LR, I, Reg, LaneMask); } void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { verifyLiveRange(LI, LI.reg); + unsigned Reg = LI.reg; + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + unsigned Mask = 0; + unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if ((Mask & SR.LaneMask) != 0) + report("Lane masks of sub ranges overlap in live interval", MF, LI); + if ((SR.LaneMask & ~MaxMask) != 0) + report("Subrange lanemask is invalid", MF, LI); + Mask |= SR.LaneMask; + verifyLiveRange(SR, LI.reg, SR.LaneMask); + if (!LI.covers(SR)) + report("A Subrange is not covered by the main range", MF, LI); + } + } else if (LI.hasSubRanges()) { + report("subregister liveness only allowed for virtual registers", MF, LI); + } + // Check the LI only has one connected component. if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { ConnectedVNInfoEqClasses ConEQ(*LiveInts); diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 95a2934afb40..a1042e720c37 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "phi-opt" @@ -66,7 +67,7 @@ bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { return false; MRI = &Fn.getRegInfo(); - TII = Fn.getTarget().getInstrInfo(); + TII = Fn.getSubtarget().getInstrInfo(); // Find dead PHI cycles and PHI cycles that can be replaced by a single // value. InstCombine does these optimizations, but DAG legalization may @@ -91,7 +92,7 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, unsigned DstReg = MI->getOperand(0).getReg(); // See if we already saw this register. - if (!PHIsInCycle.insert(MI)) + if (!PHIsInCycle.insert(MI).second) return true; // Don't scan crazily complex things. @@ -136,7 +137,7 @@ bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) { "PHI destination is not a virtual register"); // See if we already saw this register. - if (!PHIsInCycle.insert(MI)) + if (!PHIsInCycle.insert(MI).second) return true; // Don't scan crazily complex things. 
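(Editorial aside, not part of the patch: the subrange checks added to MachineVerifier::verifyLiveInterval in the hunk above boil down to a running-mask walk over the subranges. A minimal self-contained sketch of the two mask conditions, with hypothetical names:)

#include <cstdint>
#include <vector>

// Stand-in for the lane mask carried by a LiveInterval::SubRange.
struct SubRangeMask { uint32_t LaneMask; };

// Mirrors the two report() conditions: subrange lane masks must be
// pairwise disjoint and must stay within the vreg's maximum lane mask.
bool subRangeMasksValid(const std::vector<SubRangeMask> &SubRanges,
                        uint32_t MaxMask) {
  uint32_t Seen = 0;
  for (const SubRangeMask &SR : SubRanges) {
    if (Seen & SR.LaneMask)
      return false; // lane masks of subranges overlap
    if (SR.LaneMask & ~MaxMask)
      return false; // subrange lane mask is invalid for this vreg
    Seen |= SR.LaneMask;
  }
  return true;
}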
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index c8d0819ad13c..def2e3d48ac0 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -150,9 +150,7 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { Changed |= EliminatePHINodes(MF, *I); // Remove dead IMPLICIT_DEF instructions. - for (SmallPtrSet<MachineInstr*, 4>::iterator I = ImpDefs.begin(), - E = ImpDefs.end(); I != E; ++I) { - MachineInstr *DefMI = *I; + for (MachineInstr *DefMI : ImpDefs) { unsigned DefReg = DefMI->getOperand(0).getReg(); if (MRI->use_nodbg_empty(DefReg)) { if (LIS) @@ -240,7 +238,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // Insert a register to register copy at the top of the current block (but // after any remaining phi nodes) which copies the new incoming register // into the phi node destination. - const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); if (isSourceDefinedByImplicitDef(MPhi, MRI)) // If all sources of a PHI node are implicit_def, just emit an // implicit_def instead of a copy. @@ -369,7 +367,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // Check to make sure we haven't already emitted the copy for this block. // This can happen because PHI nodes may have multiple entries for the same // basic block. - if (!MBBsInsertedInto.insert(&opBlock)) + if (!MBBsInsertedInto.insert(&opBlock).second) continue; // If the copy has already been emitted, we're done. 
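// (Editorial aside, not part of the patch: SmallPtrSet::insert() now returns
// std::pair<iterator, bool> like the standard containers, which is why every
// old "if (!Set.insert(X))" dedup check in this patch becomes:
//   if (!MBBsInsertedInto.insert(&opBlock).second)
//     continue; // already emitted a copy for this block
// The .second flag is true only when the element was newly inserted.)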
// Find a safe location to insert the copy, this may be the first terminator diff --git a/lib/CodeGen/PHIEliminationUtils.h b/lib/CodeGen/PHIEliminationUtils.h index 48234ae0fbdd..b997d7ac5f4f 100644 --- a/lib/CodeGen/PHIEliminationUtils.h +++ b/lib/CodeGen/PHIEliminationUtils.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_PHIELIMINATIONUTILS_H -#define LLVM_CODEGEN_PHIELIMINATIONUTILS_H +#ifndef LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H +#define LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H #include "llvm/CodeGen/MachineBasicBlock.h" diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 249b2d0f6bb9..28e9f847a9d7 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -27,6 +27,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/SymbolRewriter.h" using namespace llvm; @@ -71,6 +72,8 @@ static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden, cl::desc("Disable Codegen Prepare")); static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden, cl::desc("Disable Copy Propagation pass")); +static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining", + cl::Hidden, cl::desc("Disable Partial Libcall Inlining")); static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden, cl::desc("Print LLVM IR produced by the loop-reduce pass")); static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden, @@ -97,6 +100,10 @@ static cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden, static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden, cl::desc("Run live interval analysis earlier in the pipeline")); +static cl::opt<bool> UseCFLAA("use-cfl-aa-in-codegen", + cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental CFL alias analysis in CodeGen")); + /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. @@ -228,8 +235,8 @@ TargetPassConfig::~TargetPassConfig() { // registers all common codegen passes. TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), StartAfter(nullptr), StopAfter(nullptr), - Started(true), Stopped(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), + Started(true), Stopped(false), AddingMachinePasses(false), TM(tm), + Impl(nullptr), Initialized(false), DisableVerify(false), EnableTailMerge(true) { Impl = new PassConfigImpl(); @@ -297,7 +304,7 @@ IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const { /// a later pass or that it should stop after an earlier pass, then do not add /// the pass. Finally, compare the current pass against the StartAfter /// and StopAfter options and change the Started/Stopped flags accordingly. -void TargetPassConfig::addPass(Pass *P) { +void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) { assert(!Initialized && "PassConfig is immutable"); // Cache the Pass ID here in case the pass manager finds this pass is @@ -306,10 +313,21 @@ void TargetPassConfig::addPass(Pass *P) { // and shouldn't reference it. AnalysisID PassID = P->getPassID(); - if (Started && !Stopped) + if (Started && !Stopped) { + std::string Banner; + // Construct banner message before PM->add() as that may delete the pass. 
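// (Editorial aside, not part of the patch: PM->add() takes ownership of P and
// may free it immediately, so reading P->getPassName() afterwards would be a
// use-after-free. The safe ordering, as the code below follows:
//   std::string Banner = std::string("After ") + std::string(P->getPassName());
//   PM->add(P);           // P may be deleted here
//   addPrintPass(Banner); // safe: Banner owns its own storage
// )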
+ if (AddingMachinePasses && (printAfter || verifyAfter)) + Banner = std::string("After ") + std::string(P->getPassName()); PM->add(P); - else + if (AddingMachinePasses) { + if (printAfter) + addPrintPass(Banner); + if (verifyAfter) + addVerifyPass(Banner); + } + } else { delete P; + } if (StopAfter == PassID) Stopped = true; if (StartAfter == PassID) @@ -323,7 +341,8 @@ /// /// addPass cannot return a pointer to the pass instance because it is internal to /// the PassManager and the instance we create here may already be freed. -AnalysisID TargetPassConfig::addPass(AnalysisID PassID) { +AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter, + bool printAfter) { IdentifyingPassPtr TargetID = getPassSubstitution(PassID); IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID); if (!FinalPtr.isValid()) @@ -338,7 +357,7 @@ llvm_unreachable("Pass ID not registered"); } AnalysisID FinalID = P->getPassID(); - addPass(P); // Ends the lifetime of P. + addPass(P, verifyAfter, printAfter); // Ends the lifetime of P. // Add the passes after the pass P if there are any. for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator @@ -353,18 +372,25 @@ NP = Pass::createPass((*I).second.getID()); assert(NP && "Pass ID not registered"); } - addPass(NP); + addPass(NP, false, false); } } return FinalID; } -void TargetPassConfig::printAndVerify(const char *Banner) { +void TargetPassConfig::printAndVerify(const std::string &Banner) { + addPrintPass(Banner); + addVerifyPass(Banner); +} + +void TargetPassConfig::addPrintPass(const std::string &Banner) { if (TM->shouldPrintMachineCode()) - addPass(createMachineFunctionPrinterPass(dbgs(), Banner)); + PM->add(createMachineFunctionPrinterPass(dbgs(), Banner)); +} +void TargetPassConfig::addVerifyPass(const std::string &Banner) { if (VerifyMachineCode) - addPass(createMachineVerifierPass(Banner)); + PM->add(createMachineVerifierPass(Banner)); } /// Add common target configurable passes that perform LLVM IR to IR transforms @@ -374,7 +400,10 @@ void TargetPassConfig::addIRPasses() { // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. + if (UseCFLAA) + addPass(createCFLAliasAnalysisPass()); addPass(createTypeBasedAliasAnalysisPass()); + addPass(createScopedNoAliasAAPass()); addPass(createBasicAliasAnalysisPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. @@ -399,6 +428,9 @@ void TargetPassConfig::addIRPasses() { // Prepare expensive constants for SelectionDAG. if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); + + if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) + addPass(createPartiallyInlineLibCallsPass()); } /// Turn exception handling constructs into something the code generators can @@ -416,7 +448,8 @@ void TargetPassConfig::addPassesToHandleExceptions() { // FALLTHROUGH case ExceptionHandling::DwarfCFI: case ExceptionHandling::ARM: - case ExceptionHandling::WinEH: + case ExceptionHandling::ItaniumWinEH: + case ExceptionHandling::MSVC: // FIXME: Needs preparation.
addPass(createDwarfEHPass(TM)); break; case ExceptionHandling::None: @@ -433,6 +466,7 @@ void TargetPassConfig::addPassesToHandleExceptions() { void TargetPassConfig::addCodeGenPrepare() { if (getOptLevel() != CodeGenOpt::None && !DisableCGP) addPass(createCodeGenPreparePass(TM)); + addPass(createRewriteSymbolsPass()); } /// Add common passes that perform LLVM IR to IR transforms in preparation for @@ -477,6 +511,8 @@ void TargetPassConfig::addISelPrepare() { /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection /// before/after any target-independent pass. But it's currently overkill. void TargetPassConfig::addMachinePasses() { + AddingMachinePasses = true; + // Insert a machine instr printer pass after the specified pass. // If -print-machineinstrs specified, print machineinstrs after all passes. if (StringRef(PrintMachineInstrs.getValue()).equals("")) @@ -485,7 +521,7 @@ void TargetPassConfig::addMachinePasses() { .equals("option-unspecified")) { const PassRegistry *PR = PassRegistry::getPassRegistry(); const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue()); - const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs")); + const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer")); assert (TPI && IPI && "Pass ID not registered!"); const char *TID = (const char *)(TPI->getTypeInfo()); const char *IID = (const char *)(IPI->getTypeInfo()); @@ -496,8 +532,7 @@ void TargetPassConfig::addMachinePasses() { printAndVerify("After Instruction Selection"); // Expand pseudo-instructions emitted by ISel. - if (addPass(&ExpandISelPseudosID)) - printAndVerify("After ExpandISelPseudos"); + addPass(&ExpandISelPseudosID); // Add passes that optimize machine instructions in SSA form. if (getOptLevel() != CodeGenOpt::None) { @@ -505,12 +540,11 @@ void TargetPassConfig::addMachinePasses() { } else { // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(&LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID, false); } // Run pre-ra passes. - if (addPreRegAlloc()) - printAndVerify("After PreRegAlloc passes"); + addPreRegAlloc(); // Run register allocation and passes that are tightly coupled with it, // including phi elimination and scheduling. @@ -520,12 +554,10 @@ void TargetPassConfig::addMachinePasses() { addFastRegAlloc(createRegAllocPass(false)); // Run post-ra passes. - if (addPostRegAlloc()) - printAndVerify("After PostRegAlloc passes"); + addPostRegAlloc(); // Insert prolog/epilog code. Eliminate abstract frame index references... addPass(&PrologEpilogCodeInserterID); - printAndVerify("After PrologEpilogCodeInserter"); /// Add passes that optimize machine instructions after register allocation. if (getOptLevel() != CodeGenOpt::None) @@ -533,11 +565,9 @@ void TargetPassConfig::addMachinePasses() { // Expand pseudo instructions before second scheduling pass. addPass(&ExpandPostRAPseudosID); - printAndVerify("After ExpandPostRAPseudos"); // Run pre-sched2 passes. - if (addPreSched2()) - printAndVerify("After PreSched2 passes"); + addPreSched2(); // Second pass scheduler. 
if (getOptLevel() != CodeGenOpt::None) { @@ -545,63 +575,61 @@ void TargetPassConfig::addMachinePasses() { addPass(&PostMachineSchedulerID); else addPass(&PostRASchedulerID); - printAndVerify("After PostRAScheduler"); } // GC if (addGCPasses()) { if (PrintGCInfo) - addPass(createGCInfoPrinter(dbgs())); + addPass(createGCInfoPrinter(dbgs()), false, false); } // Basic block placement. if (getOptLevel() != CodeGenOpt::None) addBlockPlacement(); - if (addPreEmitPass()) - printAndVerify("After PreEmit passes"); + addPreEmitPass(); - addPass(&StackMapLivenessID); + addPass(&StackMapLivenessID, false); + + AddingMachinePasses = false; } /// Add passes that optimize machine instructions in SSA form. void TargetPassConfig::addMachineSSAOptimization() { // Pre-ra tail duplication. - if (addPass(&EarlyTailDuplicateID)) - printAndVerify("After Pre-RegAlloc TailDuplicate"); + addPass(&EarlyTailDuplicateID); // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. - addPass(&OptimizePHIsID); + addPass(&OptimizePHIsID, false); // This pass merges large allocas. StackSlotColoring is a different pass // which merges spill slots. - addPass(&StackColoringID); + addPass(&StackColoringID, false); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. - addPass(&LocalStackSlotAllocationID); + addPass(&LocalStackSlotAllocationID, false); // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). addPass(&DeadMachineInstructionElimID); - printAndVerify("After codegen DCE pass"); // Allow targets to insert passes that improve instruction level parallelism, // like if-conversion. Such passes will typically need dominator trees and // loop info, just like LICM and CSE below. - if (addILPOpts()) - printAndVerify("After ILP optimizations"); + addILPOpts(); - addPass(&MachineLICMID); - addPass(&MachineCSEID); + addPass(&MachineLICMID, false); + addPass(&MachineCSEID, false); addPass(&MachineSinkingID); - printAndVerify("After Machine LICM, CSE and Sinking passes"); - addPass(&PeepholeOptimizerID); - printAndVerify("After codegen peephole optimization pass"); + addPass(&PeepholeOptimizerID, false); + // Clean up the dead code that may have been generated by peephole + // rewriting. + addPass(&DeadMachineInstructionElimID); } //===---------------------------------------------------------------------===// @@ -675,21 +703,26 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) { return createTargetRegisterAllocator(Optimized); } +/// Return true if the default global register allocator is in use and +/// has not been overridden on the command line with '-regalloc=...' +bool TargetPassConfig::usingDefaultRegAlloc() const { + return RegAlloc.getNumOccurrences() == 0; +} + /// Add the minimum set of target-independent passes that are required for /// register allocation. No coalescing or scheduling.
void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - addPass(&PHIEliminationID); - addPass(&TwoAddressInstructionPassID); + addPass(&PHIEliminationID, false); + addPass(&TwoAddressInstructionPassID, false); addPass(RegAllocPass); - printAndVerify("After Register Allocation"); } /// Add standard target-independent passes that are tightly coupled with /// optimized register allocation, including coalescing, machine instruction /// scheduling, and register allocation itself. void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - addPass(&ProcessImplicitDefsID); + addPass(&ProcessImplicitDefsID, false); // LiveVariables currently requires pure SSA form. // @@ -697,34 +730,30 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // LiveVariables can be removed completely, and LiveIntervals can be directly // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). - addPass(&LiveVariablesID); + addPass(&LiveVariablesID, false); // Edge splitting is smarter with machine loop info. - addPass(&MachineLoopInfoID); - addPass(&PHIEliminationID); + addPass(&MachineLoopInfoID, false); + addPass(&PHIEliminationID, false); // Eventually, we want to run LiveIntervals before PHI elimination. if (EarlyLiveIntervals) - addPass(&LiveIntervalsID); + addPass(&LiveIntervalsID, false); - addPass(&TwoAddressInstructionPassID); + addPass(&TwoAddressInstructionPassID, false); addPass(&RegisterCoalescerID); // PreRA instruction scheduling. - if (addPass(&MachineSchedulerID)) - printAndVerify("After Machine Scheduling"); + addPass(&MachineSchedulerID); // Add the selected register allocation pass. addPass(RegAllocPass); - printAndVerify("After Register Allocation, before rewriter"); // Allow targets to change the register assignments before rewriting. - if (addPreRewrite()) - printAndVerify("After pre-rewrite passes"); + addPreRewrite(); // Finally rewrite virtual registers. addPass(&VirtRegRewriterID); - printAndVerify("After Virtual Register Rewriter"); // Perform stack slot coloring and post-ra machine LICM. // @@ -736,8 +765,6 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // // FIXME: can this move into MachineLateOptimization? addPass(&PostRAMachineLICMID); - - printAndVerify("After StackSlotColoring and postra Machine LICM"); } //===---------------------------------------------------------------------===// @@ -747,34 +774,30 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { // Branch folding must be run after regalloc and prolog/epilog insertion. - if (addPass(&BranchFolderPassID)) - printAndVerify("After BranchFolding"); + addPass(&BranchFolderPassID); // Tail duplication. // Note that duplicating tail just increases code size and degrades // performance for targets that require Structured Control Flow. // In addition it can also make CFG irreducible. Thus we disable it. - if (!TM->requiresStructuredCFG() && addPass(&TailDuplicateID)) - printAndVerify("After TailDuplicate"); + if (!TM->requiresStructuredCFG()) + addPass(&TailDuplicateID); // Copy propagation. - if (addPass(&MachineCopyPropagationID)) - printAndVerify("After copy propagation pass"); + addPass(&MachineCopyPropagationID); } /// Add standard GC passes. 
bool TargetPassConfig::addGCPasses() { - addPass(&GCMachineCodeAnalysisID); + addPass(&GCMachineCodeAnalysisID, false); return true; } /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { - if (addPass(&MachineBlockPlacementID)) { + if (addPass(&MachineBlockPlacementID, false)) { // Run a separate pass to collect block placement statistics. if (EnableBlockPlacementStats) addPass(&MachineBlockPlacementStatsID); - - printAndVerify("After machine block placement."); } } diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 716cb1f46f15..283d1f26198b 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -46,7 +46,7 @@ // if it loads to virtual registers and the virtual register defined has // a single use. // -// - Optimize Copies and Bitcast: +// - Optimize Copies and Bitcast (more generally, target specific copies): // // Rewrite copies and bitcasts to avoid cross register bank copies // when possible. @@ -78,6 +78,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <utility> using namespace llvm; #define DEBUG_TYPE "peephole-opt" @@ -92,7 +94,7 @@ DisablePeephole("disable-peephole", cl::Hidden, cl::init(false), cl::desc("Disable the peephole optimizer")); static cl::opt<bool> -DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(true), +DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); STATISTIC(NumReuse, "Number of extension results reused"); @@ -100,12 +102,13 @@ STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); STATISTIC(NumLoadFold, "Number of loads folded"); STATISTIC(NumSelects, "Number of selects optimized"); -STATISTIC(NumCopiesBitcasts, "Number of copies/bitcasts optimized"); +STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized"); +STATISTIC(NumRewrittenCopies, "Number of copies rewritten"); namespace { class PeepholeOptimizer : public MachineFunctionPass { - const TargetMachine *TM; const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; MachineDominatorTree *DT; // Machine dominator tree @@ -129,9 +132,15 @@ namespace { private: bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, - SmallPtrSet<MachineInstr*, 8> &LocalMIs); - bool optimizeSelect(MachineInstr *MI); + SmallPtrSetImpl<MachineInstr*> &LocalMIs); + bool optimizeSelect(MachineInstr *MI, + SmallPtrSetImpl<MachineInstr *> &LocalMIs); + bool optimizeCondBranch(MachineInstr *MI); bool optimizeCopyOrBitcast(MachineInstr *MI); + bool optimizeCoalescableCopy(MachineInstr *MI); + bool optimizeUncoalescableCopy(MachineInstr *MI, + SmallPtrSetImpl<MachineInstr *> &LocalMIs); + bool findNextSource(unsigned &Reg, unsigned &SubReg); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); @@ -140,6 +149,25 @@ namespace { DenseMap<unsigned, MachineInstr*> &ImmDefMIs); bool isLoadFoldable(MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates); + + /// \brief Check whether \p MI is understood by the register coalescer + /// but may require some rewriting. 
+ bool isCoalescableCopy(const MachineInstr &MI) { + // SubregToRegs are not interesting, because they are already register + // coalescer friendly. + return MI.isCopy() || (!DisableAdvCopyOpt && + (MI.isRegSequence() || MI.isInsertSubreg() || + MI.isExtractSubreg())); + } + + /// \brief Check whether \p MI is a copy like instruction that is + /// not recognized by the register coalescer. + bool isUncoalescableCopy(const MachineInstr &MI) { + return MI.isBitcast() || + (!DisableAdvCopyOpt && + (MI.isRegSequenceLike() || MI.isInsertSubregLike() || + MI.isExtractSubregLike())); + } }; /// \brief Helper class to track the possible sources of a value defined by @@ -176,63 +204,87 @@ namespace { /// the ValueTracker class but that would have complicated the code of /// the users of this class. bool UseAdvancedTracking; - /// Optional MachineRegisterInfo used to perform some complex + /// MachineRegisterInfo used to perform tracking. + const MachineRegisterInfo &MRI; + /// Optional TargetInstrInfo used to perform some complex /// tracking. - const MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; /// \brief Dispatcher to the right underlying implementation of /// getNextSource. - bool getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for Copy instructions. - bool getNextSourceFromCopy(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromCopy(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for Bitcast instructions. - bool getNextSourceFromBitcast(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromBitcast(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for RegSequence /// instructions. - bool getNextSourceFromRegSequence(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromRegSequence(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for InsertSubreg /// instructions. - bool getNextSourceFromInsertSubreg(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromInsertSubreg(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for ExtractSubreg /// instructions. - bool getNextSourceFromExtractSubreg(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromExtractSubreg(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Specialized version of getNextSource for SubregToReg /// instructions. - bool getNextSourceFromSubregToReg(unsigned &SrcIdx, unsigned &SrcSubReg); + bool getNextSourceFromSubregToReg(unsigned &SrcReg, unsigned &SrcSubReg); public: - /// \brief Create a ValueTracker instance for the value defines by \p MI - /// at the operand index \p DefIdx. + /// \brief Create a ValueTracker instance for the value defined by \p Reg. /// \p DefSubReg represents the sub register index the value tracker will - /// track. It does not need to match the sub register index used in \p MI. + /// track. It does not need to match the sub register index used in the + /// definition of \p Reg. /// \p UseAdvancedTracking specifies whether or not the value tracker looks /// through complex instructions. By default (false), it handles only copy /// and bitcast instructions. - /// \p MRI useful to perform some complex checks. + /// If \p Reg is a physical register, a value tracker constructed with + /// this constructor will not find any alternative source. 
+ /// Indeed, when \p Reg is a physical register that constructor does not + /// know which definition of \p Reg it should track. + /// Use the next constructor to track a physical register. + ValueTracker(unsigned Reg, unsigned DefSubReg, + const MachineRegisterInfo &MRI, + bool UseAdvancedTracking = false, + const TargetInstrInfo *TII = nullptr) + : Def(nullptr), DefIdx(0), DefSubReg(DefSubReg), Reg(Reg), + UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) { + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { + Def = MRI.getVRegDef(Reg); + DefIdx = MRI.def_begin(Reg).getOperandNo(); + } + } + + /// \brief Create a ValueTracker instance for the value defined by + /// the pair \p MI, \p DefIdx. + /// Unlike the other constructor, the value tracker produced by this one + /// may be able to find a new source when the definition is a physical + /// register. + /// This could be useful to rewrite target specific instructions into + /// generic copy instructions. ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg, + const MachineRegisterInfo &MRI, bool UseAdvancedTracking = false, - const MachineRegisterInfo *MRI = nullptr) + const TargetInstrInfo *TII = nullptr) : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg), - UseAdvancedTracking(UseAdvancedTracking), MRI(MRI) { - assert(Def->getOperand(DefIdx).isDef() && - Def->getOperand(DefIdx).isReg() && - "Definition does not match machine instruction"); - // Initially the value is in the defined register. + UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) { + assert(DefIdx < Def->getDesc().getNumDefs() && + Def->getOperand(DefIdx).isReg() && "Invalid definition"); Reg = Def->getOperand(DefIdx).getReg(); } /// \brief Following the use-def chain, get the next available source /// for the tracked value. - /// When the returned value is not nullptr, getReg() gives the register + /// When the returned value is not nullptr, \p SrcReg gives the register /// that contain the tracked value. /// \note The sub register index returned in \p SrcSubReg must be used - /// on that getReg() to access the actual value. + /// on \p SrcReg to access the actual value. /// \return Unless the returned value is nullptr (i.e., no source found), - /// \p SrcIdx gives the index of the next source in the returned - /// instruction and \p SrcSubReg the index to be used on that source to - /// get the tracked value. When nullptr is returned, no alternative source - /// has been found. - const MachineInstr *getNextSource(unsigned &SrcIdx, unsigned &SrcSubReg); + /// \p SrcReg gives the register of the next source used in the returned + /// instruction and \p SrcSubReg the sub-register index to be used on that + /// source to get the tracked value. When nullptr is returned, no + /// alternative source has been found. + const MachineInstr *getNextSource(unsigned &SrcReg, unsigned &SrcSubReg); /// \brief Get the last register where the initial value can be found. /// Initially this is the register of the definition. @@ -261,7 +313,7 @@ INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", /// debug uses. 
bool PeepholeOptimizer:: optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, - SmallPtrSet<MachineInstr*, 8> &LocalMIs) { + SmallPtrSetImpl<MachineInstr*> &LocalMIs) { unsigned SrcReg, DstReg, SubIdx; if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx)) return false; @@ -277,7 +329,7 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, // Ensure DstReg can get a register class that actually supports // sub-registers. Don't change the class until we commit. const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx); + DstRC = TRI->getSubClassWithSubReg(DstRC, SubIdx); if (!DstRC) return false; @@ -286,8 +338,8 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, // register. // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of // SrcReg:SubIdx should be replaced. - bool UseSrcSubIdx = TM->getRegisterInfo()-> - getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr; + bool UseSrcSubIdx = + TRI->getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr; // The source has other uses. See if we can replace the other uses with use of // the result of the extension. @@ -431,7 +483,8 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, } /// Optimize a select instruction. -bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) { +bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI, + SmallPtrSetImpl<MachineInstr *> &LocalMIs) { unsigned TrueOp = 0; unsigned FalseOp = 0; bool Optimizable = false; @@ -440,13 +493,19 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) { return false; if (!Optimizable) return false; - if (!TII->optimizeSelect(MI)) + if (!TII->optimizeSelect(MI, LocalMIs)) return false; MI->eraseFromParent(); ++NumSelects; return true; } +/// \brief Check if a simpler conditional branch can be generated. +bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) { + return TII->optimizeCondBranch(MI); +} + /// \brief Check if the registers defined by the pair (RegisterClass, SubReg) /// share the same register file. static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, @@ -477,85 +536,34 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; } -/// \brief Get the index of the definition and source for \p Copy -/// instruction. -/// \pre Copy.isCopy() or Copy.isBitcast(). -/// \return True if the Copy instruction has only one register source -/// and one register definition. Otherwise, \p DefIdx and \p SrcIdx -/// are invalid. -static bool getCopyOrBitcastDefUseIdx(const MachineInstr &Copy, - unsigned &DefIdx, unsigned &SrcIdx) { - assert((Copy.isCopy() || Copy.isBitcast()) && "Wrong operation type."); - if (Copy.isCopy()) { - // Copy instruction are supposed to be: Def = Src. - if (Copy.getDesc().getNumOperands() != 2) - return false; - DefIdx = 0; - SrcIdx = 1; - assert(Copy.getOperand(DefIdx).isDef() && "Use comes before def!"); - return true; - } - // Bitcast case. - // Bitcasts with more than one def are not supported. - if (Copy.getDesc().getNumDefs() != 1) - return false; - // Initialize SrcIdx to an undefined operand. - SrcIdx = Copy.getDesc().getNumOperands(); - for (unsigned OpIdx = 0, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; ++OpIdx) { - const MachineOperand &MO = Copy.getOperand(OpIdx); - if (!MO.isReg() || !MO.getReg()) - continue; - if (MO.isDef()) - DefIdx = OpIdx; - else if (SrcIdx != EndOpIdx) - // Multiple sources?
- return false; - SrcIdx = OpIdx; - } - return true; -} - -/// \brief Optimize a copy or bitcast instruction to avoid cross -/// register bank copy. The optimization looks through a chain of -/// copies and try to find a source that has a compatible register -/// class. -/// Two register classes are considered to be compatible if they share -/// the same register bank. -/// New copies issued by this optimization are register allocator -/// friendly. This optimization does not remove any copy as it may -/// overconstraint the register allocator, but replaces some when -/// possible. -/// \pre \p MI is a Copy (MI->isCopy() is true) -/// \return True, when \p MI has been optimized. In that case, \p MI has -/// been removed from its parent. -bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) { - unsigned DefIdx, SrcIdx; - if (!MI || !getCopyOrBitcastDefUseIdx(*MI, DefIdx, SrcIdx)) +/// \brief Try to find the next source that shares the same register file +/// for the value defined by \p Reg and \p SubReg. +/// When true is returned, \p Reg and \p SubReg are updated with the +/// register number and sub-register index of the new source. +/// \return False if no alternative sources are available. True otherwise. +bool PeepholeOptimizer::findNextSource(unsigned &Reg, unsigned &SubReg) { + // Do not try to find a new source for a physical register. + // So far we do not have any motivating example for doing that. + // Thus, instead of maintaining untested code, we will revisit that if + // that changes at some point. + if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - const MachineOperand &MODef = MI->getOperand(DefIdx); - assert(MODef.isReg() && "Copies must be between registers."); - unsigned Def = MODef.getReg(); - - if (TargetRegisterInfo::isPhysicalRegister(Def)) - return false; - - const TargetRegisterClass *DefRC = MRI->getRegClass(Def); - unsigned DefSubReg = MODef.getSubReg(); + const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); + unsigned DefSubReg = SubReg; unsigned Src; unsigned SrcSubReg; bool ShouldRewrite = false; - const TargetRegisterInfo &TRI = *TM->getRegisterInfo(); // Follow the chain of copies until we reach the top of the use-def chain // or find a more suitable source. - ValueTracker ValTracker(*MI, DefIdx, DefSubReg, !DisableAdvCopyOpt, MRI); + ValueTracker ValTracker(Reg, DefSubReg, *MRI, !DisableAdvCopyOpt, TII); do { - unsigned CopySrcIdx, CopySrcSubReg; - if (!ValTracker.getNextSource(CopySrcIdx, CopySrcSubReg)) + unsigned CopySrcReg, CopySrcSubReg; + if (!ValTracker.getNextSource(CopySrcReg, CopySrcSubReg)) break; - Src = ValTracker.getReg(); + Src = CopySrcReg; SrcSubReg = CopySrcSubReg; // Do not extend the live-ranges of physical registers as they add @@ -569,29 +577,411 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) { const TargetRegisterClass *SrcRC = MRI->getRegClass(Src); // If this source does not incur a cross register bank copy, use it. - ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC, + ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, DefSubReg, SrcRC, SrcSubReg); } while (!ShouldRewrite); // If we did not find a more suitable source, there is nothing to optimize. - if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg()) + if (!ShouldRewrite || Src == Reg) + return false; + + Reg = Src; + SubReg = SrcSubReg; + return true; +} + +namespace { +/// \brief Helper class to rewrite the arguments of a copy-like instruction.
+class CopyRewriter { +protected: + /// The copy-like instruction. + MachineInstr &CopyLike; + /// The index of the source being rewritten. + unsigned CurrentSrcIdx; + +public: + CopyRewriter(MachineInstr &MI) : CopyLike(MI), CurrentSrcIdx(0) {} + + virtual ~CopyRewriter() {} + + /// \brief Get the next rewritable source (SrcReg, SrcSubReg) and + /// the related value that it affects (TrackReg, TrackSubReg). + /// A source is considered rewritable if its register class and the + /// register class of the related TrackReg may not be register + /// coalescer friendly. In other words, given a copy-like instruction, + /// not all the arguments may be returned as rewritable sources, since + /// some arguments are known to be register coalescer friendly. + /// + /// Each call of this method moves the current source to the next + /// rewritable source. + /// For instance, let CopyLike be the instruction to rewrite. + /// CopyLike has one definition and one source: + /// dst.dstSubIdx = CopyLike src.srcSubIdx. + /// + /// The first call will give the first rewritable source, i.e., + /// the only source this instruction has: + /// (SrcReg, SrcSubReg) = (src, srcSubIdx). + /// This source defines the whole definition, i.e., + /// (TrackReg, TrackSubReg) = (dst, dstSubIdx). + /// + /// The second and subsequent calls will return false, as there is only one + /// rewritable source. + /// + /// \return True if a rewritable source has been found, false otherwise. + /// The output arguments are valid if and only if true is returned. + virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) { + // If CurrentSrcIdx == 1, this means this function has already been + // called once. CopyLike has one definition and one argument, thus, + // there is nothing else to rewrite. + if (!CopyLike.isCopy() || CurrentSrcIdx == 1) + return false; + // This is the first call to getNextRewritableSource. + // Move the CurrentSrcIdx to remember that we made that call. + CurrentSrcIdx = 1; + // The rewritable source is the argument. + const MachineOperand &MOSrc = CopyLike.getOperand(1); + SrcReg = MOSrc.getReg(); + SrcSubReg = MOSrc.getSubReg(); + // What we track are the alternative sources of the definition. + const MachineOperand &MODef = CopyLike.getOperand(0); + TrackReg = MODef.getReg(); + TrackSubReg = MODef.getSubReg(); + return true; + } + + /// \brief Rewrite the current source with \p NewReg and \p NewSubReg + /// if possible. + /// \return True if the rewriting was possible, false otherwise. + virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) { + if (!CopyLike.isCopy() || CurrentSrcIdx != 1) + return false; + MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx); + MOSrc.setReg(NewReg); + MOSrc.setSubReg(NewSubReg); + return true; + } +}; + +/// \brief Specialized rewriter for INSERT_SUBREG instruction. +class InsertSubregRewriter : public CopyRewriter { +public: + InsertSubregRewriter(MachineInstr &MI) : CopyRewriter(MI) { + assert(MI.isInsertSubreg() && "Invalid instruction"); + } + + /// \brief See CopyRewriter::getNextRewritableSource. + /// Here CopyLike has the following form: + /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx. + /// Src1 has the same register class as dst, hence, there is + /// nothing to rewrite. + /// Src2.src2SubIdx may not be register coalescer friendly. + /// Therefore, the first call to this method returns: + /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+ /// (TrackReg, TrackSubReg) = (dst, subIdx). + /// + /// Subsequent calls will return false. + bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) override { + // If we have already returned the only source we can rewrite, return false. + if (CurrentSrcIdx == 2) + return false; + // We are looking at v2 = INSERT_SUBREG v0, v1, sub0. + CurrentSrcIdx = 2; + const MachineOperand &MOInsertedReg = CopyLike.getOperand(2); + SrcReg = MOInsertedReg.getReg(); + SrcSubReg = MOInsertedReg.getSubReg(); + const MachineOperand &MODef = CopyLike.getOperand(0); + + // We want to track something that is compatible with the + // partial definition. + TrackReg = MODef.getReg(); + if (MODef.getSubReg()) + // Bail out if we have to compose sub-register indices. + return false; + TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm(); + return true; + } + bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override { + if (CurrentSrcIdx != 2) + return false; + // We are rewriting the inserted reg. + MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); + MO.setReg(NewReg); + MO.setSubReg(NewSubReg); + return true; + } +}; + +/// \brief Specialized rewriter for EXTRACT_SUBREG instruction. +class ExtractSubregRewriter : public CopyRewriter { + const TargetInstrInfo &TII; + +public: + ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII) + : CopyRewriter(MI), TII(TII) { + assert(MI.isExtractSubreg() && "Invalid instruction"); + } + + /// \brief See CopyRewriter::getNextRewritableSource. + /// Here CopyLike has the following form: + /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx. + /// There is only one rewritable source: Src.subIdx, + /// which defines dst.dstSubIdx. + bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) override { + // If we have already returned the only source we can rewrite, return false. + if (CurrentSrcIdx == 1) + return false; + // We are looking at v1 = EXTRACT_SUBREG v0, sub0. + CurrentSrcIdx = 1; + const MachineOperand &MOExtractedReg = CopyLike.getOperand(1); + SrcReg = MOExtractedReg.getReg(); + // If we have to compose sub-register indices, bail out. + if (MOExtractedReg.getSubReg()) + return false; + + SrcSubReg = CopyLike.getOperand(2).getImm(); + + // We want to track something that is compatible with the definition. + const MachineOperand &MODef = CopyLike.getOperand(0); + TrackReg = MODef.getReg(); + TrackSubReg = MODef.getSubReg(); + return true; + } + + bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override { + // The only source we can rewrite is the input register. + if (CurrentSrcIdx != 1) + return false; + + CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg); + + // If we find a source that does not require extracting anything, + // rewrite the operation with a copy. + if (!NewSubReg) { + // Move the current index to an invalid position. + // We do not want another call to this method to be able + // to make any change. + CurrentSrcIdx = -1; + // Rewrite the operation as a COPY. + // Get rid of the sub-register index. + CopyLike.RemoveOperand(2); + // Morph the operation into a COPY. + CopyLike.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } + CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg); + return true; + } +}; + +/// \brief Specialized rewriter for REG_SEQUENCE instruction.
+class RegSequenceRewriter : public CopyRewriter { +public: + RegSequenceRewriter(MachineInstr &MI) : CopyRewriter(MI) { + assert(MI.isRegSequence() && "Invalid instruction"); + } + + /// \brief See CopyRewriter::getNextRewritableSource. + /// Here CopyLike has the following form: + /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2. + /// Each call will return a different source, walking all the available + /// sources. + /// + /// The first call returns: + /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx). + /// (TrackReg, TrackSubReg) = (dst, subIdx1). + /// + /// The second call returns: + /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx). + /// (TrackReg, TrackSubReg) = (dst, subIdx2). + /// + /// And so on, until all the sources have been traversed, then + /// it returns false. + bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) override { + // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc. + + // If this is the first call, move to the first argument. + if (CurrentSrcIdx == 0) { + CurrentSrcIdx = 1; + } else { + // Otherwise, move to the next argument and check that it is valid. + CurrentSrcIdx += 2; + if (CurrentSrcIdx >= CopyLike.getNumOperands()) + return false; + } + const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); + SrcReg = MOInsertedReg.getReg(); + // If we have to compose sub-register indices, bail out. + if ((SrcSubReg = MOInsertedReg.getSubReg())) + return false; + + // We want to track something that is compatible with the related + // partial definition. + TrackSubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm(); + + const MachineOperand &MODef = CopyLike.getOperand(0); + TrackReg = MODef.getReg(); + // If we have to compose sub-registers, bail out. + return MODef.getSubReg() == 0; + } + + bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override { + // We cannot rewrite out-of-bounds operands. + // Moreover, rewritable sources are at odd positions. + if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands()) + return false; + + MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); + MO.setReg(NewReg); + MO.setSubReg(NewSubReg); + return true; + } +}; +} // End namespace. + +/// \brief Get the appropriate CopyRewriter for \p MI. +/// \return A pointer to a dynamically allocated CopyRewriter or nullptr +/// if no rewriter works for \p MI. +static CopyRewriter *getCopyRewriter(MachineInstr &MI, + const TargetInstrInfo &TII) { + switch (MI.getOpcode()) { + default: + return nullptr; + case TargetOpcode::COPY: + return new CopyRewriter(MI); + case TargetOpcode::INSERT_SUBREG: + return new InsertSubregRewriter(MI); + case TargetOpcode::EXTRACT_SUBREG: + return new ExtractSubregRewriter(MI, TII); + case TargetOpcode::REG_SEQUENCE: + return new RegSequenceRewriter(MI); + } + llvm_unreachable(nullptr); +} + +/// \brief Optimize generic copy instructions to avoid cross +/// register bank copy. The optimization looks through a chain of +/// copies and tries to find a source that has a compatible register +/// class. +/// Two register classes are considered to be compatible if they share +/// the same register bank. +/// New copies issued by this optimization are register allocator +/// friendly. This optimization does not remove any copy as it may +/// overconstrain the register allocator, but replaces some operands +/// when possible. +/// \pre isCoalescableCopy(*MI) is true. +/// \return True when \p MI has been rewritten.
+/// False otherwise.
+bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) {
+  assert(MI && isCoalescableCopy(*MI) && "Invalid argument");
+  assert(MI->getDesc().getNumDefs() == 1 &&
+         "Coalescer can understand multiple defs?!");
+  const MachineOperand &MODef = MI->getOperand(0);
+  // Do not rewrite physical definitions.
+  if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
     return false;
 
-  // Rewrite the copy to avoid a cross register bank penalty.
-  unsigned NewVR = TargetRegisterInfo::isPhysicalRegister(Def) ? Def :
-    MRI->createVirtualRegister(DefRC);
-  MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                                  TII->get(TargetOpcode::COPY), NewVR)
-    .addReg(Src, 0, SrcSubReg);
-  NewCopy->getOperand(0).setSubReg(DefSubReg);
-
-  MRI->replaceRegWith(Def, NewVR);
-  MRI->clearKillFlags(NewVR);
-  // We extended the lifetime of Src.
-  // Clear the kill flags to account for that.
-  MRI->clearKillFlags(Src);
+  bool Changed = false;
+  // Get the right rewriter for the current copy.
+  std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII));
+  // If none exists, bail out.
+  if (!CpyRewriter)
+    return false;
+  // Rewrite each rewritable source.
+  unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg;
+  while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg,
+                                              TrackSubReg)) {
+    unsigned NewSrc = TrackReg;
+    unsigned NewSubReg = TrackSubReg;
+    // Try to find a more suitable source.
+    // If we fail to do so, or we get the original source back,
+    // move on to the next source.
+    if (!findNextSource(NewSrc, NewSubReg) || SrcReg == NewSrc)
+      continue;
+    // Rewrite source.
+    if (CpyRewriter->RewriteCurrentSource(NewSrc, NewSubReg)) {
+      // We may have extended the live-range of NewSrc, account for that.
+      MRI->clearKillFlags(NewSrc);
+      Changed = true;
+    }
+  }
+  // TODO: We could have a clean-up method to tidy the instruction.
+  // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0
+  //    => v0 = COPY v1
+  // Currently we have not seen a motivating example for that and we
+  // want to avoid untested code.
+  NumRewrittenCopies += Changed == true;
+  return Changed;
+}
+
+/// \brief Optimize copy-like instructions to create
+/// register-coalescer-friendly instructions.
+/// The optimization tries to kill off \p MI by looking
+/// through a chain of copies to find a source that has a compatible
+/// register class.
+/// If such a source is found, it replaces \p MI with a generic COPY
+/// operation.
+/// \pre isUncoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+/// All COPY instructions created are inserted into \p LocalMIs.
+bool PeepholeOptimizer::optimizeUncoalescableCopy(
+    MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+  assert(MI && isUncoalescableCopy(*MI) && "Invalid argument");
+
+  // Check if we can rewrite all the values defined by this instruction.
+  SmallVector<
+      std::pair<TargetInstrInfo::RegSubRegPair, TargetInstrInfo::RegSubRegPair>,
+      4> RewritePairs;
+  for (const MachineOperand &MODef : MI->defs()) {
+    if (MODef.isDead())
+      // We can ignore those.
+      continue;
+
+    // If a physical register is here, this is probably for a good reason.
+    // Do not rewrite that.
+    if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
+      return false;
+
+    // If we do not know how to rewrite this definition, there is no point
+    // in trying to kill this instruction.
+ TargetInstrInfo::RegSubRegPair Def(MODef.getReg(), MODef.getSubReg()); + TargetInstrInfo::RegSubRegPair Src = Def; + if (!findNextSource(Src.Reg, Src.SubReg)) + return false; + RewritePairs.push_back(std::make_pair(Def, Src)); + } + // The change is possible for all defs, do it. + for (const auto &PairDefSrc : RewritePairs) { + const auto &Def = PairDefSrc.first; + const auto &Src = PairDefSrc.second; + // Rewrite the "copy" in a way the register coalescer understands. + assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && + "We do not rewrite physical registers"); + const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg); + unsigned NewVR = MRI->createVirtualRegister(DefRC); + MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), + NewVR).addReg(Src.Reg, 0, Src.SubReg); + NewCopy->getOperand(0).setSubReg(Def.SubReg); + if (Def.SubReg) + NewCopy->getOperand(0).setIsUndef(); + LocalMIs.insert(NewCopy); + MRI->replaceRegWith(Def.Reg, NewVR); + MRI->clearKillFlags(NewVR); + // We extended the lifetime of Src. + // Clear the kill flags to account for that. + MRI->clearKillFlags(Src.Reg); + } + // MI is now dead. MI->eraseFromParent(); - ++NumCopiesBitcasts; + ++NumUncoalescableCopies; return true; } @@ -673,8 +1063,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (DisablePeephole) return false; - TM = &MF.getTarget(); - TII = TM->getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr; @@ -684,7 +1074,14 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *MBB = &*I; bool SeenMoveImm = false; - SmallPtrSet<MachineInstr*, 8> LocalMIs; + + // During this forward scan, at some point it needs to answer the question + // "given a pointer to an MI in the current BB, is it located before or + // after the current instruction". + // To perform this, the following set keeps track of the MIs already seen + // during the scan, if a MI is not in the set, it is assumed to be located + // after. Newly created MIs have to be inserted in the set as well. + SmallPtrSet<MachineInstr*, 16> LocalMIs; SmallSet<unsigned, 4> ImmDefRegs; DenseMap<unsigned, MachineInstr*> ImmDefMIs; SmallSet<unsigned, 16> FoldAsLoadDefCandidates; @@ -711,15 +1108,27 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->mayStore() || MI->isCall()) FoldAsLoadDefCandidates.clear(); - if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) || + if ((isUncoalescableCopy(*MI) && + optimizeUncoalescableCopy(MI, LocalMIs)) || (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || - (MI->isSelect() && optimizeSelect(MI))) { + (MI->isSelect() && optimizeSelect(MI, LocalMIs))) { // MI is deleted. LocalMIs.erase(MI); Changed = true; continue; } + if (MI->isConditionalBranch() && optimizeCondBranch(MI)) { + Changed = true; + continue; + } + + if (isCoalescableCopy(*MI) && optimizeCoalescableCopy(MI)) { + // MI is just rewritten. 
+      Changed = true;
+      continue;
+    }
+
     if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
       SeenMoveImm = true;
     } else {
@@ -781,24 +1190,25 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
-bool ValueTracker::getNextSourceFromCopy(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg,
                                          unsigned &SrcSubReg) {
   assert(Def->isCopy() && "Invalid definition");
   // Copy instructions are supposed to be: Def = Src.
   // If someone breaks this assumption, bad things will happen everywhere.
-  assert(Def->getDesc().getNumOperands() == 2 && "Invalid number of operands");
+  assert(Def->getNumOperands() == 2 && "Invalid number of operands");
 
   if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
     // If we look for a different subreg, it means we want a subreg of src.
     // Bail out as we do not support composing subregs yet.
     return false;
   // Otherwise, we want the whole source.
-  SrcIdx = 1;
-  SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  const MachineOperand &Src = Def->getOperand(1);
+  SrcReg = Src.getReg();
+  SrcSubReg = Src.getSubReg();
   return true;
 }
 
-bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg,
                                             unsigned &SrcSubReg) {
   assert(Def->isBitcast() && "Invalid definition");
 
@@ -814,7 +1224,7 @@ bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx,
     // Bail out as we do not support composing subregs yet.
     return false;
 
-  SrcIdx = Def->getDesc().getNumOperands();
+  unsigned SrcIdx = Def->getNumOperands();
   for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
        ++OpIdx) {
     const MachineOperand &MO = Def->getOperand(OpIdx);
@@ -826,13 +1236,16 @@ bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx,
       return false;
     SrcIdx = OpIdx;
   }
-  SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  const MachineOperand &Src = Def->getOperand(SrcIdx);
+  SrcReg = Src.getReg();
+  SrcSubReg = Src.getSubReg();
   return true;
 }
 
-bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg,
                                                 unsigned &SrcSubReg) {
-  assert(Def->isRegSequence() && "Invalid definition");
+  assert((Def->isRegSequence() || Def->isRegSequenceLike()) &&
+         "Invalid definition");
 
   if (Def->getOperand(DefIdx).getSubReg())
     // If we are composing subreg, bail out.
    return false;
@@ -851,19 +1264,26 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx,
   // turn that into an assertion.
   return false;
 
+  if (!TII)
+    // We could handle the REG_SEQUENCE here, but we do not want to
+    // duplicate the code from the generic TII.
+    return false;
+
+  SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs;
+  if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs))
+    return false;
+
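In effect, getRegSequenceInputs flattens the REG_SEQUENCE (or a target-specific equivalent) into (Reg, SubReg, SubIdx) triples, and the loop below simply scans them for the lane being tracked. A standalone sketch of that lookup, with simplified types and hypothetical names (InputReg, findDefiningInput), not the LLVM API:

    #include <vector>

    // Simplified stand-in for TargetInstrInfo::RegSubRegPairAndIdx.
    struct InputReg {
      unsigned Reg;    // source register
      unsigned SubReg; // sub-register index on the source, 0 if none
      unsigned SubIdx; // lane of the sequence this input defines
    };

    // Pick the input that defines DefSubReg, refusing to compose
    // sub-register indices, exactly like the loop that follows.
    bool findDefiningInput(const std::vector<InputReg> &Inputs,
                           unsigned DefSubReg, unsigned &SrcReg,
                           unsigned &SrcSubReg) {
      for (const InputReg &I : Inputs) {
        if (I.SubIdx != DefSubReg)
          continue;
        if (I.SubReg) // would require composing sub-registers
          return false;
        SrcReg = I.Reg;
        SrcSubReg = I.SubReg; // always 0 here
        return true;
      }
      return false; // the lane is not defined by a single input
    }

   // We are looking at:
   // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
   // Check if one of the operands defines the subreg we are interested in.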
-  for (unsigned OpIdx = DefIdx + 1, EndOpIdx = Def->getNumOperands();
-       OpIdx != EndOpIdx; OpIdx += 2) {
-    const MachineOperand &MOSubIdx = Def->getOperand(OpIdx + 1);
-    assert(MOSubIdx.isImm() &&
-           "One of the subindex of the reg_sequence is not an immediate");
-    if (MOSubIdx.getImm() == DefSubReg) {
-      assert(Def->getOperand(OpIdx).isReg() &&
-             "One of the source of the reg_sequence is not a register");
-      SrcIdx = OpIdx;
-      SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  for (auto &RegSeqInput : RegSeqInputRegs) {
+    if (RegSeqInput.SubIdx == DefSubReg) {
+      if (RegSeqInput.SubReg)
+        // Bail out if we have to compose sub-registers.
+        return false;
+
+      SrcReg = RegSeqInput.Reg;
+      SrcSubReg = RegSeqInput.SubReg;
       return true;
     }
   }
@@ -874,61 +1294,68 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx,
   return false;
 }
 
-bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg,
                                                  unsigned &SrcSubReg) {
-  assert(Def->isInsertSubreg() && "Invalid definition");
+  assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) &&
+         "Invalid definition");
+
   if (Def->getOperand(DefIdx).getSubReg())
     // If we are composing subreg, bail out.
    // Same remark as getNextSourceFromRegSequence.
    // I.e., this may be turned into an assert.
     return false;
 
+  if (!TII)
+    // We could handle INSERT_SUBREG here, but we do not want to
+    // duplicate the code from the generic TII.
+    return false;
+
+  TargetInstrInfo::RegSubRegPair BaseReg;
+  TargetInstrInfo::RegSubRegPairAndIdx InsertedReg;
+  if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg))
+    return false;
+
   // We are looking at:
   // Def = INSERT_SUBREG v0, v1, sub1
   // There are two cases:
   // 1. DefSubReg == sub1, get v1.
   // 2. DefSubReg != sub1, the value may be available through v0.
 
-  // #1 Check if the inserted register matches the require sub index.
-  unsigned InsertedSubReg = Def->getOperand(3).getImm();
-  if (InsertedSubReg == DefSubReg) {
-    SrcIdx = 2;
-    SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  // #1 Check if the inserted register matches the required sub index.
+  if (InsertedReg.SubIdx == DefSubReg) {
+    SrcReg = InsertedReg.Reg;
+    SrcSubReg = InsertedReg.SubReg;
     return true;
   }
   // #2 Otherwise, if the sub register we are looking for is not partially
   // defined by the inserted element, we can look through the main
   // register (v0).
-  // To check the overlapping we need a MRI and a TRI.
-  if (!MRI)
-    return false;
-
   const MachineOperand &MODef = Def->getOperand(DefIdx);
-  const MachineOperand &MOBase = Def->getOperand(1);
   // If the result register (Def) and the base register (v0) do not
   // have the same register class or if we have to compose
   // subregisters, bail out.
-  if (MRI->getRegClass(MODef.getReg()) != MRI->getRegClass(MOBase.getReg()) ||
-      MOBase.getSubReg())
+  if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) ||
+      BaseReg.SubReg)
    return false;
 
-  // Get the TRI and check if inserted sub register overlaps with the
-  // sub register we are tracking.
-  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
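The overlap test that follows is a lane-mask intersection: every sub-register index maps to a bitmask of lanes of the full register, and the tracked value survives the insertion exactly when the two masks are disjoint. A toy illustration with made-up mask values (valueSurvivesInsert is a hypothetical name; the real masks come from TRI->getSubRegIndexLaneMask):

    #include <cstdint>

    // Hypothetical lane masks for two sub-register indices of the
    // same register.
    const uint64_t TrackedLanes  = 0x3; // e.g. lanes {0,1}
    const uint64_t InsertedLanes = 0xC; // e.g. lanes {2,3}

    // The value tracked through the base register is still intact if
    // the inserted lanes do not touch the lanes we are tracking.
    bool valueSurvivesInsert(uint64_t Tracked, uint64_t Inserted) {
      return (Tracked & Inserted) == 0; // true for the masks above
    }

+  // Get the TRI and check if the inserted sub-register overlaps with the
+  // sub-register we are tracking.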
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); if (!TRI || (TRI->getSubRegIndexLaneMask(DefSubReg) & - TRI->getSubRegIndexLaneMask(InsertedSubReg)) != 0) + TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) != 0) return false; // At this point, the value is available in v0 via the same subreg // we used for Def. - SrcIdx = 1; + SrcReg = BaseReg.Reg; SrcSubReg = DefSubReg; return true; } -bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcIdx, +bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcReg, unsigned &SrcSubReg) { - assert(Def->isExtractSubreg() && "Invalid definition"); + assert((Def->isExtractSubreg() || + Def->isExtractSubregLike()) && "Invalid definition"); // We are looking at: // Def = EXTRACT_SUBREG v0, sub0 @@ -937,17 +1364,26 @@ bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcIdx, if (DefSubReg) return false; + if (!TII) + // We could handle the EXTRACT_SUBREG here, but we do not want to + // duplicate the code from the generic TII. + return false; + + TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg; + if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg)) + return false; + // Bails if we have to compose sub registers. // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0. - if (Def->getOperand(1).getSubReg()) + if (ExtractSubregInputReg.SubReg) return false; // Otherwise, the value is available in the v0.sub0. - SrcIdx = 1; - SrcSubReg = Def->getOperand(2).getImm(); + SrcReg = ExtractSubregInputReg.Reg; + SrcSubReg = ExtractSubregInputReg.SubIdx; return true; } -bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcIdx, +bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcReg, unsigned &SrcSubReg) { assert(Def->isSubregToReg() && "Invalid definition"); // We are looking at: @@ -964,37 +1400,37 @@ bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcIdx, if (Def->getOperand(2).getSubReg()) return false; - SrcIdx = 2; + SrcReg = Def->getOperand(2).getReg(); SrcSubReg = Def->getOperand(3).getImm(); return true; } -bool ValueTracker::getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg) { +bool ValueTracker::getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg) { assert(Def && "This method needs a valid definition"); assert( (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) && Def->getOperand(DefIdx).isDef() && "Invalid DefIdx"); if (Def->isCopy()) - return getNextSourceFromCopy(SrcIdx, SrcSubReg); + return getNextSourceFromCopy(SrcReg, SrcSubReg); if (Def->isBitcast()) - return getNextSourceFromBitcast(SrcIdx, SrcSubReg); + return getNextSourceFromBitcast(SrcReg, SrcSubReg); // All the remaining cases involve "complex" instructions. // Bails if we did not ask for the advanced tracking. 
if (!UseAdvancedTracking) return false; - if (Def->isRegSequence()) - return getNextSourceFromRegSequence(SrcIdx, SrcSubReg); - if (Def->isInsertSubreg()) - return getNextSourceFromInsertSubreg(SrcIdx, SrcSubReg); - if (Def->isExtractSubreg()) - return getNextSourceFromExtractSubreg(SrcIdx, SrcSubReg); + if (Def->isRegSequence() || Def->isRegSequenceLike()) + return getNextSourceFromRegSequence(SrcReg, SrcSubReg); + if (Def->isInsertSubreg() || Def->isInsertSubregLike()) + return getNextSourceFromInsertSubreg(SrcReg, SrcSubReg); + if (Def->isExtractSubreg() || Def->isExtractSubregLike()) + return getNextSourceFromExtractSubreg(SrcReg, SrcSubReg); if (Def->isSubregToReg()) - return getNextSourceFromSubregToReg(SrcIdx, SrcSubReg); + return getNextSourceFromSubregToReg(SrcReg, SrcSubReg); return false; } -const MachineInstr *ValueTracker::getNextSource(unsigned &SrcIdx, +const MachineInstr *ValueTracker::getNextSource(unsigned &SrcReg, unsigned &SrcSubReg) { // If we reach a point where we cannot move up in the use-def chain, // there is nothing we can get. @@ -1003,20 +1439,18 @@ const MachineInstr *ValueTracker::getNextSource(unsigned &SrcIdx, const MachineInstr *PrevDef = nullptr; // Try to find the next source. - if (getNextSourceImpl(SrcIdx, SrcSubReg)) { + if (getNextSourceImpl(SrcReg, SrcSubReg)) { // Update definition, definition index, and subregister for the // next call of getNextSource. - const MachineOperand &MO = Def->getOperand(SrcIdx); - assert(MO.isReg() && !MO.isDef() && "Source is invalid"); // Update the current register. - Reg = MO.getReg(); + Reg = SrcReg; // Update the return value before moving up in the use-def chain. PrevDef = Def; // If we can still move up in the use-def chain, move to the next // defintion. if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { - Def = MRI->getVRegDef(Reg); - DefIdx = MRI->def_begin(Reg).getOperandNo(); + Def = MRI.getVRegDef(Reg); + DefIdx = MRI.def_begin(Reg).getOperandNo(); DefSubReg = SrcSubReg; return PrevDef; } diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index a1ab3445fa4f..89e1d113ff70 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -41,7 +41,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -98,7 +97,7 @@ namespace { } bool runOnMachineFunction(MachineFunction &Fn) override; - + bool enablePostRAScheduler( const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode &Mode, @@ -137,10 +136,10 @@ namespace { public: SchedulePostRATDList( - MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - AliasAnalysis *AA, const RegisterClassInfo&, - TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, - SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs); + MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA, + const RegisterClassInfo &, + TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, + SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs); ~SchedulePostRATDList(); @@ -193,16 +192,17 @@ INITIALIZE_PASS(PostRAScheduler, "post-RA-sched", "Post RA top-down list latency scheduler", false, false) SchedulePostRATDList::SchedulePostRATDList( - MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, - AliasAnalysis *AA, const 
RegisterClassInfo &RCI, - TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, - SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs) - : ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA), EndIndex(0) { - - const TargetMachine &TM = MF.getTarget(); - const InstrItineraryData *InstrItins = TM.getInstrItineraryData(); + MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA, + const RegisterClassInfo &RCI, + TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, + SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs) + : ScheduleDAGInstrs(MF, &MLI, /*IsPostRA=*/true), AA(AA), EndIndex(0) { + + const InstrItineraryData *InstrItins = + MF.getSubtarget().getInstrItineraryData(); HazardRec = - TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this); + MF.getSubtarget().getInstrInfo()->CreateTargetPostRAHazardRecognizer( + InstrItins, this); assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE || MRI.tracksLiveness()) && @@ -265,9 +265,8 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { if (skipOptnoneFunction(*Fn.getFunction())) return false; - TII = Fn.getTarget().getInstrInfo(); + TII = Fn.getSubtarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); @@ -301,7 +300,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { DEBUG(dbgs() << "PostRAScheduler\n"); - SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, RegClassInfo, AntiDepMode, + SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode, CriticalPathRCs); // Loop over all of the basic blocks @@ -560,10 +559,10 @@ void SchedulePostRATDList::ListScheduleTopDown() { if (HT == ScheduleHazardRecognizer::NoHazard) { if (HazardRec->ShouldPreferAnother(CurSUnit)) { if (!NotPreferredSUnit) { - // If this is the first non-preferred node for this cycle, then - // record it and continue searching for a preferred node. If this - // is not the first non-preferred node, then treat it as though - // there had been a hazard. + // If this is the first non-preferred node for this cycle, then + // record it and continue searching for a preferred node. If this + // is not the first non-preferred node, then treat it as though + // there had been a hazard. 
NotPreferredSUnit = CurSUnit; continue; } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 31299272a7a2..b1538006e724 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -138,8 +139,8 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form."); assert(WorkList.empty() && "Inconsistent worklist state"); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index b98d210e9d57..385e5a35afba 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -41,6 +41,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <climits> using namespace llvm; @@ -110,8 +111,8 @@ typedef SmallSetVector<int, 8> StackObjSet; /// bool PEI::runOnMachineFunction(MachineFunction &Fn) { const Function* F = Fn.getFunction(); - const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo(); - const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); @@ -185,8 +186,8 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { /// variables for the function's frame information and eliminate call frame /// pseudo instructions. void PEI::calculateCallsInformation(MachineFunction &Fn) { - const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo(); - const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering(); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); MachineFrameInfo *MFI = Fn.getFrameInfo(); unsigned MaxCallFrameSize = 0; @@ -239,8 +240,8 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) { /// calculateCalleeSavedRegisters - Scan the function for modified callee saved /// registers. void PEI::calculateCalleeSavedRegisters(MachineFunction &F) { - const TargetRegisterInfo *RegInfo = F.getTarget().getRegisterInfo(); - const TargetFrameLowering *TFI = F.getTarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering(); MachineFrameInfo *MFI = F.getFrameInfo(); // Get the callee saved register list... 
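The PrologEpilogInserter hunks above and below repeat the mechanical migration seen throughout this commit: the per-function TargetSubtargetInfo replaces the TargetMachine as the place to fetch the instruction, register, and frame-lowering tables. Schematically (both calls appear verbatim in the hunks; the second variable name is only for illustration):

    // Before: one set of tables per TargetMachine.
    const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
    // After: tables come from the subtarget selected for this function,
    // so per-function attributes such as CPU features are honored.
    const TargetFrameLowering *STI_TFI = Fn.getSubtarget().getFrameLowering();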
@@ -337,9 +338,9 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { if (CSI.empty()) return; - const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo(); - const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering(); - const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo(); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); MachineBasicBlock::iterator I; // Spill using target interface. @@ -445,7 +446,7 @@ AssignProtectedObjSet(const StackObjSet &UnassignedObjs, /// abstract stack objects. /// void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { - const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); StackProtector *SP = &getAnalysis<StackProtector>(); bool StackGrowsDown = @@ -515,7 +516,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { // Make sure the special register scavenging spill slot is closest to the // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. - const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo(); + const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo(); bool EarlyScavengingSlots = (TFI.hasFP(Fn) && TFI.isFPCloseToIncomingSP() && RegInfo->useFPForScavengingIndex(Fn) && @@ -670,7 +671,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { /// prolog and epilog code to the function. /// void PEI::insertPrologEpilogCode(MachineFunction &Fn) { - const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); // Add prologue to the function... TFI.emitPrologue(Fn); @@ -710,8 +711,7 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { SmallPtrSet<MachineBasicBlock*, 8> Reachable; // Iterate over the reachable blocks in DFS order. - for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> > - DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); + for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); DFI != DFE; ++DFI) { int SPAdj = 0; // Check the exit state of the DFS stack predecessor. 
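The replaceFrameIndices traversal below threads an SPAdj value through the CFG: each block is visited depth-first and starts from the adjustment recorded when its DFS stack predecessor was left. A simplified standalone model of that bookkeeping (hypothetical Block/walk names; the real code reads the exit state of the DFS stack predecessor via df_ext_iterator rather than recursing):

    #include <set>
    #include <vector>

    struct Block {
      std::vector<int> SPDeltas;       // per-instruction SP adjustments
      std::vector<Block *> Successors;
    };

    // Visit reachable blocks depth-first, handing each successor the
    // stack adjustment this block exits with.
    void walk(Block *B, int SPAdjOnEntry, std::set<Block *> &Reachable) {
      if (!Reachable.insert(B).second)
        return;
      int SPAdj = SPAdjOnEntry;
      for (int Delta : B->SPDeltas)
        SPAdj += Delta; // cf. SPAdj += TII.getSPAdjust(I) below
      for (Block *Succ : B->Successors)
        walk(Succ, SPAdj, Reachable);
    }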
@@ -738,31 +738,24 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, int &SPAdj) { - const TargetMachine &TM = Fn.getTarget(); - assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!"); - const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo(); - const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - bool StackGrowsDown = - TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + assert(Fn.getSubtarget().getRegisterInfo() && + "getRegisterInfo() must be implemented!"); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); int FrameSetupOpcode = TII.getCallFrameSetupOpcode(); int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); + bool InsideCallSequence = false; + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { if (I->getOpcode() == FrameSetupOpcode || I->getOpcode() == FrameDestroyOpcode) { - // Remember how much SP has been adjusted to create the call - // frame. - int Size = I->getOperand(0).getImm(); - - if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) || - (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode)) - Size = -Size; - - SPAdj += Size; + InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + SPAdj += TII.getSPAdjust(I); MachineBasicBlock::iterator PrevI = BB->end(); if (I != BB->begin()) PrevI = std::prev(I); @@ -776,6 +769,13 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, continue; } + // If we are looking at a call sequence, we need to keep track of + // the SP adjustment made by each instruction in the sequence. + // This includes both the frame setup/destroy pseudos (handled above), + // as well as other instructions that have side effects w.r.t the SP. + if (InsideCallSequence) + SPAdj += TII.getSPAdjust(I); + MachineInstr *MI = I; bool DoIncr = true; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -797,6 +797,37 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, continue; } + // TODO: This code should be commoned with the code for + // PATCHPOINT. There's no good reason for the difference in + // implementation other than historical accident. The only + // remaining difference is the unconditional use of the stack + // pointer as the base register. + if (MI->getOpcode() == TargetOpcode::STATEPOINT) { + assert((!MI->isDebugValue() || i == 0) && + "Frame indicies can only appear as the first operand of a " + "DBG_VALUE machine instruction"); + unsigned Reg; + MachineOperand &Offset = MI->getOperand(i + 1); + const unsigned refOffset = + TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(), + Reg); + + Offset.setImm(Offset.getImm() + refOffset); + MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/); + continue; + } + + // Frame allocations are target independent. Simply swap the index with + // the offset. + if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) { + assert(TFI->hasFP(Fn) && "frame alloc requires FP"); + MachineOperand &FI = MI->getOperand(i); + unsigned Reg; + int FrameOffset = TFI->getFrameIndexReference(Fn, FI.getIndex(), Reg); + FI.ChangeToImmediate(FrameOffset); + continue; + } + // Some instructions (e.g. 
inline asm instructions) can have // multiple frame indices and/or cause eliminateFrameIndex // to insert more than one instruction. We need the register @@ -837,7 +868,8 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, /// FIXME: Iterating over the instruction stream is unnecessary. We can simply /// iterate over the vreg use list, which at this point only contains machine /// operands for which eliminateFrameIndex need a new scratch reg. -void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { +void +PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Run through the instructions and find any virtual registers. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { @@ -888,12 +920,16 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); + MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); + + // Make sure MRI now accounts this register as used. + MRI.setPhysRegUsed(ScratchReg); // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. - RS->setUsed(ScratchReg); + RS->setRegUsed(ScratchReg); } } diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h index 5a6d39a04a22..f88b8ef2b660 100644 --- a/lib/CodeGen/PrologEpilogInserter.h +++ b/lib/CodeGen/PrologEpilogInserter.h @@ -16,8 +16,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_PEI_H -#define LLVM_CODEGEN_PEI_H +#ifndef LLVM_LIB_CODEGEN_PROLOGEPILOGINSERTER_H +#define LLVM_LIB_CODEGEN_PROLOGEPILOGINSERTER_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SparseBitVector.h" diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp index 12b2c902f8d3..b1c341d3a681 100644 --- a/lib/CodeGen/PseudoSourceValue.cpp +++ b/lib/CodeGen/PseudoSourceValue.cpp @@ -107,13 +107,9 @@ bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{ } bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const { - // Negative frame indices are used for special things that don't - // appear in LLVM IR. Non-negative indices may be used for things - // like static allocas. if (!MFI) - return FI >= 0; - // Spill slots should not alias others. - return !MFI->isFixedObjectIndex(FI) && !MFI->isSpillSlotObjectIndex(FI); + return true; + return MFI->isAliasedObjectIndex(FI); } bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const { diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index 894aee7a9917..6b346f4536cf 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #ifndef NDEBUG #include "llvm/ADT/SparseBitVector.h" @@ -91,6 +90,7 @@ void RegAllocBase::allocatePhysRegs() { // Unused registers can appear when the spiller coalesces snippets. 
if (MRI->reg_nodbg_empty(VirtReg->reg)) { DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n'); + aboutToRemoveInterval(*VirtReg); LIS->removeInterval(VirtReg->reg); continue; } @@ -102,7 +102,7 @@ void RegAllocBase::allocatePhysRegs() { // register if possible and populate a list of new live intervals that // result from splitting. DEBUG(dbgs() << "\nselectOrSplit " - << MRI->getRegClass(VirtReg->reg)->getName() + << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg)) << ':' << *VirtReg << " w=" << VirtReg->weight << '\n'); typedef SmallVector<unsigned, 4> VirtRegVec; VirtRegVec SplitVRegs; @@ -140,6 +140,7 @@ void RegAllocBase::allocatePhysRegs() { assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned"); if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) { DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n'); + aboutToRemoveInterval(*SplitVirtReg); LIS->removeInterval(SplitVirtReg->reg); continue; } diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index b333c36bff92..659b8f505a22 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -34,8 +34,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_REGALLOCBASE -#define LLVM_CODEGEN_REGALLOCBASE +#ifndef LLVM_LIB_CODEGEN_REGALLOCBASE_H +#define LLVM_LIB_CODEGEN_REGALLOCBASE_H #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -96,6 +96,9 @@ protected: // Use this group name for NamedRegionTimer. static const char TimerGroupName[]; + /// Method called when the allocator is about to remove a LiveInterval. + virtual void aboutToRemoveInterval(LiveInterval &LI) {} + public: /// VerifyEnabled - True when -verify-regalloc is given. static bool VerifyEnabled; @@ -106,4 +109,4 @@ private: } // end namespace llvm -#endif // !defined(LLVM_CODEGEN_REGALLOCBASE) +#endif diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 6bc678e85211..0090332a8123 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -33,7 +33,6 @@ #include "llvm/PassAnalysisSupport.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include <cstdlib> #include <queue> diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 97b9f7650892..c62141400267 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -33,7 +33,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -53,7 +53,6 @@ namespace { RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1), isBulkSpilling(false) {} private: - const TargetMachine *TM; MachineFunction *MF; MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; @@ -298,7 +297,8 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveDbgValueMap[LRI->VirtReg]; for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) { MachineInstr *DBG = LRIDbgValues[li]; - const MDNode *MDPtr = DBG->getOperand(2).getMetadata(); + const MDNode *Var = DBG->getDebugVariable(); + const MDNode *Expr = DBG->getDebugExpression(); bool IsIndirect = DBG->isIndirectDebugValue(); uint64_t Offset = IsIndirect ? 
DBG->getOperand(1).getImm() : 0; DebugLoc DL; @@ -308,10 +308,13 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, DL = (--EI)->getDebugLoc(); } else DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = DBG->getParent(); MachineInstr *NewDV = BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(FI).addImm(Offset).addMetadata(MDPtr); + .addFrameIndex(FI) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); + assert(NewDV->getParent() == MBB && "dangling parent pointer"); (void)NewDV; DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV); } @@ -369,15 +372,23 @@ void RAFast::usePhysReg(MachineOperand &MO) { case regDisabled: break; case regReserved: - assert(TRI->isSuperRegister(PhysReg, Alias) && + // Either PhysReg is a subregister of Alias and we mark the + // whole register as free, or PhysReg is the superregister of + // Alias and we mark all the aliases as disabled before freeing + // PhysReg. + // In the latter case, since PhysReg was disabled, this means that + // its value is defined only by physical sub-registers. This check + // is performed by the assert of the default case in this loop. + // Note: The value of the superregister may only be partial + // defined, that is why regDisabled is a valid state for aliases. + assert((TRI->isSuperRegister(PhysReg, Alias) || + TRI->isSuperRegister(Alias, PhysReg)) && "Instruction is not using a subregister of a reserved register"); - // Leave the superregister in the working set. - PhysRegState[Alias] = regFree; - MO.getParent()->addRegisterKilled(Alias, TRI, true); - return; + // Fall through. case regFree: if (TRI->isSuperRegister(PhysReg, Alias)) { // Leave the superregister in the working set. + PhysRegState[Alias] = regFree; MO.getParent()->addRegisterKilled(Alias, TRI, true); return; } @@ -545,7 +556,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI, } DEBUG(dbgs() << "Allocating " << PrintReg(VirtReg) << " from " - << RC->getName() << "\n"); + << TRI->getRegClassName(RC) << "\n"); unsigned BestReg = 0, BestCost = spillImpossible; for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){ @@ -705,7 +716,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI, continue; if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) || (MO.getSubReg() && MI->readsVirtualRegister(Reg))) { - if (ThroughRegs.insert(Reg)) + if (ThroughRegs.insert(Reg).second) DEBUG(dbgs() << ' ' << PrintReg(Reg)); } } @@ -862,13 +873,16 @@ void RAFast::AllocateBasicBlock() { // Modify DBG_VALUE now that the value is in a spill slot. bool IsIndirect = MI->isIndirectDebugValue(); uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0; - const MDNode *MDPtr = - MI->getOperand(MI->getNumOperands()-1).getMetadata(); + const MDNode *Var = MI->getDebugVariable(); + const MDNode *Expr = MI->getDebugExpression(); DebugLoc DL = MI->getDebugLoc(); MachineBasicBlock *MBB = MI->getParent(); MachineInstr *NewDV = BuildMI(*MBB, MBB->erase(MI), DL, TII->get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(SS).addImm(Offset).addMetadata(MDPtr); + .addFrameIndex(SS) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *NewDV); // Scan NewDV operands from the beginning. @@ -1017,8 +1031,7 @@ void RAFast::AllocateBasicBlock() { if (TargetRegisterInfo::isPhysicalRegister(Reg)) { if (!MRI->isAllocatable(Reg)) continue; - definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ? 
- regFree : regReserved); + definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved); continue; } LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc); @@ -1070,9 +1083,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { << "********** Function: " << Fn.getName() << '\n'); MF = &Fn; MRI = &MF->getRegInfo(); - TM = &Fn.getTarget(); - TRI = TM->getRegisterInfo(); - TII = TM->getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + TII = MF->getSubtarget().getInstrInfo(); MRI->freezeReservedRegs(Fn); RegClassInfo.runOnMachineFunction(Fn); UsedInInstr.clear(); @@ -1093,9 +1105,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { } // Add the clobber lists for all the instructions we skipped earlier. - for (SmallPtrSet<const MCInstrDesc*, 4>::const_iterator - I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I) - if (const uint16_t *Defs = (*I)->getImplicitDefs()) + for (const MCInstrDesc *Desc : SkippedInstrs) + if (const uint16_t *Defs = Desc->getImplicitDefs()) while (*Defs) MRI->setPhysRegUsed(*Defs++); diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index dee990c2eb5f..edc329499c42 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -296,6 +296,9 @@ class RAGreedy : public MachineFunctionPass, /// obtained from the TargetSubtargetInfo. bool EnableLocalReassign; + /// Set of broken hints that may be reconciled later because of eviction. + SmallSetVector<LiveInterval *, 8> SetOfBrokenHints; + public: RAGreedy(); @@ -311,6 +314,7 @@ public: void enqueue(LiveInterval *LI) override; LiveInterval *dequeue() override; unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override; + void aboutToRemoveInterval(LiveInterval &) override; /// Perform register allocation. bool runOnMachineFunction(MachineFunction &mf) override; @@ -378,6 +382,24 @@ private: SmallVirtRegSet &, unsigned); bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &, SmallVirtRegSet &, unsigned); + void tryHintRecoloring(LiveInterval &); + void tryHintsRecoloring(); + + /// Model the information carried by one end of a copy. + struct HintInfo { + /// The frequency of the copy. + BlockFrequency Freq; + /// The virtual register or physical register. + unsigned Reg; + /// Its currently assigned register. + /// In case of a physical register Reg == PhysReg. + unsigned PhysReg; + HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg) + : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {} + }; + typedef SmallVector<HintInfo, 4> HintsInfo; + BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned); + void collectHintInfo(unsigned, HintsInfo &); }; } // end anonymous namespace @@ -453,7 +475,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { if (VRM->hasPhys(VirtReg)) { - Matrix->unassign(LIS->getInterval(VirtReg)); + LiveInterval &LI = LIS->getInterval(VirtReg); + Matrix->unassign(LI); + aboutToRemoveInterval(LI); return true; } // Unassigned virtreg is probably in the priority queue. @@ -514,7 +538,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. 
bool ReverseLocal = TRI->reverseLocalAssignment(); - bool ForceGlobal = !ReverseLocal && TRI->mayOverrideLocalAssignment() && + bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * MRI->getRegClass(Reg)->getNumRegs()); if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() && @@ -817,7 +841,7 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg); unsigned MinCost = RegClassInfo.getMinCost(RC); if (MinCost >= CostPerUseLimit) { - DEBUG(dbgs() << RC->getName() << " minimum cost = " << MinCost + DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " << MinCost << ", no cheaper registers to be found.\n"); return 0; } @@ -967,14 +991,12 @@ void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf, BCS[B].Exit = SpillPlacement::PrefSpill; if (++B == GroupSize) { - ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B); - SpillPlacer->addConstraints(Array); + SpillPlacer->addConstraints(makeArrayRef(BCS, B)); B = 0; } } - ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B); - SpillPlacer->addConstraints(Array); + SpillPlacer->addConstraints(makeArrayRef(BCS, B)); SpillPlacer->addLinks(makeArrayRef(TBS, T)); } @@ -1013,7 +1035,7 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) { // Compute through constraints from the interference, or assume that all // through blocks prefer spilling when forming compact regions. - ArrayRef<unsigned> NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo); + auto NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo); if (Cand.PhysReg) addThroughConstraints(Cand.Intf, NewBlocks); else @@ -1791,9 +1813,11 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // instructions. // // Try to guess the size of the new interval. - const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1), - Uses[SplitBefore].distance(Uses[SplitAfter]) + - (LiveBefore + LiveAfter)*SlotIndex::InstrDist); + const float EstWeight = normalizeSpillWeight( + blockFreq * (NewGaps + 1), + Uses[SplitBefore].distance(Uses[SplitAfter]) + + (LiveBefore + LiveAfter) * SlotIndex::InstrDist, + 1); // Would this split be possible to allocate? // Never allocate all gaps, we wouldn't be making progress. DEBUG(dbgs() << " w=" << EstWeight); @@ -2213,6 +2237,11 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, return PhysReg; } +void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) { + // Do not keep invalid information around. + SetOfBrokenHints.remove(&LI); +} + void RAGreedy::initializeCSRCost() { // We use the larger one out of the command-line option and the value report // by TRI. @@ -2238,6 +2267,170 @@ void RAGreedy::initializeCSRCost() { CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry); } +/// \brief Collect the hint info for \p Reg. +/// The results are stored into \p Out. +/// \p Out is not cleared before being populated. +void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) { + for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { + if (!Instr.isFullCopy()) + continue; + // Look for the other end of the copy. + unsigned OtherReg = Instr.getOperand(0).getReg(); + if (OtherReg == Reg) { + OtherReg = Instr.getOperand(1).getReg(); + if (OtherReg == Reg) + continue; + } + // Get the current assignment. + unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg) + ? OtherReg + : VRM->getPhys(OtherReg); + // Push the collected information. 
+    Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+                           OtherPhysReg));
+  }
+}
+
+/// \brief Using the given \p List, compute the cost of the broken hints if
+/// \p PhysReg was used.
+/// \return The cost of \p List for \p PhysReg.
+BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
+                                           unsigned PhysReg) {
+  BlockFrequency Cost = 0;
+  for (const HintInfo &Info : List) {
+    if (Info.PhysReg != PhysReg)
+      Cost += Info.Freq;
+  }
+  return Cost;
+}
+
+/// \brief Using the register assigned to \p VirtReg, try to recolor
+/// all the live ranges that are copy-related with \p VirtReg.
+/// The recoloring is then propagated to all the live-ranges that have
+/// been recolored and so on, until no more copies can be coalesced or
+/// it is not profitable.
+/// For a given live range, profitability is determined by the sum of the
+/// frequencies of the non-identity copies it would introduce with the old
+/// and new register.
+void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
+  // We have a broken hint, check if it is possible to fix it by
+  // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
+  // some register and PhysReg may be available for the other live-ranges.
+  SmallSet<unsigned, 4> Visited;
+  SmallVector<unsigned, 2> RecoloringCandidates;
+  HintsInfo Info;
+  unsigned Reg = VirtReg.reg;
+  unsigned PhysReg = VRM->getPhys(Reg);
+  // Start the recoloring algorithm from the input live-interval, then
+  // it will propagate to the ones that are copy-related with it.
+  Visited.insert(Reg);
+  RecoloringCandidates.push_back(Reg);
+
+  DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '('
+               << PrintReg(PhysReg, TRI) << ")\n");
+
+  do {
+    Reg = RecoloringCandidates.pop_back_val();
+
+    // We cannot recolor physical registers.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+
+    // Get the live interval mapped with this virtual register to be able
+    // to check for the interference with the new color.
+    LiveInterval &LI = LIS->getInterval(Reg);
+    unsigned CurrPhys = VRM->getPhys(Reg);
+    // Check that the new color matches the register class constraints and
+    // that it is free for this live range.
+    if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
+                                Matrix->checkInterference(LI, PhysReg)))
+      continue;
+
+    DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI)
+                 << ") is recolorable.\n");
+
+    // Gather the hint info.
+    Info.clear();
+    collectHintInfo(Reg, Info);
+    // Check if recoloring the live-range will increase the cost of the
+    // non-identity copies.
+    if (CurrPhys != PhysReg) {
+      DEBUG(dbgs() << "Checking profitability:\n");
+      BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
+      BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+      DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+                   << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
+      if (OldCopiesCost < NewCopiesCost) {
+        DEBUG(dbgs() << "=> Not profitable.\n");
+        continue;
+      }
+      // At this point, the cost is either cheaper or equal. If it is
+      // equal, we consider it profitable because it may expose
+      // more recoloring opportunities.
+      DEBUG(dbgs() << "=> Profitable.\n");
+      // Recolor the live-range.
+      Matrix->unassign(LI);
+      Matrix->assign(LI, PhysReg);
+    }
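The profitability test above boils down to comparing two frequency sums: the copies that remain broken under the current color versus under the candidate color, with ties deliberately taken. In sketch form, standalone and with simplified types (HintSketch, brokenHintCost, and worthRecoloring are hypothetical names):

    #include <vector>

    struct HintSketch { double Freq; unsigned PhysReg; };

    // Frequency-weighted cost of the hints PhysReg fails to satisfy;
    // mirrors RAGreedy::getBrokenHintFreq above.
    double brokenHintCost(const std::vector<HintSketch> &Hints,
                          unsigned PhysReg) {
      double Cost = 0;
      for (const HintSketch &H : Hints)
        if (H.PhysReg != PhysReg)
          Cost += H.Freq;
      return Cost;
    }

    // Recolor only when the candidate color is not more expensive;
    // ties are taken because they may expose further opportunities.
    bool worthRecoloring(const std::vector<HintSketch> &Hints,
                         unsigned CurrPhys, unsigned CandidatePhys) {
      return brokenHintCost(Hints, CandidatePhys) <=
             brokenHintCost(Hints, CurrPhys);
    }

+    // Push all copy-related live-ranges to keep reconciling the broken
+    // hints.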
+    for (const HintInfo &HI : Info) {
+      if (Visited.insert(HI.Reg).second)
+        RecoloringCandidates.push_back(HI.Reg);
+    }
+  } while (!RecoloringCandidates.empty());
+}
+
+/// \brief Try to recolor broken hints.
+/// Broken hints may be repaired by recoloring when an evicted variable
+/// freed up a register for a larger live-range.
+/// Consider the following example:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   ...
+///   = b
+///   = a
+/// Let us assume b gets split:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   = a
+/// Because of how the allocation works, b, c, and d may be assigned different
+/// colors. Now, if a gets evicted later:
+/// BB1:
+///   a =
+///   st a, SpillSlot
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   e = ld SpillSlot
+///   = e
+/// It is likely that we can assign the same register to b, c, and d,
+/// getting rid of 2 copies.
+void RAGreedy::tryHintsRecoloring() {
+  for (LiveInterval *LI : SetOfBrokenHints) {
+    assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
+           "Recoloring is possible only for virtual registers");
+    // Some dead defs may be around (e.g., because of debug uses).
+    // Ignore those.
+    if (!VRM->hasPhys(LI->reg))
+      continue;
+    tryHintRecoloring(*LI);
+  }
+}
+
 unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
                                      SmallVectorImpl<unsigned> &NewVRegs,
                                      SmallVirtRegSet &FixedRegisters,
@@ -2274,8 +2467,18 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
   // queue. The RS_Split ranges already failed to do this, and they should not
   // get a second chance until they have been split.
   if (Stage != RS_Split)
-    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit))
+    if (unsigned PhysReg =
+            tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) {
+      unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
+      // If VirtReg has a hint and that hint is broken, record this
+      // virtual register as a recoloring candidate for this broken hint.
+      // Indeed, since we evicted a variable in its neighborhood it is
+      // likely we can at least partially recolor some of the
+      // copy-related live-ranges.
+      if (Hint && Hint != PhysReg)
+        SetOfBrokenHints.insert(&VirtReg);
       return PhysReg;
+    }
 
   assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
 
@@ -2319,13 +2522,13 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
                << "********** Function: " << mf.getName() << '\n');
 
   MF = &mf;
-  const TargetMachine &TM = MF->getTarget();
-  TRI = TM.getRegisterInfo();
-  TII = TM.getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   RCI.runOnMachineFunction(mf);
 
   EnableLocalReassign = EnableLocalReassignment ||
-    TM.getSubtargetImpl()->enableRALocalReassignment(TM.getOptLevel());
+                        MF->getSubtarget().enableRALocalReassignment(
+                            MF->getTarget().getOptLevel());
 
   if (VerifyEnabled)
     MF->verify(this, "Before greedy register allocator");
@@ -2355,8 +2558,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   NextCascade = 1;
   IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
   GlobalCand.resize(32); // This will grow as needed.
+ SetOfBrokenHints.clear(); allocatePhysRegs(); + tryHintsRecoloring(); releaseMemory(); return true; } diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 8a3b53fd08e5..eb7e563352ac 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -49,9 +49,10 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <limits> #include <memory> +#include <queue> #include <set> #include <sstream> #include <vector> @@ -61,17 +62,17 @@ using namespace llvm; #define DEBUG_TYPE "regalloc" static RegisterRegAlloc -registerPBQPRepAlloc("pbqp", "PBQP register allocator", +RegisterPBQPRepAlloc("pbqp", "PBQP register allocator", createDefaultPBQPRegisterAllocator); static cl::opt<bool> -pbqpCoalescing("pbqp-coalescing", +PBQPCoalescing("pbqp-coalescing", cl::desc("Attempt coalescing during PBQP register allocation."), cl::init(false), cl::Hidden); #ifndef NDEBUG static cl::opt<bool> -pbqpDumpGraphs("pbqp-dump-graphs", +PBQPDumpGraphs("pbqp-dump-graphs", cl::desc("Dump graphs for each function/round in the compilation unit."), cl::init(false), cl::Hidden); #endif @@ -88,8 +89,8 @@ public: static char ID; /// Construct a PBQP register allocator. - RegAllocPBQP(std::unique_ptr<PBQPBuilder> b, char *cPassID = nullptr) - : MachineFunctionPass(ID), builder(std::move(b)), customPassID(cPassID) { + RegAllocPBQP(char *cPassID = nullptr) + : MachineFunctionPass(ID), customPassID(cPassID) { initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); initializeLiveStacksPass(*PassRegistry::getPassRegistry()); @@ -117,301 +118,320 @@ private: typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap; typedef std::set<unsigned> RegSet; - std::unique_ptr<PBQPBuilder> builder; - char *customPassID; - MachineFunction *mf; - const TargetMachine *tm; - const TargetRegisterInfo *tri; - const TargetInstrInfo *tii; - MachineRegisterInfo *mri; - const MachineBlockFrequencyInfo *mbfi; - - std::unique_ptr<Spiller> spiller; - LiveIntervals *lis; - LiveStacks *lss; - VirtRegMap *vrm; - - RegSet vregsToAlloc, emptyIntervalVRegs; + RegSet VRegsToAlloc, EmptyIntervalVRegs; /// \brief Finds the initial set of vreg intervals to allocate. - void findVRegIntervalsToAlloc(); + void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS); + + /// \brief Constructs an initial graph. + void initializeGraph(PBQPRAGraph &G); /// \brief Given a solved PBQP problem maps this solution back to a register /// assignment. - bool mapPBQPToRegAlloc(const PBQPRAProblem &problem, - const PBQP::Solution &solution); + bool mapPBQPToRegAlloc(const PBQPRAGraph &G, + const PBQP::Solution &Solution, + VirtRegMap &VRM, + Spiller &VRegSpiller); /// \brief Postprocessing before final spilling. Sets basic block "live in" /// variables. - void finalizeAlloc() const; + void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS, + VirtRegMap &VRM) const; }; char RegAllocPBQP::ID = 0; -} // End anonymous namespace. 
-
-unsigned PBQPRAProblem::getVRegForNode(PBQPRAGraph::NodeId node) const {
-  Node2VReg::const_iterator vregItr = node2VReg.find(node);
-  assert(vregItr != node2VReg.end() && "No vreg for node.");
-  return vregItr->second;
-}
-
-PBQPRAGraph::NodeId PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
-  VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
-  assert(nodeItr != vreg2Node.end() && "No node for vreg.");
-  return nodeItr->second;
-
-}
+/// @brief Set spill costs for each node in the PBQP reg-alloc graph.
+class SpillCosts : public PBQPRAConstraint {
+public:
+  void apply(PBQPRAGraph &G) override {
+    LiveIntervals &LIS = G.getMetadata().LIS;
+
+    // A minimum spill cost, so that register constraints can be set
+    // without normalization in the [0.0, MinSpillCost) interval.
+    const PBQP::PBQPNum MinSpillCost = 10.0;
+
+    for (auto NId : G.nodeIds()) {
+      PBQP::PBQPNum SpillCost =
+        LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight;
+      if (SpillCost == 0.0)
+        SpillCost = std::numeric_limits<PBQP::PBQPNum>::min();
+      else
+        SpillCost += MinSpillCost;
+      PBQPRAGraph::RawVector NodeCosts(G.getNodeCosts(NId));
+      NodeCosts[PBQP::RegAlloc::getSpillOptionIdx()] = SpillCost;
+      G.setNodeCosts(NId, std::move(NodeCosts));
+    }
+  }
+};
 
-const PBQPRAProblem::AllowedSet&
-  PBQPRAProblem::getAllowedSet(unsigned vreg) const {
-  AllowedSetMap::const_iterator allowedSetItr = allowedSets.find(vreg);
-  assert(allowedSetItr != allowedSets.end() && "No pregs for vreg.");
-  const AllowedSet &allowedSet = allowedSetItr->second;
-  return allowedSet;
-}
+/// @brief Add interference edges between overlapping vregs.
+class Interference : public PBQPRAConstraint {
+private:
 
-unsigned PBQPRAProblem::getPRegForOption(unsigned vreg, unsigned option) const {
-  assert(isPRegOption(vreg, option) && "Not a preg option.");
-
-  const AllowedSet& allowedSet = getAllowedSet(vreg);
-  assert(option <= allowedSet.size() && "Option outside allowed set.");
-  return allowedSet[option - 1];
-}
+  typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
+  typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IMatrixKey;
+  typedef DenseMap<IMatrixKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
 
-PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
-                                  const MachineBlockFrequencyInfo *mbfi,
-                                  const RegSet &vregs) {
+  // Holds (Interval, CurrentSegmentID, and NodeId). The first two are required
+  // for the fast interference graph construction algorithm. The last is there
+  // to save us from looking up node ids via the VRegToNode map in the graph
+  // metadata.
+  typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>
+    IntervalInfo;
 
-  LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
-  MachineRegisterInfo *mri = &mf->getRegInfo();
-  const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
+  static SlotIndex getStartPoint(const IntervalInfo &I) {
+    return std::get<0>(I)->segments[std::get<1>(I)].start;
+  }
 
-  std::unique_ptr<PBQPRAProblem> p(new PBQPRAProblem());
-  PBQPRAGraph &g = p->getGraph();
-  RegSet pregs;
+  static SlotIndex getEndPoint(const IntervalInfo &I) {
+    return std::get<0>(I)->segments[std::get<1>(I)].end;
+  }
 
-  // Collect the set of preg intervals, record that they're used in the MF.
-  for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) {
-    if (mri->def_empty(Reg))
-      continue;
-    pregs.insert(Reg);
-    mri->setPhysRegUsed(Reg);
+  static PBQP::GraphBase::NodeId getNodeId(const IntervalInfo &I) {
+    return std::get<2>(I);
   }
 
-  // Iterate over vregs.
-  for (RegSet::const_iterator vregItr = vregs.begin(), vregEnd = vregs.end();
-       vregItr != vregEnd; ++vregItr) {
-    unsigned vreg = *vregItr;
-    const TargetRegisterClass *trc = mri->getRegClass(vreg);
-    LiveInterval *vregLI = &LIS->getInterval(vreg);
+  static bool lowestStartPoint(const IntervalInfo &I1,
+                               const IntervalInfo &I2) {
+    // Condition reversed because priority queue has the *highest* element at
+    // the front, rather than the lowest.
+    return getStartPoint(I1) > getStartPoint(I2);
+  }
 
-    // Record any overlaps with regmask operands.
-    BitVector regMaskOverlaps;
-    LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps);
+  static bool lowestEndPoint(const IntervalInfo &I1,
+                             const IntervalInfo &I2) {
+    SlotIndex E1 = getEndPoint(I1);
+    SlotIndex E2 = getEndPoint(I2);
 
-    // Compute an initial allowed set for the current vreg.
-    typedef std::vector<unsigned> VRAllowed;
-    VRAllowed vrAllowed;
-    ArrayRef<MCPhysReg> rawOrder = trc->getRawAllocationOrder(*mf);
-    for (unsigned i = 0; i != rawOrder.size(); ++i) {
-      unsigned preg = rawOrder[i];
-      if (mri->isReserved(preg))
-        continue;
+    if (E1 < E2)
+      return true;
 
-      // vregLI crosses a regmask operand that clobbers preg.
-      if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg))
-        continue;
+    if (E1 > E2)
+      return false;
 
-      // vregLI overlaps fixed regunit interference.
-      bool Interference = false;
-      for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) {
-        if (vregLI->overlaps(LIS->getRegUnit(*Units))) {
-          Interference = true;
-          break;
-        }
-      }
-      if (Interference)
-        continue;
-
-      // preg is usable for this virtual register.
-      vrAllowed.push_back(preg);
-    }
-
-    PBQP::Vector nodeCosts(vrAllowed.size() + 1, 0);
+    // If two intervals end at the same point, we need a way to break the tie or
+    // the set will assume they're actually equal and refuse to insert a
+    // "duplicate". Just compare the vregs - fast and guaranteed unique.
+    return std::get<0>(I1)->reg < std::get<0>(I2)->reg;
+  }
 
-    PBQP::PBQPNum spillCost = (vregLI->weight != 0.0) ?
-        vregLI->weight : std::numeric_limits<PBQP::PBQPNum>::min();
+  static bool isAtLastSegment(const IntervalInfo &I) {
+    return std::get<1>(I) == std::get<0>(I)->size() - 1;
+  }
 
-    addSpillCosts(nodeCosts, spillCost);
+  static IntervalInfo nextSegment(const IntervalInfo &I) {
+    return std::make_tuple(std::get<0>(I), std::get<1>(I) + 1, std::get<2>(I));
+  }
 
-    // Construct the node.
-    PBQPRAGraph::NodeId nId = g.addNode(std::move(nodeCosts));
+public:
 
-    // Record the mapping and allowed set in the problem.
-    p->recordVReg(vreg, nId, vrAllowed.begin(), vrAllowed.end());
+  void apply(PBQPRAGraph &G) override {
+    // The following is loosely based on the linear scan algorithm introduced in
+    // "Linear Scan Register Allocation" by Poletto and Sarkar. This version
+    // isn't linear, because the size of the active set isn't bound by the
+    // number of registers, but rather the size of the largest clique in the
+    // graph. Still, we expect this to be better than N^2.
+    LiveIntervals &LIS = G.getMetadata().LIS;
+
+    // Interference matrices are incredibly regular - they're only a function of
+    // the allowed sets, so we cache them to avoid the overhead of constructing
+    // and uniquing them.
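
A sketch of that caching idea with simplified stand-in types: because an edge's costs are a function of the two allowed-register vectors alone, keying a map on the (uniqued) vector addresses lets every edge with the same endpoint sets share one matrix:

    #include <map>
    #include <memory>
    #include <utility>
    #include <vector>

    using Matrix = std::vector<std::vector<float>>;
    using AllowedRegVector = std::vector<unsigned>;
    using MatrixKey =
        std::pair<const AllowedRegVector *, const AllowedRegVector *>;

    // Cache keyed on the addresses of the (assumed uniqued) allowed vectors.
    static std::map<MatrixKey, std::shared_ptr<Matrix>> MatrixCache;

    static std::shared_ptr<Matrix>
    getInterferenceMatrix(const AllowedRegVector &A, const AllowedRegVector &B) {
      MatrixKey K(&A, &B);
      auto It = MatrixCache.find(K);
      if (It != MatrixCache.end())
        return It->second; // reuse the previously built matrix
      auto M = std::make_shared<Matrix>(
          A.size() + 1, std::vector<float>(B.size() + 1, 0));
      // ... fill in infinities for overlapping register pairs here ...
      return MatrixCache[K] = M;
    }

    int main() {
      AllowedRegVector A{16, 17}, B{17, 18};
      auto M1 = getInterferenceMatrix(A, B);
      auto M2 = getInterferenceMatrix(A, B); // cache hit: same matrix object
      return M1 == M2 ? 0 : 1;
    }
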
+    IMatrixCache C;
+
+    typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet;
+    typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+                                decltype(&lowestStartPoint)> IntervalQueue;
+    IntervalSet Active(lowestEndPoint);
+    IntervalQueue Inactive(lowestStartPoint);
+
+    // Start by building the inactive set.
+    for (auto NId : G.nodeIds()) {
+      unsigned VReg = G.getNodeMetadata(NId).getVReg();
+      LiveInterval &LI = LIS.getInterval(VReg);
+      assert(!LI.empty() && "PBQP graph contains node for empty interval");
+      Inactive.push(std::make_tuple(&LI, 0, NId));
+    }
 
-  }
+    while (!Inactive.empty()) {
+      // Tentatively grab the "next" interval - this choice may be overridden
+      // below.
+      IntervalInfo Cur = Inactive.top();
+
+      // Retire any active intervals that end before Cur starts.
+      IntervalSet::iterator RetireItr = Active.begin();
+      while (RetireItr != Active.end() &&
+             (getEndPoint(*RetireItr) <= getStartPoint(Cur))) {
+        // If this interval has subsequent segments, add the next one to the
+        // inactive list.
+        if (!isAtLastSegment(*RetireItr))
+          Inactive.push(nextSegment(*RetireItr));
+
+        ++RetireItr;
+      }
+      Active.erase(Active.begin(), RetireItr);
+
+      // One of the newly retired segments may actually start before the
+      // Cur segment, so re-grab the front of the inactive list.
+      Cur = Inactive.top();
+      Inactive.pop();
+
+      // At this point we know that Cur overlaps all active intervals. Add the
+      // interference edges.
+      PBQP::GraphBase::NodeId NId = getNodeId(Cur);
+      for (const auto &A : Active) {
+        PBQP::GraphBase::NodeId MId = getNodeId(A);
+
+        // Check that we haven't already added this edge
+        // FIXME: findEdge is expensive in the worst case (O(max_clique(G))).
+        // It might be better to replace this with a local bit-matrix.
+        if (G.findEdge(NId, MId) != PBQPRAGraph::invalidEdgeId())
+          continue;
 
-  for (RegSet::const_iterator vr1Itr = vregs.begin(), vrEnd = vregs.end();
-       vr1Itr != vrEnd; ++vr1Itr) {
-    unsigned vr1 = *vr1Itr;
-    const LiveInterval &l1 = lis->getInterval(vr1);
-    const PBQPRAProblem::AllowedSet &vr1Allowed = p->getAllowedSet(vr1);
-
-    for (RegSet::const_iterator vr2Itr = std::next(vr1Itr); vr2Itr != vrEnd;
-         ++vr2Itr) {
-      unsigned vr2 = *vr2Itr;
-      const LiveInterval &l2 = lis->getInterval(vr2);
-      const PBQPRAProblem::AllowedSet &vr2Allowed = p->getAllowedSet(vr2);
-
-      assert(!l2.empty() && "Empty interval in vreg set?");
-      if (l1.overlaps(l2)) {
-        PBQP::Matrix edgeCosts(vr1Allowed.size()+1, vr2Allowed.size()+1, 0);
-        addInterferenceCosts(edgeCosts, vr1Allowed, vr2Allowed, tri);
-
-        g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
-                  std::move(edgeCosts));
+        // This is a new edge - add it to the graph.
+        createInterferenceEdge(G, NId, MId, C);
       }
+
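The loop above is a segment sweep: take the next segment from a min-start priority queue, retire every active segment that ends before it starts, then add interference edges to the survivors. A compact standalone sketch with a hypothetical Interval type (the multi-segment re-push and the tie-breaking comparator are omitted for brevity):

    #include <algorithm>
    #include <cstdio>
    #include <queue>
    #include <vector>

    struct Interval { int Start, End, Id; }; // half-open [Start, End)

    int main() {
      std::vector<Interval> Ivs = {{0, 4, 0}, {1, 8, 1}, {5, 9, 2}, {6, 7, 3}};
      auto ByStart = [](const Interval &A, const Interval &B) {
        return A.Start > B.Start; // reversed: min-heap on start point
      };
      std::priority_queue<Interval, std::vector<Interval>, decltype(ByStart)>
          Inactive(ByStart, Ivs);
      std::vector<Interval> Active;

      while (!Inactive.empty()) {
        Interval Cur = Inactive.top();
        Inactive.pop();
        // Retire active intervals that end before Cur starts.
        Active.erase(std::remove_if(Active.begin(), Active.end(),
                                    [&](const Interval &A) {
                                      return A.End <= Cur.Start;
                                    }),
                     Active.end());
        // Cur overlaps everything still active: emit interference edges.
        for (const Interval &A : Active)
          std::printf("edge %d -- %d\n", A.Id, Cur.Id);
        Active.push_back(Cur);
      }
    }
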
+      // Finally, add Cur to the Active set.
+      Active.insert(Cur);
     }
   }
 
-  return p.release();
-}
+private:
 
-void PBQPBuilder::addSpillCosts(PBQP::Vector &costVec,
-                                PBQP::PBQPNum spillCost) {
-  costVec[0] = spillCost;
-}
+  void createInterferenceEdge(PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+                              PBQPRAGraph::NodeId MId, IMatrixCache &C) {
 
-void PBQPBuilder::addInterferenceCosts(
-    PBQP::Matrix &costMat,
-    const PBQPRAProblem::AllowedSet &vr1Allowed,
-    const PBQPRAProblem::AllowedSet &vr2Allowed,
-    const TargetRegisterInfo *tri) {
-  assert(costMat.getRows() == vr1Allowed.size() + 1 && "Matrix height mismatch.");
-  assert(costMat.getCols() == vr2Allowed.size() + 1 && "Matrix width mismatch.");
+    const TargetRegisterInfo &TRI =
+      *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
 
-  for (unsigned i = 0; i != vr1Allowed.size(); ++i) {
-    unsigned preg1 = vr1Allowed[i];
+    const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
+    const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
 
-    for (unsigned j = 0; j != vr2Allowed.size(); ++j) {
-      unsigned preg2 = vr2Allowed[j];
+    // Try looking the edge costs up in the IMatrixCache first.
+    IMatrixKey K(&NRegs, &MRegs);
+    IMatrixCache::iterator I = C.find(K);
+    if (I != C.end()) {
+      G.addEdgeBypassingCostAllocator(NId, MId, I->second);
+      return;
+    }
 
-      if (tri->regsOverlap(preg1, preg2)) {
-        costMat[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+    PBQPRAGraph::RawMatrix M(NRegs.size() + 1, MRegs.size() + 1, 0);
+    for (unsigned I = 0; I != NRegs.size(); ++I) {
+      unsigned PRegN = NRegs[I];
+      for (unsigned J = 0; J != MRegs.size(); ++J) {
+        unsigned PRegM = MRegs[J];
+        if (TRI.regsOverlap(PRegN, PRegM))
+          M[I + 1][J + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
       }
     }
+
+    PBQPRAGraph::EdgeId EId = G.addEdge(NId, MId, std::move(M));
+    C[K] = G.getEdgeCostsPtr(EId);
   }
-}
+};
 
-PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
-                                            const LiveIntervals *lis,
-                                            const MachineBlockFrequencyInfo *mbfi,
-                                            const RegSet &vregs) {
-  std::unique_ptr<PBQPRAProblem> p(PBQPBuilder::build(mf, lis, mbfi, vregs));
-  PBQPRAGraph &g = p->getGraph();
+class Coalescing : public PBQPRAConstraint {
+public:
+  void apply(PBQPRAGraph &G) override {
+    MachineFunction &MF = G.getMetadata().MF;
+    MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
+    CoalescerPair CP(*MF.getTarget().getSubtargetImpl()->getRegisterInfo());
+
+    // Scan the machine function and add a coalescing cost whenever CoalescerPair
+    // gives the OK.
+    for (const auto &MBB : MF) {
+      for (const auto &MI : MBB) {
+
+        // Skip non-coalescable or already coalesced copies.
+        if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
+          continue;
 
-  const TargetMachine &tm = mf->getTarget();
-  CoalescerPair cp(*tm.getRegisterInfo());
+        unsigned DstReg = CP.getDstReg();
+        unsigned SrcReg = CP.getSrcReg();
 
-  // Scan the machine function and add a coalescing cost whenever CoalescerPair
-  // gives the Ok.
-  for (const auto &mbb : *mf) {
-    for (const auto &mi : mbb) {
-      if (!cp.setRegisters(&mi)) {
-        continue; // Not coalescable.
-      }
+        const float Scale = 1.0f / MBFI.getEntryFreq();
+        PBQP::PBQPNum CBenefit = MBFI.getBlockFreq(&MBB).getFrequency() * Scale;
 
-      if (cp.getSrcReg() == cp.getDstReg()) {
-        continue; // Already coalesced.
-      }
+        if (CP.isPhys()) {
+          if (!MF.getRegInfo().isAllocatable(DstReg))
+            continue;
 
-      unsigned dst = cp.getDstReg(),
-               src = cp.getSrcReg();
+          PBQPRAGraph::NodeId NId = G.getMetadata().getNodeIdForVReg(SrcReg);
 
-      const float copyFactor = 0.5; // Cost of copy relative to load.
Current - // value plucked randomly out of the air. + const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed = + G.getNodeMetadata(NId).getAllowedRegs(); - PBQP::PBQPNum cBenefit = - copyFactor * LiveIntervals::getSpillWeight(false, true, mbfi, &mi); + unsigned PRegOpt = 0; + while (PRegOpt < Allowed.size() && Allowed[PRegOpt] != DstReg) + ++PRegOpt; - if (cp.isPhys()) { - if (!mf->getRegInfo().isAllocatable(dst)) { - continue; - } - - const PBQPRAProblem::AllowedSet &allowed = p->getAllowedSet(src); - unsigned pregOpt = 0; - while (pregOpt < allowed.size() && allowed[pregOpt] != dst) { - ++pregOpt; - } - if (pregOpt < allowed.size()) { - ++pregOpt; // +1 to account for spill option. - PBQPRAGraph::NodeId node = p->getNodeForVReg(src); - llvm::dbgs() << "Reading node costs for node " << node << "\n"; - llvm::dbgs() << "Source node: " << &g.getNodeCosts(node) << "\n"; - PBQP::Vector newCosts(g.getNodeCosts(node)); - addPhysRegCoalesce(newCosts, pregOpt, cBenefit); - g.setNodeCosts(node, newCosts); - } - } else { - const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst); - const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src); - PBQPRAGraph::NodeId node1 = p->getNodeForVReg(dst); - PBQPRAGraph::NodeId node2 = p->getNodeForVReg(src); - PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2); - if (edge == g.invalidEdgeId()) { - PBQP::Matrix costs(allowed1->size() + 1, allowed2->size() + 1, 0); - addVirtRegCoalesce(costs, *allowed1, *allowed2, cBenefit); - g.addEdge(node1, node2, costs); + if (PRegOpt < Allowed.size()) { + PBQPRAGraph::RawVector NewCosts(G.getNodeCosts(NId)); + NewCosts[PRegOpt + 1] -= CBenefit; + G.setNodeCosts(NId, std::move(NewCosts)); + } } else { - if (g.getEdgeNode1Id(edge) == node2) { - std::swap(node1, node2); - std::swap(allowed1, allowed2); + PBQPRAGraph::NodeId N1Id = G.getMetadata().getNodeIdForVReg(DstReg); + PBQPRAGraph::NodeId N2Id = G.getMetadata().getNodeIdForVReg(SrcReg); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed1 = + &G.getNodeMetadata(N1Id).getAllowedRegs(); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed2 = + &G.getNodeMetadata(N2Id).getAllowedRegs(); + + PBQPRAGraph::EdgeId EId = G.findEdge(N1Id, N2Id); + if (EId == G.invalidEdgeId()) { + PBQPRAGraph::RawMatrix Costs(Allowed1->size() + 1, + Allowed2->size() + 1, 0); + addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit); + G.addEdge(N1Id, N2Id, std::move(Costs)); + } else { + if (G.getEdgeNode1Id(EId) == N2Id) { + std::swap(N1Id, N2Id); + std::swap(Allowed1, Allowed2); + } + PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId)); + addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit); + G.setEdgeCosts(EId, std::move(Costs)); } - PBQP::Matrix costs(g.getEdgeCosts(edge)); - addVirtRegCoalesce(costs, *allowed1, *allowed2, cBenefit); - g.setEdgeCosts(edge, costs); } } } } - return p.release(); -} - -void PBQPBuilderWithCoalescing::addPhysRegCoalesce(PBQP::Vector &costVec, - unsigned pregOption, - PBQP::PBQPNum benefit) { - costVec[pregOption] += -benefit; -} - -void PBQPBuilderWithCoalescing::addVirtRegCoalesce( - PBQP::Matrix &costMat, - const PBQPRAProblem::AllowedSet &vr1Allowed, - const PBQPRAProblem::AllowedSet &vr2Allowed, - PBQP::PBQPNum benefit) { - - assert(costMat.getRows() == vr1Allowed.size() + 1 && "Size mismatch."); - assert(costMat.getCols() == vr2Allowed.size() + 1 && "Size mismatch."); - - for (unsigned i = 0; i != vr1Allowed.size(); ++i) { - unsigned preg1 = vr1Allowed[i]; - for (unsigned j = 0; j != vr2Allowed.size(); ++j) { - 
unsigned preg2 = vr2Allowed[j]; +private: - if (preg1 == preg2) { - costMat[i + 1][j + 1] += -benefit; + void addVirtRegCoalesce( + PBQPRAGraph::RawMatrix &CostMat, + const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1, + const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed2, + PBQP::PBQPNum Benefit) { + assert(CostMat.getRows() == Allowed1.size() + 1 && "Size mismatch."); + assert(CostMat.getCols() == Allowed2.size() + 1 && "Size mismatch."); + for (unsigned I = 0; I != Allowed1.size(); ++I) { + unsigned PReg1 = Allowed1[I]; + for (unsigned J = 0; J != Allowed2.size(); ++J) { + unsigned PReg2 = Allowed2[J]; + if (PReg1 == PReg2) + CostMat[I + 1][J + 1] -= Benefit; } } } -} +}; + +} // End anonymous namespace. + +// Out-of-line destructor/anchor for PBQPRAConstraint. +PBQPRAConstraint::~PBQPRAConstraint() {} +void PBQPRAConstraint::anchor() {} +void PBQPRAConstraintList::anchor() {} void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.setPreservesCFG(); @@ -437,118 +457,197 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { MachineFunctionPass::getAnalysisUsage(au); } -void RegAllocPBQP::findVRegIntervalsToAlloc() { +void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF, + LiveIntervals &LIS) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); // Iterate over all live ranges. - for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - if (mri->reg_nodbg_empty(Reg)) + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + if (MRI.reg_nodbg_empty(Reg)) continue; - LiveInterval *li = &lis->getInterval(Reg); + LiveInterval &LI = LIS.getInterval(Reg); // If this live interval is non-empty we will use pbqp to allocate it. // Empty intervals we allocate in a simple post-processing stage in // finalizeAlloc. - if (!li->empty()) { - vregsToAlloc.insert(li->reg); + if (!LI.empty()) { + VRegsToAlloc.insert(LI.reg); } else { - emptyIntervalVRegs.insert(li->reg); + EmptyIntervalVRegs.insert(LI.reg); + } + } +} + +static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI, + const MachineFunction &MF) { + const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSR[i] != 0; ++i) + if (TRI.regsOverlap(reg, CSR[i])) + return true; + return false; +} + +void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) { + MachineFunction &MF = G.getMetadata().MF; + + LiveIntervals &LIS = G.getMetadata().LIS; + const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); + const TargetRegisterInfo &TRI = + *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo(); + + for (auto VReg : VRegsToAlloc) { + const TargetRegisterClass *TRC = MRI.getRegClass(VReg); + LiveInterval &VRegLI = LIS.getInterval(VReg); + + // Record any overlaps with regmask operands. + BitVector RegMaskOverlaps; + LIS.checkRegMaskInterference(VRegLI, RegMaskOverlaps); + + // Compute an initial allowed set for the current vreg. + std::vector<unsigned> VRegAllowed; + ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF); + for (unsigned I = 0; I != RawPRegOrder.size(); ++I) { + unsigned PReg = RawPRegOrder[I]; + if (MRI.isReserved(PReg)) + continue; + + // vregLI crosses a regmask operand that clobbers preg. + if (!RegMaskOverlaps.empty() && !RegMaskOverlaps.test(PReg)) + continue; + + // vregLI overlaps fixed regunit interference. 
+      bool Interference = false;
+      for (MCRegUnitIterator Units(PReg, &TRI); Units.isValid(); ++Units) {
+        if (VRegLI.overlaps(LIS.getRegUnit(*Units))) {
+          Interference = true;
+          break;
+        }
+      }
+      if (Interference)
+        continue;
+
+      // preg is usable for this virtual register.
+      VRegAllowed.push_back(PReg);
     }
+
+    PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
+
+    // Tweak cost of callee saved registers, as using them forces spilling and
+    // restoring them. This would only happen in the prologue / epilogue though.
+    for (unsigned i = 0; i != VRegAllowed.size(); ++i)
+      if (isACalleeSavedRegister(VRegAllowed[i], TRI, MF))
+        NodeCosts[1 + i] += 1.0;
+
+    PBQPRAGraph::NodeId NId = G.addNode(std::move(NodeCosts));
+    G.getNodeMetadata(NId).setVReg(VReg);
+    G.getNodeMetadata(NId).setAllowedRegs(
+      G.getMetadata().getAllowedRegs(std::move(VRegAllowed)));
+    G.getMetadata().setNodeIdForVReg(VReg, NId);
   }
 }
 
-bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
-                                     const PBQP::Solution &solution) {
+bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
+                                     const PBQP::Solution &Solution,
+                                     VirtRegMap &VRM,
+                                     Spiller &VRegSpiller) {
+  MachineFunction &MF = G.getMetadata().MF;
+  LiveIntervals &LIS = G.getMetadata().LIS;
+  const TargetRegisterInfo &TRI =
+    *MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  (void)TRI;
+
   // Set to true if we have any spills
-  bool anotherRoundNeeded = false;
+  bool AnotherRoundNeeded = false;
 
   // Clear the existing allocation.
-  vrm->clearAllVirt();
+  VRM.clearAllVirt();
 
-  const PBQPRAGraph &g = problem.getGraph();
   // Iterate over the nodes mapping the PBQP solution to a register
   // assignment.
-  for (auto NId : g.nodeIds()) {
-    unsigned vreg = problem.getVRegForNode(NId);
-    unsigned alloc = solution.getSelection(NId);
-
-    if (problem.isPRegOption(vreg, alloc)) {
-      unsigned preg = problem.getPRegForOption(vreg, alloc);
-      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> "
-            << tri->getName(preg) << "\n");
-      assert(preg != 0 && "Invalid preg selected.");
-      vrm->assignVirt2Phys(vreg, preg);
-    } else if (problem.isSpillOption(vreg, alloc)) {
-      vregsToAlloc.erase(vreg);
-      SmallVector<unsigned, 8> newSpills;
-      LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
-      spiller->spill(LRE);
-
-      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: "
+  for (auto NId : G.nodeIds()) {
+    unsigned VReg = G.getNodeMetadata(NId).getVReg();
+    unsigned AllocOption = Solution.getSelection(NId);
+
+    if (AllocOption != PBQP::RegAlloc::getSpillOptionIdx()) {
+      unsigned PReg = G.getNodeMetadata(NId).getAllowedRegs()[AllocOption - 1];
+      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> "
+            << TRI.getName(PReg) << "\n");
+      assert(PReg != 0 && "Invalid preg selected.");
+      VRM.assignVirt2Phys(VReg, PReg);
+    } else {
+      VRegsToAlloc.erase(VReg);
+      SmallVector<unsigned, 8> NewSpills;
+      LiveRangeEdit LRE(&LIS.getInterval(VReg), NewSpills, MF, LIS, &VRM);
+      VRegSpiller.spill(LRE);
+
+      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
                    << LRE.getParent().weight << ", New vregs: ");
 
       // Copy any newly inserted live intervals into the list of regs to
       // allocate.
- for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end(); - itr != end; ++itr) { - LiveInterval &li = lis->getInterval(*itr); - assert(!li.empty() && "Empty spill range."); - DEBUG(dbgs() << PrintReg(li.reg, tri) << " "); - vregsToAlloc.insert(li.reg); + for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end(); + I != E; ++I) { + LiveInterval &LI = LIS.getInterval(*I); + assert(!LI.empty() && "Empty spill range."); + DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " "); + VRegsToAlloc.insert(LI.reg); } DEBUG(dbgs() << ")\n"); // We need another round if spill intervals were added. - anotherRoundNeeded |= !LRE.empty(); - } else { - llvm_unreachable("Unknown allocation option."); + AnotherRoundNeeded |= !LRE.empty(); } } - return !anotherRoundNeeded; + return !AnotherRoundNeeded; } +void RegAllocPBQP::finalizeAlloc(MachineFunction &MF, + LiveIntervals &LIS, + VirtRegMap &VRM) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); -void RegAllocPBQP::finalizeAlloc() const { // First allocate registers for the empty intervals. for (RegSet::const_iterator - itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end(); - itr != end; ++itr) { - LiveInterval *li = &lis->getInterval(*itr); + I = EmptyIntervalVRegs.begin(), E = EmptyIntervalVRegs.end(); + I != E; ++I) { + LiveInterval &LI = LIS.getInterval(*I); - unsigned physReg = mri->getSimpleHint(li->reg); + unsigned PReg = MRI.getSimpleHint(LI.reg); - if (physReg == 0) { - const TargetRegisterClass *liRC = mri->getRegClass(li->reg); - physReg = liRC->getRawAllocationOrder(*mf).front(); + if (PReg == 0) { + const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg); + PReg = RC.getRawAllocationOrder(MF).front(); } - vrm->assignVirt2Phys(li->reg, physReg); + VRM.assignVirt2Phys(LI.reg, PReg); } } -bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { +static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size, + unsigned NumInstr) { + // All intervals have a spill weight that is mostly proportional to the number + // of uses, with uses in loops having a bigger weight. + return NumInstr * normalizeSpillWeight(UseDefFreq, Size, 1); +} - mf = &MF; - tm = &mf->getTarget(); - tri = tm->getRegisterInfo(); - tii = tm->getInstrInfo(); - mri = &mf->getRegInfo(); +bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + MachineBlockFrequencyInfo &MBFI = + getAnalysis<MachineBlockFrequencyInfo>(); - lis = &getAnalysis<LiveIntervals>(); - lss = &getAnalysis<LiveStacks>(); - mbfi = &getAnalysis<MachineBlockFrequencyInfo>(); + calculateSpillWeightsAndHints(LIS, MF, getAnalysis<MachineLoopInfo>(), MBFI, + normalizePBQPSpillWeight); - calculateSpillWeightsAndHints(*lis, MF, getAnalysis<MachineLoopInfo>(), - *mbfi); + VirtRegMap &VRM = getAnalysis<VirtRegMap>(); - vrm = &getAnalysis<VirtRegMap>(); - spiller.reset(createInlineSpiller(*this, MF, *vrm)); + std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM)); - mri->freezeReservedRegs(MF); + MF.getRegInfo().freezeReservedRegs(MF); - DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getName() << "\n"); + DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n"); // Allocator main loop: // @@ -560,72 +659,72 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // This process is continued till no more spills are generated. // Find the vreg intervals in need of allocation. 
- findVRegIntervalsToAlloc(); + findVRegIntervalsToAlloc(MF, LIS); #ifndef NDEBUG - const Function* func = mf->getFunction(); - std::string fqn = - func->getParent()->getModuleIdentifier() + "." + - func->getName().str(); + const Function &F = *MF.getFunction(); + std::string FullyQualifiedName = + F.getParent()->getModuleIdentifier() + "." + F.getName().str(); #endif // If there are non-empty intervals allocate them using pbqp. - if (!vregsToAlloc.empty()) { + if (!VRegsToAlloc.empty()) { - bool pbqpAllocComplete = false; - unsigned round = 0; + const TargetSubtargetInfo &Subtarget = *MF.getTarget().getSubtargetImpl(); + std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot = + llvm::make_unique<PBQPRAConstraintList>(); + ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>()); + ConstraintsRoot->addConstraint(llvm::make_unique<Interference>()); + if (PBQPCoalescing) + ConstraintsRoot->addConstraint(llvm::make_unique<Coalescing>()); + ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints()); - while (!pbqpAllocComplete) { - DEBUG(dbgs() << " PBQP Regalloc round " << round << ":\n"); + bool PBQPAllocComplete = false; + unsigned Round = 0; - std::unique_ptr<PBQPRAProblem> problem( - builder->build(mf, lis, mbfi, vregsToAlloc)); + while (!PBQPAllocComplete) { + DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n"); + + PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI)); + initializeGraph(G); + ConstraintsRoot->apply(G); #ifndef NDEBUG - if (pbqpDumpGraphs) { - std::ostringstream rs; - rs << round; - std::string graphFileName(fqn + "." + rs.str() + ".pbqpgraph"); - std::string tmp; - raw_fd_ostream os(graphFileName.c_str(), tmp, sys::fs::F_Text); - DEBUG(dbgs() << "Dumping graph for round " << round << " to \"" - << graphFileName << "\"\n"); - problem->getGraph().dump(os); + if (PBQPDumpGraphs) { + std::ostringstream RS; + RS << Round; + std::string GraphFileName = FullyQualifiedName + "." + RS.str() + + ".pbqpgraph"; + std::error_code EC; + raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text); + DEBUG(dbgs() << "Dumping graph for round " << Round << " to \"" + << GraphFileName << "\"\n"); + G.dumpToStream(OS); } #endif - PBQP::Solution solution = - PBQP::RegAlloc::solve(problem->getGraph()); - - pbqpAllocComplete = mapPBQPToRegAlloc(*problem, solution); - - ++round; + PBQP::Solution Solution = PBQP::RegAlloc::solve(G); + PBQPAllocComplete = mapPBQPToRegAlloc(G, Solution, VRM, *VRegSpiller); + ++Round; } } // Finalise allocation, allocate empty ranges. 
- finalizeAlloc(); - vregsToAlloc.clear(); - emptyIntervalVRegs.clear(); + finalizeAlloc(MF, LIS, VRM); + VRegsToAlloc.clear(); + EmptyIntervalVRegs.clear(); - DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n"); + DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n"); return true; } -FunctionPass * -llvm::createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> builder, - char *customPassID) { - return new RegAllocPBQP(std::move(builder), customPassID); +FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) { + return new RegAllocPBQP(customPassID); } FunctionPass* llvm::createDefaultPBQPRegisterAllocator() { - std::unique_ptr<PBQPBuilder> Builder; - if (pbqpCoalescing) - Builder = llvm::make_unique<PBQPBuilderWithCoalescing>(); - else - Builder = llvm::make_unique<PBQPBuilder>(); - return createPBQPRegisterAllocator(std::move(Builder)); + return createPBQPRegisterAllocator(); } #undef DEBUG_TYPE diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 8b5445c31624..ab33672d41f3 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -20,7 +20,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -38,8 +37,8 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { MF = &mf; // Allocate new array the first time we see a new target. - if (MF->getTarget().getRegisterInfo() != TRI) { - TRI = MF->getTarget().getRegisterInfo(); + if (MF->getSubtarget().getRegisterInfo() != TRI) { + TRI = MF->getSubtarget().getRegisterInfo(); RegClass.reset(new RCInfo[TRI->getNumRegClasses()]); unsigned NumPSets = TRI->getNumRegPressureSets(); PSetLimits.reset(new unsigned[NumPSets]); @@ -48,6 +47,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { } // Does this MF have different CSRs? + assert(TRI && "no register info set"); const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); if (Update || CSR != CalleeSaved) { // Build a CSRNum map. Every CSR alias gets an entry pointing to the last @@ -77,6 +77,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { /// registers filtered out. Volatile registers come first followed by CSR /// aliases ordered according to the CSR order specified by the target. void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { + assert(RC && "no register class given"); RCInfo &RCI = RegClass[RC->getID()]; // Raw register count, including all reserved regs. @@ -138,7 +139,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { RCI.LastCostChange = LastCostChange; DEBUG({ - dbgs() << "AllocationOrder(" << RC->getName() << ") = ["; + dbgs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = ["; for (unsigned I = 0; I != RCI.NumRegs; ++I) dbgs() << ' ' << PrintReg(RCI.Order[I], TRI); dbgs() << (RCI.ProperSubClass ? 
" ] (sub-class)\n" : " ]\n"); diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index e04a3cf077ff..b8cae4a59ed5 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -86,6 +87,14 @@ namespace { AliasAnalysis *AA; RegisterClassInfo RegClassInfo; + /// A LaneMask to remember on which subregister live ranges we need to call + /// shrinkToUses() later. + unsigned ShrinkMask; + + /// True if the main range of the currently coalesced intervals should be + /// checked for smaller live intervals. + bool ShrinkMainRange; + /// \brief True if the coalescer should aggressively coalesce global copies /// in favor of keeping local copies. bool JoinGlobalCopies; @@ -94,11 +103,11 @@ namespace { /// blocks exclusively containing copies. bool JoinSplitEdges; - /// WorkList - Copy instructions yet to be coalesced. + /// Copy instructions yet to be coalesced. SmallVector<MachineInstr*, 8> WorkList; SmallVector<MachineInstr*, 8> LocalWorkList; - /// ErasedInstrs - Set of instruction pointers that have been erased, and + /// Set of instruction pointers that have been erased, and /// that may be present in WorkList. SmallPtrSet<MachineInstr*, 8> ErasedInstrs; @@ -114,21 +123,21 @@ namespace { /// LiveRangeEdit callback. void LRE_WillEraseInstruction(MachineInstr *MI) override; - /// coalesceLocals - coalesce the LocalWorkList. + /// Coalesce the LocalWorkList. void coalesceLocals(); - /// joinAllIntervals - join compatible live intervals + /// Join compatible live intervals void joinAllIntervals(); - /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting + /// Coalesce copies in the specified MBB, putting /// copies that cannot yet be coalesced into WorkList. void copyCoalesceInMBB(MachineBasicBlock *MBB); - /// copyCoalesceWorkList - Try to coalesce all copies in CurrList. Return + /// Try to coalesce all copies in CurrList. Return /// true if any progress was made. bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList); - /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, + /// Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns /// true if the copy was successfully coalesced away. If it is not /// currently possible to coalesce this interval, but it may be possible if @@ -136,7 +145,7 @@ namespace { /// 'Again'. bool joinCopy(MachineInstr *TheCopy, bool &Again); - /// joinIntervals - Attempt to join these two intervals. On failure, this + /// Attempt to join these two intervals. On failure, this /// returns false. The output "SrcInt" will not have been modified, so we /// can use this information below to update aliases. bool joinIntervals(CoalescerPair &CP); @@ -147,40 +156,53 @@ namespace { /// Attempt joining with a reserved physreg. bool joinReservedPhysReg(CoalescerPair &CP); - /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If + /// Add the LiveRange @p ToMerge as a subregister liverange of @p LI. + /// Subranges in @p LI which only partially interfere with the desired + /// LaneMask are split as necessary. @p LaneMask are the lanes that + /// @p ToMerge will occupy in the coalescer register. 
+    /// lanemasks already adjusted to the coalesced register.
+    void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
+                           unsigned LaneMask, CoalescerPair &CP);
+
+    /// Join the liveranges of two subregisters. Joins @p RRange into
+    /// @p LRange; @p RRange may be invalid afterwards.
+    void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+                          unsigned LaneMask, const CoalescerPair &CP);
+
+    /// We found a non-trivially-coalescable copy. If
     /// the source value number is defined by a copy from the destination reg
     /// see if we can merge these two destination reg valno# into a single
     /// value number, eliminating a copy.
     bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
 
-    /// hasOtherReachingDefs - Return true if there are definitions of IntB
+    /// Return true if there are definitions of IntB
     /// other than BValNo val# that can reach uses of AValno val# of IntA.
     bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
                               VNInfo *AValNo, VNInfo *BValNo);
 
-    /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy.
+    /// We found a non-trivially-coalescable copy.
     /// If the source value number is defined by a commutable instruction and
     /// its other operand is coalesced to the copy dest register, see if we
     /// can transform the copy into a noop by commuting the definition.
     bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
-    /// reMaterializeTrivialDef - If the source of a copy is defined by a
+    /// If the source of a copy is defined by a
     /// trivial computation, replace the copy by rematerializing the definition.
     bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI,
                                  bool &IsDefCopy);
 
-    /// canJoinPhys - Return true if a physreg copy should be joined.
+    /// Return true if a physreg copy should be joined.
     bool canJoinPhys(const CoalescerPair &CP);
 
-    /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+    /// Replace all defs and uses of SrcReg to DstReg and
     /// update the subregister number if it is not zero. If DstReg is a
     /// physical register and the existing subregister number of the def / use
     /// being updated is not zero, make sure to set it to the correct physical
     /// subregister.
     void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
 
-    /// eliminateUndefCopy - Handle copies of undef values.
-    bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
+    /// Handle copies of undef values.
+    bool eliminateUndefCopy(MachineInstr *CopyMI);
 
   public:
     static char ID; // Class identification, replacement for typeinfo
@@ -192,10 +214,10 @@ namespace {
 
     void releaseMemory() override;
 
-    /// runOnMachineFunction - pass entry point
+    /// This is the pass entry point.
     bool runOnMachineFunction(MachineFunction&) override;
 
-    /// print - Implement the dump method.
+    /// Implement the dump method.
     void print(raw_ostream &O, const Module* = nullptr) const override;
   };
} /// end anonymous namespace
 
@@ -407,7 +429,7 @@ void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
   ErasedInstrs.insert(MI);
 }
 
-/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// We found a non-trivially-coalescable copy with IntA
 /// being the source and IntB being the dest, thus this defines a value number
 /// in IntB. If the source value number (in IntA) is defined by a copy from B,
 /// see if we can merge these two pieces of B into a single value number,
@@ -492,6 +514,16 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   // Okay, merge "B1" into the same value number as "B0".
   if (BValNo != ValS->valno)
     IntB.MergeValueNumberInto(BValNo, ValS->valno);
+
+  // Do the same for the subregister segments.
+  for (LiveInterval::SubRange &S : IntB.subranges()) {
+    VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+    S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
+    VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
+    if (SubBValNo != SubValSNo)
+      S.MergeValueNumberInto(SubBValNo, SubValSNo);
+  }
+
   DEBUG(dbgs() << "   result = " << IntB << '\n');
 
   // If the source instruction was killing the source register before the
@@ -512,7 +544,7 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   return true;
 }
 
-/// hasOtherReachingDefs - Return true if there are definitions of IntB
+/// Return true if there are definitions of IntB
 /// other than BValNo val# that can reach uses of AValno val# of IntA.
 bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
                                              LiveInterval &IntB,
@@ -523,26 +555,37 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
   if (LIS->hasPHIKill(IntA, AValNo))
     return true;
 
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
+  for (LiveRange::Segment &ASeg : IntA.segments) {
+    if (ASeg.valno != AValNo) continue;
     LiveInterval::iterator BI =
-      std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+      std::upper_bound(IntB.begin(), IntB.end(), ASeg.start);
     if (BI != IntB.begin())
       --BI;
-    for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
+    for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
       if (BI->valno == BValNo)
         continue;
-      if (BI->start <= AI->start && BI->end > AI->start)
+      if (BI->start <= ASeg.start && BI->end > ASeg.start)
         return true;
-      if (BI->start > AI->start && BI->start < AI->end)
+      if (BI->start > ASeg.start && BI->start < ASeg.end)
        return true;
     }
   }
   return false;
 }
 
-/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// Copy segments with value number @p SrcValNo from liverange @p Src to live
+/// range @p Dst and use value number @p DstValNo there.
+static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
+                                 const LiveRange &Src, const VNInfo *SrcValNo)
+{
+  for (const LiveRange::Segment &S : Src.segments) {
+    if (S.valno != SrcValNo)
+      continue;
+    Dst.addSegment(LiveRange::Segment(S.start, S.end, DstValNo));
+  }
+}
+
+/// We found a non-trivially-coalescable copy with
 /// IntA being the source and IntB being the dest, thus this defines a value
 /// number in IntB. If the source value number (in IntA) is defined by a
 /// commutable instruction and its other operand is coalesced to the copy dest
@@ -559,7 +602,7 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
 ///
 ///  B2 = op B0 A2<kill>
 ///  ...
-///  B1 = B2      <- now an identify copy
+///  B1 = B2      <- now an identity copy
 ///  ...
 ///     = op B2   <- more uses
 ///
@@ -567,25 +610,23 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
 ///
 bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
                                                  MachineInstr *CopyMI) {
-  assert (!CP.isPhys());
-
-  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+  assert(!CP.isPhys());
 
   LiveInterval &IntA =
-    LIS->getInterval(CP.isFlipped() ?
CP.getDstReg() : CP.getSrcReg()); + LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); LiveInterval &IntB = - LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); // BValNo is a value number in B that is defined by a copy from A. 'B1' in // the example above. + SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(); VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx); - if (!BValNo || BValNo->def != CopyIdx) - return false; + assert(BValNo != nullptr && BValNo->def == CopyIdx); // AValNo is the value number in A that defines the copy, A3 in the example. VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true)); - assert(AValNo && "COPY source not live"); - if (AValNo->isPHIDef() || AValNo->isUnused()) + assert(AValNo && !AValNo->isUnused() && "COPY source not live"); + if (AValNo->isPHIDef()) return false; MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def); if (!DefMI) @@ -652,8 +693,6 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, MBB->insert(Pos, NewMI); MBB->erase(DefMI); } - unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false); - NewMI->getOperand(OpIdx).setIsKill(); // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g. // A = or A, B @@ -666,10 +705,13 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // Update uses of IntA of the specific Val# with IntB. for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg), - UE = MRI->use_end(); UI != UE;) { + UE = MRI->use_end(); + UI != UE; /* ++UI is below because of possible MI removal */) { MachineOperand &UseMO = *UI; - MachineInstr *UseMI = UseMO.getParent(); ++UI; + if (UseMO.isUndef()) + continue; + MachineInstr *UseMI = UseMO.getParent(); if (UseMI->isDebugValue()) { // FIXME These don't have an instruction index. Not clear we have enough // info to decide whether to do this replacement or not. For now do it. @@ -678,7 +720,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, } SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true); LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx); - if (US == IntA.end() || US->valno != AValNo) + assert(US != IntA.end() && "Use must be live"); + if (US->valno != AValNo) continue; // Kill flags are no longer accurate. They are recomputed after RA. UseMO.setIsKill(false); @@ -703,6 +746,16 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI); assert(DVNI->def == DefIdx); BValNo = IntB.MergeValueNumberInto(BValNo, DVNI); + for (LiveInterval::SubRange &S : IntB.subranges()) { + VNInfo *SubDVNI = S.getVNInfoAt(DefIdx); + if (!SubDVNI) + continue; + VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx); + assert(SubBValNo->def == CopyIdx); + VNInfo *Merged = S.MergeValueNumberInto(SubBValNo, SubDVNI); + Merged->def = CopyIdx; + } + ErasedInstrs.insert(UseMI); LIS->RemoveMachineInstrFromMaps(UseMI); UseMI->eraseFromParent(); @@ -710,22 +763,77 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // Extend BValNo by merging in IntA live segments of AValNo. Val# definition // is updated. 
-  VNInfo *ValNo = BValNo;
-  ValNo->def = AValNo->def;
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
-    IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
+  BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+  if (IntB.hasSubRanges()) {
+    if (!IntA.hasSubRanges()) {
+      unsigned Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
+      IntA.createSubRangeFrom(Allocator, Mask, IntA);
+    }
+    SlotIndex AIdx = CopyIdx.getRegSlot(true);
+    for (LiveInterval::SubRange &SA : IntA.subranges()) {
+      VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+      assert(ASubValNo != nullptr);
+
+      unsigned AMask = SA.LaneMask;
+      for (LiveInterval::SubRange &SB : IntB.subranges()) {
+        unsigned BMask = SB.LaneMask;
+        unsigned Common = BMask & AMask;
+        if (Common == 0)
+          continue;
+
+        DEBUG(
+            dbgs() << format("\t\tCopy+Merge %04X into %04X\n", BMask, Common));
+        unsigned BRest = BMask & ~AMask;
+        LiveInterval::SubRange *CommonRange;
+        if (BRest != 0) {
+          SB.LaneMask = BRest;
+          DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", BRest));
+          // Duplicate SubRange for newly merged common stuff.
+          CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
+        } else {
+          // We can reuse the existing SubRange.
+          SB.LaneMask = Common;
+          CommonRange = &SB;
+        }
+        LiveRange RangeCopy(SB, Allocator);
+
+        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
+        assert(BSubValNo->def == CopyIdx);
+        BSubValNo->def = ASubValNo->def;
+        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
+        AMask &= ~BMask;
+      }
+      if (AMask != 0) {
+        DEBUG(dbgs() << format("\t\tNew Lane %04X\n", AMask));
+        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
+        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
+        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
+      }
+      SA.removeValNo(ASubValNo);
+    }
   }
+
+  BValNo->def = AValNo->def;
+  addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
   DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
 
   IntA.removeValNo(AValNo);
+  // Remove value numbers in subranges (the case where both A and B have
+  // subranges has already been handled above).
+  if (!IntB.hasSubRanges()) {
+    SlotIndex AIdx = CopyIdx.getRegSlot(true);
+    for (LiveInterval::SubRange &SA : IntA.subranges()) {
+      VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+      assert(ASubValNo != nullptr);
+      SA.removeValNo(ASubValNo);
+    }
+  }
   DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
   ++numCommutes;
   return true;
 }
 
-/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// If the source of a copy is defined by a trivial
 /// computation, replace the copy by rematerializing the definition.
 bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
                                                 MachineInstr *CopyMI,
@@ -751,7 +859,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     IsDefCopy = true;
     return false;
   }
-  if (!DefMI->isAsCheapAsAMove())
+  if (!TII->isAsCheapAsAMove(DefMI))
     return false;
   if (!TII->isTriviallyReMaterializable(DefMI, AA))
     return false;
@@ -898,57 +1006,110 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
 
   // The source interval can become smaller because we removed a use.
   LIS->shrinkToUses(&SrcInt, &DeadDefs);
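
The subrange merging in removeCopyByCommutingDef above is driven by plain bit arithmetic on lane masks; a tiny sketch of that bookkeeping with hypothetical mask values:

    #include <cstdio>

    int main() {
      unsigned AMask = 0x0003; // lanes the A subrange covers
      unsigned BMask = 0x0006; // lanes the B subrange covers

      unsigned Common = BMask & AMask; // lanes that need a merged subrange
      unsigned BRest = BMask & ~AMask; // lanes B keeps unchanged
      unsigned ANew = AMask & ~BMask;  // lanes of A matched by no B subrange

      std::printf("common %04X, B-only %04X, A-only %04X\n",
                  Common, BRest, ANew);
      // prints: common 0002, B-only 0004, A-only 0001
    }
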
-  if (!DeadDefs.empty())
+  if (!DeadDefs.empty()) {
+    // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+    // to describe DstReg instead.
+    for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
+      MachineInstr *UseMI = UseMO.getParent();
+      if (UseMI->isDebugValue()) {
+        UseMO.setReg(DstReg);
+        DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+      }
+    }
     eliminateDeadDefs();
+  }
 
   return true;
 }
 
-/// eliminateUndefCopy - ProcessImpicitDefs may leave some copies of <undef>
+static void removeUndefValue(LiveRange &LR, SlotIndex At)
+{
+  VNInfo *VNInfo = LR.getVNInfoAt(At);
+  assert(VNInfo != nullptr && SlotIndex::isSameInstr(VNInfo->def, At));
+  LR.removeValNo(VNInfo);
+}
+
+/// ProcessImplicitDefs may leave some copies of <undef>
 /// values, it only removes local variables. When we have a copy like:
 ///
 ///   %vreg1 = COPY %vreg2<undef>
 ///
 /// We delete the copy and remove the corresponding value number from %vreg1.
 /// Any uses of that value number are marked as <undef>.
-bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
-                                           const CoalescerPair &CP) {
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+  // Note that we do not query CoalescerPair here but redo isMoveInstr as the
+  // CoalescerPair may have a new register class with adjusted subreg indices
+  // at this point.
+  unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+  isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
+
   SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
-  LiveInterval *SrcInt = &LIS->getInterval(CP.getSrcReg());
-  if (SrcInt->liveAt(Idx))
-    return false;
-  LiveInterval *DstInt = &LIS->getInterval(CP.getDstReg());
-  if (DstInt->liveAt(Idx))
+  const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+  // CopyMI is undef iff SrcReg is not live before the instruction.
+  if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
+    unsigned SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
+    for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
+      if ((SR.LaneMask & SrcMask) == 0)
+        continue;
+      if (SR.liveAt(Idx))
+        return false;
+    }
+  } else if (SrcLI.liveAt(Idx))
     return false;
 
-  // No intervals are live-in to CopyMI - it is undef.
-  if (CP.isFlipped())
-    DstInt = SrcInt;
-  SrcInt = nullptr;
+  DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
 
-  VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getRegSlot());
-  assert(DeadVNI && "No value defined in DstInt");
-  DstInt->removeValNo(DeadVNI);
+  // Remove any DstReg segments starting at the instruction.
+  LiveInterval &DstLI = LIS->getInterval(DstReg);
+  unsigned DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
+  SlotIndex RegIndex = Idx.getRegSlot();
+  for (LiveInterval::SubRange &SR : DstLI.subranges()) {
+    if ((SR.LaneMask & DstMask) == 0)
+      continue;
+    removeUndefValue(SR, RegIndex);
+
+    DstLI.removeEmptySubRanges();
+  }
+  // Remove value or merge with previous one in case of a subregister def.
+  if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
+    VNInfo *VNInfo = DstLI.getVNInfoAt(RegIndex);
+    DstLI.MergeValueNumberInto(VNInfo, PrevVNI);
+  } else {
+    removeUndefValue(DstLI, RegIndex);
+  }
 
-  // Find new undef uses.
-  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstInt->reg)) {
-    if (MO.isDef() || MO.isUndef())
+  // Mark uses as undef.
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) { + if (MO.isDef() /*|| MO.isUndef()*/) continue; - MachineInstr *MI = MO.getParent(); - SlotIndex Idx = LIS->getInstructionIndex(MI); - if (DstInt->liveAt(Idx)) + const MachineInstr &MI = *MO.getParent(); + SlotIndex UseIdx = LIS->getInstructionIndex(&MI); + unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + bool isLive; + if (UseMask != ~0u && DstLI.hasSubRanges()) { + isLive = false; + for (const LiveInterval::SubRange &SR : DstLI.subranges()) { + if ((SR.LaneMask & UseMask) == 0) + continue; + if (SR.liveAt(UseIdx)) { + isLive = true; + break; + } + } + } else + isLive = DstLI.liveAt(UseIdx); + if (isLive) continue; MO.setIsUndef(true); - DEBUG(dbgs() << "\tnew undef: " << Idx << '\t' << *MI); + DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI); } return true; } -/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and -/// update the subregister number if it is not zero. If DstReg is a -/// physical register and the existing subregister number of the def / use -/// being updated is not zero, make sure to set it to the correct physical -/// subregister. +/// Replace all defs and uses of SrcReg to DstReg and update the subregister +/// number if it is not zero. If DstReg is a physical register and the existing +/// subregister number of the def / use being updated is not zero, make sure to +/// set it to the correct physical subregister. void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx) { @@ -966,7 +1127,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, // the UseMI operands removes them from the SrcReg use-def chain, but when // SrcReg is DstReg we could encounter UseMI twice if it has multiple // operands mentioning the virtual register. - if (SrcReg == DstReg && !Visited.insert(UseMI)) + if (SrcReg == DstReg && !Visited.insert(UseMI).second) continue; SmallVector<unsigned,8> Ops; @@ -988,6 +1149,40 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, if (SubIdx && MO.isDef()) MO.setIsUndef(!Reads); + // A subreg use of a partially undef (super) register may be a complete + // undef use now and then has to be marked that way. + if (SubIdx != 0 && MO.isUse() && MRI->tracksSubRegLiveness()) { + if (!DstInt->hasSubRanges()) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + unsigned Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg); + DstInt->createSubRangeFrom(Allocator, Mask, *DstInt); + } + unsigned Mask = TRI->getSubRegIndexLaneMask(SubIdx); + bool IsUndef = true; + SlotIndex MIIdx = UseMI->isDebugValue() + ? LIS->getSlotIndexes()->getIndexBefore(UseMI) + : LIS->getInstructionIndex(UseMI); + SlotIndex UseIdx = MIIdx.getRegSlot(true); + for (LiveInterval::SubRange &S : DstInt->subranges()) { + if ((S.LaneMask & Mask) == 0) + continue; + if (S.liveAt(UseIdx)) { + IsUndef = false; + break; + } + } + if (IsUndef) { + MO.setIsUndef(true); + // We found out some subregister use is actually reading an undefined + // value. In some cases the whole vreg has become undefined at this + // point so we have to potentially shrink the main range if the + // use was ending a live segment there. + LiveQueryResult Q = DstInt->Query(MIIdx); + if (Q.valueOut() == nullptr) + ShrinkMainRange = true; + } + } + if (DstIsPhys) MO.substPhysReg(DstReg, *TRI); else @@ -1003,7 +1198,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, } } -/// canJoinPhys - Return true if a copy involving a physreg should be joined. 
+/// Return true if a copy involving a physreg should be joined. bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) { /// Always join simple intervals that are defined by a single copy from a /// reserved register. This doesn't increase register pressure, so it is @@ -1021,7 +1216,7 @@ bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) { return false; } -/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, +/// Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. This returns true /// if the copy was successfully coalesced away. If it is not currently /// possible to coalesce this interval, but it may be possible if other @@ -1064,8 +1259,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { } // Eliminate undefs. - if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) { - DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n"); + if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) { LIS->RemoveMachineInstrFromMaps(CopyMI); CopyMI->eraseFromParent(); return false; // Not coalescable. @@ -1077,12 +1271,22 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { if (CP.getSrcReg() == CP.getDstReg()) { LiveInterval &LI = LIS->getInterval(CP.getSrcReg()); DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n'); - LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI)); + const SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI); + LiveQueryResult LRQ = LI.Query(CopyIdx); if (VNInfo *DefVNI = LRQ.valueDefined()) { VNInfo *ReadVNI = LRQ.valueIn(); assert(ReadVNI && "No value before copy and no <undef> flag."); assert(ReadVNI != DefVNI && "Cannot read and define the same value."); LI.MergeValueNumberInto(DefVNI, ReadVNI); + + // Process subregister liveranges. + for (LiveInterval::SubRange &S : LI.subranges()) { + LiveQueryResult SLRQ = S.Query(CopyIdx); + if (VNInfo *SDefVNI = SLRQ.valueDefined()) { + VNInfo *SReadVNI = SLRQ.valueIn(); + S.MergeValueNumberInto(SDefVNI, SReadVNI); + } + } DEBUG(dbgs() << "\tMerged values: " << LI << '\n'); } LIS->RemoveMachineInstrFromMaps(CopyMI); @@ -1106,9 +1310,14 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { return false; } } else { + // When possible, let DstReg be the larger interval. + if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() > + LIS->getInterval(CP.getDstReg()).size()) + CP.flip(); + DEBUG({ - dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName() - << " with "; + dbgs() << "\tConsidering merging to " + << TRI->getRegClassName(CP.getNewRC()) << " with "; if (CP.getDstIdx() && CP.getSrcIdx()) dbgs() << PrintReg(CP.getDstReg()) << " in " << TRI->getSubRegIndexName(CP.getDstIdx()) << " and " @@ -1118,13 +1327,11 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n'; }); - - // When possible, let DstReg be the larger interval. - if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() > - LIS->getInterval(CP.getDstReg()).size()) - CP.flip(); } + ShrinkMask = 0; + ShrinkMainRange = false; + // Okay, attempt to join these two intervals. On failure, this returns false. // Otherwise, if one of the intervals being joined is a physreg, this method // always canonicalizes DstInt to be it. 
The output "SrcInt" will not have @@ -1179,6 +1386,22 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + // Shrink subregister ranges if necessary. + if (ShrinkMask != 0) { + LiveInterval &LI = LIS->getInterval(CP.getDstReg()); + for (LiveInterval::SubRange &S : LI.subranges()) { + if ((S.LaneMask & ShrinkMask) == 0) + continue; + DEBUG(dbgs() << "Shrink LaneUses (Lane " + << format("%04X", S.LaneMask) << ")\n"); + LIS->shrinkToUses(S, LI.reg); + } + } + if (ShrinkMainRange) { + LiveInterval &LI = LIS->getInterval(CP.getDstReg()); + LIS->shrinkToUses(&LI); + } + // SrcReg is guaranteed to be the register whose live interval that is // being merged. LIS->removeInterval(CP.getSrcReg()); @@ -1187,7 +1410,9 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF); DEBUG({ - dbgs() << "\tJoined. Result = "; + dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx()) + << " -> " << PrintReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n'; + dbgs() << "\tResult = "; if (CP.isPhys()) dbgs() << PrintReg(CP.getDstReg(), TRI); else @@ -1305,13 +1530,26 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { namespace { /// Track information about values in a single virtual register about to be /// joined. Objects of this class are always created in pairs - one for each -/// side of the CoalescerPair. +/// side of the CoalescerPair (or one for each lane of a side of the coalescer +/// pair) class JoinVals { - LiveInterval &LI; - - // Location of this register in the final joined register. - // Either CP.DstIdx or CP.SrcIdx. - unsigned SubIdx; + /// Live range we work on. + LiveRange &LR; + /// (Main) register we work on. + const unsigned Reg; + + // Reg (and therefore the values in this liverange) will end up as subregister + // SubIdx in the coalesced register. Either CP.DstIdx or CP.SrcIdx. + const unsigned SubIdx; + // The LaneMask that this liverange will occupy the coalesced register. May be + // smaller than the lanemask produced by SubIdx when merging subranges. + const unsigned LaneMask; + + /// This is true when joining sub register ranges, false when joining main + /// ranges. + const bool SubRangeJoin; + /// Whether the current LiveInterval tracks subregister liveness. + const bool TrackSubRegLiveness; // Values that will be present in the final live range. SmallVectorImpl<VNInfo*> &NewVNInfo; @@ -1402,27 +1640,28 @@ class JoinVals { // One entry per value number in LI. 
SmallVector<Val, 8> Vals;
 
-  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
-  VNInfo *stripCopies(VNInfo *VNI);
+  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
+  std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
+  bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other)
+    const;
   ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
   void computeAssignment(unsigned ValNo, JoinVals &Other);
   bool taintExtent(unsigned, unsigned, JoinVals&,
                    SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
-  bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+  bool usesLanes(const MachineInstr *MI, unsigned, unsigned, unsigned) const;
   bool isPrunedValue(unsigned ValNo, JoinVals &Other);
 
 public:
-  JoinVals(LiveInterval &li, unsigned subIdx,
-           SmallVectorImpl<VNInfo*> &newVNInfo,
-           const CoalescerPair &cp,
-           LiveIntervals *lis,
-           const TargetRegisterInfo *tri)
-    : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
-      Indexes(LIS->getSlotIndexes()), TRI(tri),
-      Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+  JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, unsigned LaneMask,
+           SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
+           LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
+           bool TrackSubRegLiveness)
+    : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
+      SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
+      NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
+      TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
   {}
 
-  /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+  /// Analyze defs in LR and compute a value mapping in NewVNInfo.
   /// Returns false if any conflicts were impossible to resolve.
   bool mapValues(JoinVals &Other);
 
@@ -1430,16 +1669,22 @@ public:
   /// Returns false if any conflicts were impossible to resolve.
   bool resolveConflicts(JoinVals &Other);
 
-  /// Prune the live range of values in Other.LI where they would conflict with
-  /// CR_Replace values in LI. Collect end points for restoring the live range
+  /// Prune the live range of values in Other.LR where they would conflict with
+  /// CR_Replace values in LR. Collect end points for restoring the live range
   /// after joining.
-  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
+                   bool changeInstrs);
+
+  // Removes subranges starting at copies that get removed. This sometimes
+  // happens when undefined subranges are copied around. These ranges contain
+  // no useful information and can be removed.
+  void pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask);
 
   /// Erase any machine instructions that have been coalesced away.
   /// Add erased instructions to ErasedInstrs.
   /// Add foreign virtual registers to ShrinkRegs if their live range ended at
   /// the erased instrs.
-  void eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+  void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                    SmallVectorImpl<unsigned> &ShrinkRegs);
 
   /// Get the value assignments suitable for passing to LiveInterval::join.
@@ -1450,10 +1695,11 @@ public:
 /// Compute the bitmask of lanes actually written by DefMI.
 /// Set Redef if there are any partial register definitions that depend on the
 /// previous value of the register.
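/// (Illustration, not tied to a particular target: if subregister indices
/// ssub0 and ssub1 map to lane masks 0x1 and 0x2, a def of %vreg:ssub1
/// contributes only lane 0x2, a full def of %vreg contributes 0x3, and a
/// partial redef of %vreg:ssub1 additionally sets Redef because it depends
/// on the old value of the remaining lanes.)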
-unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) { +unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) + const { unsigned L = 0; for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) { - if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef()) + if (!MO->isReg() || MO->getReg() != Reg || !MO->isDef()) continue; L |= TRI->getSubRegIndexLaneMask( TRI->composeSubRegIndices(SubIdx, MO->getSubReg())); @@ -1464,21 +1710,64 @@ unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) { } /// Find the ultimate value that VNI was copied from. -VNInfo *JoinVals::stripCopies(VNInfo *VNI) { +std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain( + const VNInfo *VNI) const { + unsigned Reg = this->Reg; + while (!VNI->isPHIDef()) { - MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def); + SlotIndex Def = VNI->def; + MachineInstr *MI = Indexes->getInstructionFromIndex(Def); assert(MI && "No defining instruction"); if (!MI->isFullCopy()) + return std::make_pair(VNI, Reg); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return std::make_pair(VNI, Reg); + + const LiveInterval &LI = LIS->getInterval(SrcReg); + const VNInfo *ValueIn; + // No subrange involved. + if (!SubRangeJoin || !LI.hasSubRanges()) { + LiveQueryResult LRQ = LI.Query(Def); + ValueIn = LRQ.valueIn(); + } else { + // Query subranges. Pick the first matching one. + ValueIn = nullptr; + for (const LiveInterval::SubRange &S : LI.subranges()) { + // Transform lanemask to a mask in the joined live interval. + unsigned SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); + if ((SMask & LaneMask) == 0) + continue; + LiveQueryResult LRQ = S.Query(Def); + ValueIn = LRQ.valueIn(); + break; + } + } + if (ValueIn == nullptr) break; - unsigned Reg = MI->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - break; - LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def); - if (!LRQ.valueIn()) - break; - VNI = LRQ.valueIn(); + VNI = ValueIn; + Reg = SrcReg; } - return VNI; + return std::make_pair(VNI, Reg); +} + +bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1, + const JoinVals &Other) const { + const VNInfo *Orig0; + unsigned Reg0; + std::tie(Orig0, Reg0) = followCopyChain(Value0); + if (Orig0 == Value1) + return true; + + const VNInfo *Orig1; + unsigned Reg1; + std::tie(Orig1, Reg1) = Other.followCopyChain(Value1); + + // The values are equal if they are defined at the same place and use the + // same register. Note that we cannot compare VNInfos directly as some of + // them might be from a copy created in mergeSubRangeInto() while the other + // is from the original LiveInterval. + return Orig0->def == Orig1->def && Reg0 == Reg1; } /// Analyze ValNo in this live range, and set all fields of Vals[ValNo]. @@ -1492,7 +1781,7 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { Val &V = Vals[ValNo]; assert(!V.isAnalyzed() && "Value has already been analyzed!"); - VNInfo *VNI = LI.getValNumInfo(ValNo); + VNInfo *VNI = LR.getValNumInfo(ValNo); if (VNI->isUnused()) { V.WriteLanes = ~0u; return CR_Keep; @@ -1502,46 +1791,56 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { const MachineInstr *DefMI = nullptr; if (VNI->isPHIDef()) { // Conservatively assume that all lanes in a PHI are valid. - V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx); + unsigned Lanes = SubRangeJoin ? 
1 : TRI->getSubRegIndexLaneMask(SubIdx); + V.ValidLanes = V.WriteLanes = Lanes; } else { DefMI = Indexes->getInstructionFromIndex(VNI->def); - bool Redef = false; - V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef); - - // If this is a read-modify-write instruction, there may be more valid - // lanes than the ones written by this instruction. - // This only covers partial redef operands. DefMI may have normal use - // operands reading the register. They don't contribute valid lanes. - // - // This adds ssub1 to the set of valid lanes in %src: - // - // %src:ssub1<def> = FOO - // - // This leaves only ssub1 valid, making any other lanes undef: - // - // %src:ssub1<def,read-undef> = FOO %src:ssub2 - // - // The <read-undef> flag on the def operand means that old lane values are - // not important. - if (Redef) { - V.RedefVNI = LI.Query(VNI->def).valueIn(); - assert(V.RedefVNI && "Instruction is reading nonexistent value"); - computeAssignment(V.RedefVNI->id, Other); - V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes; - } + assert(DefMI != nullptr); + if (SubRangeJoin) { + // We don't care about the lanes when joining subregister ranges. + V.ValidLanes = V.WriteLanes = 1; + } else { + bool Redef = false; + V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef); + + // If this is a read-modify-write instruction, there may be more valid + // lanes than the ones written by this instruction. + // This only covers partial redef operands. DefMI may have normal use + // operands reading the register. They don't contribute valid lanes. + // + // This adds ssub1 to the set of valid lanes in %src: + // + // %src:ssub1<def> = FOO + // + // This leaves only ssub1 valid, making any other lanes undef: + // + // %src:ssub1<def,read-undef> = FOO %src:ssub2 + // + // The <read-undef> flag on the def operand means that old lane values are + // not important. + if (Redef) { + V.RedefVNI = LR.Query(VNI->def).valueIn(); + assert((TrackSubRegLiveness || V.RedefVNI) && + "Instruction is reading nonexistent value"); + if (V.RedefVNI != nullptr) { + computeAssignment(V.RedefVNI->id, Other); + V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes; + } + } - // An IMPLICIT_DEF writes undef values. - if (DefMI->isImplicitDef()) { - // We normally expect IMPLICIT_DEF values to be live only until the end - // of their block. If the value is really live longer and gets pruned in - // another block, this flag is cleared again. - V.ErasableImplicitDef = true; - V.ValidLanes &= ~V.WriteLanes; + // An IMPLICIT_DEF writes undef values. + if (DefMI->isImplicitDef()) { + // We normally expect IMPLICIT_DEF values to be live only until the end + // of their block. If the value is really live longer and gets pruned in + // another block, this flag is cleared again. + V.ErasableImplicitDef = true; + V.ValidLanes &= ~V.WriteLanes; + } } } // Find the value in Other that overlaps VNI->def, if any. - LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def); + LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def); // It is possible that both values are defined by the same instruction, or // the values are PHIs defined in the same block. When that happens, the two @@ -1611,8 +1910,14 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { return CR_Replace; // Check for simple erasable conflicts. - if (DefMI->isImplicitDef()) + if (DefMI->isImplicitDef()) { + // We need the def for the subregister if there is nothing else live at the + // subrange at this point. 
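+    // (That is: when none of the lanes this IMPLICIT_DEF writes are valid or
+    // written in the other value, the IMPLICIT_DEF is the only definition of
+    // those lanes and must be kept, so we answer CR_Replace rather than
+    // CR_Erase.)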
+      if (TrackSubRegLiveness
+          && (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)) == 0)
+        return CR_Replace;
     return CR_Erase;
+  }
 
   // Include the non-conflict where DefMI is a coalescable copy that kills
   // OtherVNI. We still want the copy erased and value numbers merged.
@@ -1633,8 +1938,8 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   //   %other = COPY %ext
   //   %this  = COPY %ext <-- Erase this copy
   //
-  if (DefMI->isFullCopy() && !CP.isPartial() &&
-      stripCopies(VNI) == stripCopies(V.OtherVNI))
+  if (DefMI->isFullCopy() && !CP.isPartial()
+      && valuesIdentical(VNI, V.OtherVNI, Other))
     return CR_Erase;
 
   // If the lanes written by this instruction were all undef in OtherVNI, it is
@@ -1669,7 +1974,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   // VNI is clobbering live lanes in OtherVNI, but there is still the
   // possibility that no instructions actually read the clobbered lanes.
   // If we're clobbering all the lanes in OtherVNI, at least one must be read.
-  // Otherwise Other.LI wouldn't be live here.
+  // Otherwise Other.LR wouldn't be live here.
   if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
     return CR_Impossible;
 
@@ -1690,7 +1995,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   return CR_Unresolved;
 }
 
-/// Compute the value assignment for ValNo in LI.
+/// Compute the value assignment for ValNo in LR.
 /// This may be called recursively by analyzeValue(), but never for a ValNo on
 /// the stack.
 void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
@@ -1708,42 +2013,48 @@ void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
     assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
     assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
     Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
-    DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
-                 << LI.getValNumInfo(ValNo)->def << " into "
-                 << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+    DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
+                 << LR.getValNumInfo(ValNo)->def << " into "
+                 << PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
                  << V.OtherVNI->def << " --> @"
                  << NewVNInfo[Assignments[ValNo]]->def << '\n');
     break;
   case CR_Replace:
-  case CR_Unresolved:
+  case CR_Unresolved: {
     // The other value is going to be pruned if this join is successful.
     assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
-    Other.Vals[V.OtherVNI->id].Pruned = true;
+    Val &OtherV = Other.Vals[V.OtherVNI->id];
+    // We cannot erase an IMPLICIT_DEF if we don't have valid values for all
+    // its lanes.
+    if ((OtherV.WriteLanes & ~V.ValidLanes) != 0 && TrackSubRegLiveness)
+      OtherV.ErasableImplicitDef = false;
+    OtherV.Pruned = true;
+  }
     // Fall through.
   default:
     // This value number needs to go in the final joined live range.
Assignments[ValNo] = NewVNInfo.size(); - NewVNInfo.push_back(LI.getValNumInfo(ValNo)); + NewVNInfo.push_back(LR.getValNumInfo(ValNo)); break; } } bool JoinVals::mapValues(JoinVals &Other) { - for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) { + for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { computeAssignment(i, Other); if (Vals[i].Resolution == CR_Impossible) { - DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i - << '@' << LI.getValNumInfo(i)->def << '\n'); + DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << i + << '@' << LR.getValNumInfo(i)->def << '\n'); return false; } } return true; } -/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute +/// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute /// the extent of the tainted lanes in the block. /// -/// Multiple values in Other.LI can be affected since partial redefinitions can +/// Multiple values in Other.LR can be affected since partial redefinitions can /// preserve previously tainted lanes. /// /// 1 %dst = VLOAD <-- Define all lanes in %dst @@ -1758,23 +2069,23 @@ bool JoinVals::mapValues(JoinVals &Other) { bool JoinVals:: taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) { - VNInfo *VNI = LI.getValNumInfo(ValNo); + VNInfo *VNI = LR.getValNumInfo(ValNo); MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def); SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB); - // Scan Other.LI from VNI.def to MBBEnd. - LiveInterval::iterator OtherI = Other.LI.find(VNI->def); - assert(OtherI != Other.LI.end() && "No conflict?"); + // Scan Other.LR from VNI.def to MBBEnd. + LiveInterval::iterator OtherI = Other.LR.find(VNI->def); + assert(OtherI != Other.LR.end() && "No conflict?"); do { // OtherI is pointing to a tainted value. Abort the join if the tainted // lanes escape the block. SlotIndex End = OtherI->end; if (End >= MBBEnd) { - DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':' + DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':' << OtherI->valno->id << '@' << OtherI->start << '\n'); return false; } - DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':' + DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':' << OtherI->valno->id << '@' << OtherI->start << " to " << End << '\n'); // A dead def is not a problem. @@ -1783,7 +2094,7 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, TaintExtent.push_back(std::make_pair(End, TaintedLanes)); // Check for another def in the MBB. - if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd) + if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd) break; // Lanes written by the new def are no longer tainted. @@ -1797,8 +2108,8 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, /// Return true if MI uses any of the given Lanes from Reg. /// This does not include partial redefinitions of Reg. 
-bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx, - unsigned Lanes) { +bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx, + unsigned Lanes) const { if (MI->isDebugValue()) return false; for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { @@ -1814,16 +2125,19 @@ bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx, } bool JoinVals::resolveConflicts(JoinVals &Other) { - for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) { + for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { Val &V = Vals[i]; assert (V.Resolution != CR_Impossible && "Unresolvable conflict"); if (V.Resolution != CR_Unresolved) continue; - DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i - << '@' << LI.getValNumInfo(i)->def << '\n'); + DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i + << '@' << LR.getValNumInfo(i)->def << '\n'); + if (SubRangeJoin) + return false; + ++NumLaneConflicts; assert(V.OtherVNI && "Inconsistent conflict resolution."); - VNInfo *VNI = LI.getValNumInfo(i); + VNInfo *VNI = LR.getValNumInfo(i); const Val &OtherV = Other.Vals[V.OtherVNI->id]; // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the @@ -1853,7 +2167,7 @@ bool JoinVals::resolveConflicts(JoinVals &Other) { unsigned TaintNum = 0; for(;;) { assert(MI != MBB->end() && "Bad LastMI"); - if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) { + if (usesLanes(MI, Other.Reg, Other.SubIdx, TaintedLanes)) { DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI); return false; } @@ -1875,7 +2189,7 @@ bool JoinVals::resolveConflicts(JoinVals &Other) { return true; } -// Determine if ValNo is a copy of a value number in LI or Other.LI that will +// Determine if ValNo is a copy of a value number in LR or Other.LR that will // be pruned: // // %dst = COPY %src @@ -1898,15 +2212,16 @@ bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) { } void JoinVals::pruneValues(JoinVals &Other, - SmallVectorImpl<SlotIndex> &EndPoints) { - for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) { - SlotIndex Def = LI.getValNumInfo(i)->def; + SmallVectorImpl<SlotIndex> &EndPoints, + bool changeInstrs) { + for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { + SlotIndex Def = LR.getValNumInfo(i)->def; switch (Vals[i].Resolution) { case CR_Keep: break; case CR_Replace: { - // This value takes precedence over the value in Other.LI. - LIS->pruneValue(&Other.LI, Def, &EndPoints); + // This value takes precedence over the value in Other.LR. + LIS->pruneValue(Other.LR, Def, &EndPoints); // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF // instructions are only inserted to provide a live-out value for PHI // predecessors, so the instruction should simply go away once its value @@ -1915,34 +2230,37 @@ void JoinVals::pruneValues(JoinVals &Other, bool EraseImpDef = OtherV.ErasableImplicitDef && OtherV.Resolution == CR_Keep; if (!Def.isBlock()) { - // Remove <def,read-undef> flags. This def is now a partial redef. - // Also remove <def,dead> flags since the joined live range will - // continue past this instruction. - for (MIOperands MO(Indexes->getInstructionFromIndex(Def)); - MO.isValid(); ++MO) - if (MO->isReg() && MO->isDef() && MO->getReg() == LI.reg) { - MO->setIsUndef(EraseImpDef); - MO->setIsDead(false); + if (changeInstrs) { + // Remove <def,read-undef> flags. This def is now a partial redef. + // Also remove <def,dead> flags since the joined live range will + // continue past this instruction. 
+        for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
+             MO.isValid(); ++MO) {
+          if (MO->isReg() && MO->isDef() && MO->getReg() == Reg) {
+            MO->setIsUndef(EraseImpDef);
+            MO->setIsDead(false);
+          }
         }
+      }
       // This value will reach instructions below, but we need to make sure
       // the live range also reaches the instruction at Def.
       if (!EraseImpDef)
         EndPoints.push_back(Def);
     }
-    DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
-                 << ": " << Other.LI << '\n');
+    DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << Def
+                 << ": " << Other.LR << '\n');
     break;
   }
   case CR_Erase:
   case CR_Merge:
     if (isPrunedValue(i, Other)) {
-      // This value is ultimately a copy of a pruned value in LI or Other.LI.
+      // This value is ultimately a copy of a pruned value in LR or Other.LR.
       // We can no longer trust the value mapping computed by
       // computeAssignment(), the value that was originally copied could have
      // been replaced.
-      LIS->pruneValue(&LI, Def, &EndPoints);
-      DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
-                   << Def << ": " << LI << '\n');
+      LIS->pruneValue(LR, Def, &EndPoints);
+      DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
+                   << Def << ": " << LR << '\n');
     }
     break;
   case CR_Unresolved:
@@ -1952,11 +2270,49 @@ void JoinVals::pruneValues(JoinVals &Other,
   }
 }
 
-void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask)
+{
+  // Look for values being erased.
+  bool DidPrune = false;
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+    if (Vals[i].Resolution != CR_Erase)
+      continue;
+
+    // Check subranges at the point where the copy will be removed.
+    SlotIndex Def = LR.getValNumInfo(i)->def;
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      LiveQueryResult Q = S.Query(Def);
+
+      // If a subrange starts at the copy then an undefined value has been
+      // copied and we must remove that subrange value as well.
+      VNInfo *ValueOut = Q.valueOutOrDead();
+      if (ValueOut != nullptr && Q.valueIn() == nullptr) {
+        DEBUG(dbgs() << "\t\tPrune sublane " << format("%04X", S.LaneMask)
+                     << " at " << Def << "\n");
+        LIS->pruneValue(S, Def, nullptr);
+        DidPrune = true;
+        // Mark value number as unused.
+        ValueOut->markUnused();
+        continue;
+      }
+      // If a subrange ends at the copy, then a value was copied but only
+      // partially used later. Shrink the subregister range appropriately.
+      if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
+        DEBUG(dbgs() << "\t\tDead uses at sublane "
+                     << format("%04X", S.LaneMask) << " at " << Def << "\n");
+        ShrinkMask |= S.LaneMask;
+      }
+    }
+  }
+  if (DidPrune)
+    LI.removeEmptySubRanges();
+}
+
+void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                            SmallVectorImpl<unsigned> &ShrinkRegs) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     // Get the def location before markUnused() below invalidates it.
-    SlotIndex Def = LI.getValNumInfo(i)->def;
+    SlotIndex Def = LR.getValNumInfo(i)->def;
     switch (Vals[i].Resolution) {
     case CR_Keep:
       // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
      // longer. The IMPLICIT_DEF instructions are only inserted by
      // PHIElimination to guarantee that all PHI predecessors have a value.
       if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
         break;
-      // Remove value number i from LI.
Note that this VNInfo is still present + // Remove value number i from LR. Note that this VNInfo is still present // in NewVNInfo, so it will appear as an unused value number in the final // joined interval. - LI.getValNumInfo(i)->markUnused(); - LI.removeValNo(LI.getValNumInfo(i)); - DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LI << '\n'); + LR.getValNumInfo(i)->markUnused(); + LR.removeValNo(LR.getValNumInfo(i)); + DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n'); // FALL THROUGH. case CR_Erase: { @@ -1993,12 +2349,96 @@ void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs, } } +void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, + unsigned LaneMask, + const CoalescerPair &CP) { + SmallVector<VNInfo*, 16> NewVNInfo; + JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask, + NewVNInfo, CP, LIS, TRI, true, true); + JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask, + NewVNInfo, CP, LIS, TRI, true, true); + + /// Compute NewVNInfo and resolve conflicts (see also joinVirtRegs()) + /// Conflicts should already be resolved so the mapping/resolution should + /// always succeed. + if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) + llvm_unreachable("Can't join subrange although main ranges are compatible"); + if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals)) + llvm_unreachable("Can't join subrange although main ranges are compatible"); + + // The merging algorithm in LiveInterval::join() can't handle conflicting + // value mappings, so we need to remove any live ranges that overlap a + // CR_Replace resolution. Collect a set of end points that can be used to + // restore the live range after joining. + SmallVector<SlotIndex, 8> EndPoints; + LHSVals.pruneValues(RHSVals, EndPoints, false); + RHSVals.pruneValues(LHSVals, EndPoints, false); + + LRange.verify(); + RRange.verify(); + + // Join RRange into LHS. + LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(), + NewVNInfo); + + DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n"); + if (EndPoints.empty()) + return; + + // Recompute the parts of the live range we had to remove because of + // CR_Replace conflicts. + DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() + << " points: " << LRange << '\n'); + LIS->extendToIndices(LRange, EndPoints); +} + +void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, + const LiveRange &ToMerge, + unsigned LaneMask, CoalescerPair &CP) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + for (LiveInterval::SubRange &R : LI.subranges()) { + unsigned RMask = R.LaneMask; + // LaneMask of subregisters common to subrange R and ToMerge. + unsigned Common = RMask & LaneMask; + // There is nothing to do without common subregs. + if (Common == 0) + continue; + + DEBUG(dbgs() << format("\t\tCopy+Merge %04X into %04X\n", RMask, Common)); + // LaneMask of subregisters contained in the R range but not in ToMerge, + // they have to split into their own subrange. + unsigned LRest = RMask & ~LaneMask; + LiveInterval::SubRange *CommonRange; + if (LRest != 0) { + R.LaneMask = LRest; + DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", LRest)); + // Duplicate SubRange for newly merged common stuff. + CommonRange = LI.createSubRangeFrom(Allocator, Common, R); + } else { + // Reuse the existing range. 
+ R.LaneMask = Common; + CommonRange = &R; + } + LiveRange RangeCopy(ToMerge, Allocator); + joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); + LaneMask &= ~RMask; + } + + if (LaneMask != 0) { + DEBUG(dbgs() << format("\t\tNew Lane %04X\n", LaneMask)); + LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); + } +} + bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { SmallVector<VNInfo*, 16> NewVNInfo; LiveInterval &RHS = LIS->getInterval(CP.getSrcReg()); LiveInterval &LHS = LIS->getInterval(CP.getDstReg()); - JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI); - JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI); + bool TrackSubRegLiveness = MRI->tracksSubRegLiveness(); + JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), 0, NewVNInfo, CP, LIS, + TRI, false, TrackSubRegLiveness); + JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), 0, NewVNInfo, CP, LIS, + TRI, false, TrackSubRegLiveness); DEBUG(dbgs() << "\t\tRHS = " << RHS << "\n\t\tLHS = " << LHS @@ -2014,14 +2454,55 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { return false; // All clear, the live ranges can be merged. + if (RHS.hasSubRanges() || LHS.hasSubRanges()) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + + // Transform lanemasks from the LHS to masks in the coalesced register and + // create initial subranges if necessary. + unsigned DstIdx = CP.getDstIdx(); + if (!LHS.hasSubRanges()) { + unsigned Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(DstIdx); + // LHS must support subregs or we wouldn't be in this codepath. + assert(Mask != 0); + LHS.createSubRangeFrom(Allocator, Mask, LHS); + } else if (DstIdx != 0) { + // Transform LHS lanemasks to new register class if necessary. + for (LiveInterval::SubRange &R : LHS.subranges()) { + unsigned Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask); + R.LaneMask = Mask; + } + } + DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg()) + << ' ' << LHS << '\n'); + + // Determine lanemasks of RHS in the coalesced register and merge subranges. + unsigned SrcIdx = CP.getSrcIdx(); + if (!RHS.hasSubRanges()) { + unsigned Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(SrcIdx); + mergeSubRangeInto(LHS, RHS, Mask, CP); + } else { + // Pair up subranges and merge. + for (LiveInterval::SubRange &R : RHS.subranges()) { + unsigned Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); + mergeSubRangeInto(LHS, R, Mask, CP); + } + } + + DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); + + LHSVals.pruneSubRegValues(LHS, ShrinkMask); + RHSVals.pruneSubRegValues(LHS, ShrinkMask); + } // The merging algorithm in LiveInterval::join() can't handle conflicting // value mappings, so we need to remove any live ranges that overlap a // CR_Replace resolution. Collect a set of end points that can be used to // restore the live range after joining. SmallVector<SlotIndex, 8> EndPoints; - LHSVals.pruneValues(RHSVals, EndPoints); - RHSVals.pruneValues(LHSVals, EndPoints); + LHSVals.pruneValues(RHSVals, EndPoints, true); + RHSVals.pruneValues(LHSVals, EndPoints, true); // Erase COPY and IMPLICIT_DEF instructions. This may cause some external // registers to require trimming. 
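As an aside on the lane-mask bookkeeping in mergeSubRangeInto() above: the split into a common part and a remainder is plain bit arithmetic. A minimal standalone sketch with hypothetical mask values (not the LLVM API itself):

    #include <cstdio>

    int main() {
      // Hypothetical lane masks: an existing subrange R covers lanes 0-3,
      // the incoming range (ToMerge) covers lanes 2-5.
      unsigned RMask    = 0x000F;
      unsigned LaneMask = 0x003C;

      unsigned Common = RMask & LaneMask;  // 0x000C: joined value by value
      unsigned LRest  = RMask & ~LaneMask; // 0x0003: stays in the old subrange
      unsigned Fresh  = LaneMask & ~RMask; // 0x0030: becomes a new subrange
                                           // copied from ToMerge
      std::printf("common %04X rest %04X new %04X\n", Common, LRest, Fresh);
      return 0;
    }

The Common and LRest values mirror the computation in the loop above; Fresh corresponds to the final "if (LaneMask != 0)" case once all existing subranges have been visited.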
@@ -2040,19 +2521,18 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { MRI->clearKillFlags(LHS.reg); MRI->clearKillFlags(RHS.reg); - if (EndPoints.empty()) - return true; + if (!EndPoints.empty()) { + // Recompute the parts of the live range we had to remove because of + // CR_Replace conflicts. + DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() + << " points: " << LHS << '\n'); + LIS->extendToIndices((LiveRange&)LHS, EndPoints); + } - // Recompute the parts of the live range we had to remove because of - // CR_Replace conflicts. - DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() - << " points: " << LHS << '\n'); - LIS->extendToIndices(LHS, EndPoints); return true; } -/// joinIntervals - Attempt to join these two intervals. On failure, this -/// returns false. +/// Attempt to join these two intervals. On failure, this returns false. bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) { return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP); } @@ -2224,8 +2704,8 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { MF = &fn; MRI = &fn.getRegInfo(); TM = &fn.getTarget(); - TRI = TM->getRegisterInfo(); - TII = TM->getInstrInfo(); + TRI = TM->getSubtargetImpl()->getRegisterInfo(); + TII = TM->getSubtargetImpl()->getInstrInfo(); LIS = &getAnalysis<LiveIntervals>(); AA = &getAnalysis<AliasAnalysis>(); Loops = &getAnalysis<MachineLoopInfo>(); @@ -2266,7 +2746,22 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { continue; if (MRI->recomputeRegClass(Reg, *TM)) { DEBUG(dbgs() << PrintReg(Reg) << " inflated to " - << MRI->getRegClass(Reg)->getName() << '\n'); + << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n'); + LiveInterval &LI = LIS->getInterval(Reg); + unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + if (MaxMask == 0) { + // If the inflated register class does not support subregisters anymore + // remove the subranges. + LI.clearSubRanges(); + } else { + // If subranges are still supported, then the same subregs should still + // be supported. +#ifndef NDEBUG + for (LiveInterval::SubRange &S : LI.subranges()) { + assert ((S.LaneMask & ~MaxMask) == 0); + } +#endif + } ++NumInflated; } } @@ -2277,7 +2772,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { return true; } -/// print - Implement the dump method. +/// Implement the dump method. void RegisterCoalescer::print(raw_ostream &O, const Module* m) const { LIS->print(O, m); } diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h index e57ceab37d0d..04067a1427af 100644 --- a/lib/CodeGen/RegisterCoalescer.h +++ b/lib/CodeGen/RegisterCoalescer.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_REGISTER_COALESCER_H -#define LLVM_CODEGEN_REGISTER_COALESCER_H +#ifndef LLVM_LIB_CODEGEN_REGISTERCOALESCER_H +#define LLVM_LIB_CODEGEN_REGISTERCOALESCER_H namespace llvm { @@ -22,38 +22,36 @@ namespace llvm { class TargetRegisterClass; class TargetInstrInfo; - /// CoalescerPair - A helper class for register coalescers. When deciding if + /// A helper class for register coalescers. When deciding if /// two registers can be coalesced, CoalescerPair can determine if a copy /// instruction would become an identity copy after coalescing. class CoalescerPair { const TargetRegisterInfo &TRI; - /// DstReg - The register that will be left after coalescing. It can be a + /// The register that will be left after coalescing. 
It can be a /// virtual or physical register. unsigned DstReg; - /// SrcReg - the virtual register that will be coalesced into dstReg. + /// The virtual register that will be coalesced into dstReg. unsigned SrcReg; - /// DstIdx - The sub-register index of the old DstReg in the new coalesced - /// register. + /// The sub-register index of the old DstReg in the new coalesced register. unsigned DstIdx; - /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced - /// register. + /// The sub-register index of the old SrcReg in the new coalesced register. unsigned SrcIdx; - /// Partial - True when the original copy was a partial subregister copy. + /// True when the original copy was a partial subregister copy. bool Partial; - /// CrossClass - True when both regs are virtual, and newRC is constrained. + /// True when both regs are virtual and newRC is constrained. bool CrossClass; - /// Flipped - True when DstReg and SrcReg are reversed from the original + /// True when DstReg and SrcReg are reversed from the original /// copy instruction. bool Flipped; - /// NewRC - The register class of the coalesced register, or NULL if DstReg + /// The register class of the coalesced register, or NULL if DstReg /// is a physreg. This register class may be a super-register of both /// SrcReg and DstReg. const TargetRegisterClass *NewRC; @@ -70,49 +68,47 @@ namespace llvm { : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0), Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {} - /// setRegisters - set registers to match the copy instruction MI. Return + /// Set registers to match the copy instruction MI. Return /// false if MI is not a coalescable copy instruction. bool setRegisters(const MachineInstr*); - /// flip - Swap SrcReg and DstReg. Return false if swapping is impossible + /// Swap SrcReg and DstReg. Return false if swapping is impossible /// because DstReg is a physical register, or SubIdx is set. bool flip(); - /// isCoalescable - Return true if MI is a copy instruction that will become + /// Return true if MI is a copy instruction that will become /// an identity copy after coalescing. bool isCoalescable(const MachineInstr*) const; - /// isPhys - Return true if DstReg is a physical register. + /// Return true if DstReg is a physical register. bool isPhys() const { return !NewRC; } - /// isPartial - Return true if the original copy instruction did not copy + /// Return true if the original copy instruction did not copy /// the full register, but was a subreg operation. bool isPartial() const { return Partial; } - /// isCrossClass - Return true if DstReg is virtual and NewRC is a smaller + /// Return true if DstReg is virtual and NewRC is a smaller /// register class than DstReg's. bool isCrossClass() const { return CrossClass; } - /// isFlipped - Return true when getSrcReg is the register being defined by + /// Return true when getSrcReg is the register being defined by /// the original copy instruction. bool isFlipped() const { return Flipped; } - /// getDstReg - Return the register (virtual or physical) that will remain + /// Return the register (virtual or physical) that will remain /// after coalescing. unsigned getDstReg() const { return DstReg; } - /// getSrcReg - Return the virtual register that will be coalesced away. + /// Return the virtual register that will be coalesced away. unsigned getSrcReg() const { return SrcReg; } - /// getDstIdx - Return the subregister index that DstReg will be coalesced - /// into, or 0. 
+ /// Return the subregister index that DstReg will be coalesced into, or 0. unsigned getDstIdx() const { return DstIdx; } - /// getSrcIdx - Return the subregister index that SrcReg will be coalesced - /// into, or 0. + /// Return the subregister index that SrcReg will be coalesced into, or 0. unsigned getSrcIdx() const { return SrcIdx; } - /// getNewRC - Return the register class of the coalesced register. + /// Return the register class of the coalesced register. const TargetRegisterClass *getNewRC() const { return NewRC; } }; } // End llvm namespace diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 617e45989c66..9925efbff24b 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -185,7 +184,7 @@ void RegPressureTracker::init(const MachineFunction *mf, reset(); MF = mf; - TRI = MF->getTarget().getRegisterInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); RCI = rci; MRI = &MF->getRegInfo(); MBB = mbb; diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 72b628589e47..7626dd29c00a 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -24,24 +24,16 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "reg-scavenging" -/// setUsed - Set the register and its sub-registers as being used. -void RegScavenger::setUsed(unsigned Reg) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - RegsAvailable.reset(*SubRegs); -} - -bool RegScavenger::isAliasUsed(unsigned Reg) const { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - if (isUsed(*AI, *AI == Reg)) - return true; - return false; +/// setUsed - Set the register units of this register as used. +void RegScavenger::setRegUsed(unsigned Reg) { + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) + RegUnitsAvailable.reset(*RUI); } void RegScavenger::initRegState() { @@ -51,8 +43,8 @@ void RegScavenger::initRegState() { I->Restore = nullptr; } - // All registers started out unused. - RegsAvailable.set(); + // All register units start out unused. + RegUnitsAvailable.set(); if (!MBB) return; @@ -60,22 +52,21 @@ void RegScavenger::initRegState() { // Live-in registers are in use. for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) - setUsed(*I); + setRegUsed(*I); // Pristine CSRs are also unavailable. 
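  // (Pristine registers are callee-saved registers that no instruction in
  // the function mentions; they still hold values the caller expects to be
  // preserved, so the scavenger must not hand them out either.)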
BitVector PR = MBB->getParent()->getFrameInfo()->getPristineRegs(MBB); for (int I = PR.find_first(); I>0; I = PR.find_next(I)) - setUsed(I); + setRegUsed(I); } void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { MachineFunction &MF = *mbb->getParent(); - const TargetMachine &TM = MF.getTarget(); - TII = TM.getInstrInfo(); - TRI = TM.getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) && + assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); // It is not possible to use the register scavenger after late optimization @@ -85,17 +76,11 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { // Self-initialize. if (!MBB) { - NumPhysRegs = TRI->getNumRegs(); - RegsAvailable.resize(NumPhysRegs); - KillRegs.resize(NumPhysRegs); - DefRegs.resize(NumPhysRegs); - - // Create callee-saved registers bitvector. - CalleeSavedRegs.resize(NumPhysRegs); - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); - if (CSRegs != nullptr) - for (unsigned i = 0; CSRegs[i]; ++i) - CalleeSavedRegs.set(CSRegs[i]); + NumRegUnits = TRI->getNumRegUnits(); + RegUnitsAvailable.resize(NumRegUnits); + KillRegUnits.resize(NumRegUnits); + DefRegUnits.resize(NumRegUnits); + TmpRegUnits.resize(NumRegUnits); } MBB = mbb; @@ -104,10 +89,9 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { Tracking = false; } -void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - BV.set(*SubRegs); +void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) { + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) + BV.set(*RUI); } void RegScavenger::determineKillsAndDefs() { @@ -122,12 +106,25 @@ void RegScavenger::determineKillsAndDefs() { // predicated, conservatively assume "kill" markers do not actually kill the // register. Similarly ignores "dead" markers. bool isPred = TII->isPredicated(MI); - KillRegs.reset(); - DefRegs.reset(); + KillRegUnits.reset(); + DefRegUnits.reset(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - if (MO.isRegMask()) - (isPred ? DefRegs : KillRegs).setBitsNotInMask(MO.getRegMask()); + if (MO.isRegMask()) { + + TmpRegUnits.clear(); + for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; ++RU) { + for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) { + if (MO.clobbersPhysReg(*RURI)) { + TmpRegUnits.set(RU); + break; + } + } + } + + // Apply the mask. + (isPred ? DefRegUnits : KillRegUnits) |= TmpRegUnits; + } if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); @@ -139,13 +136,13 @@ void RegScavenger::determineKillsAndDefs() { if (MO.isUndef()) continue; if (!isPred && MO.isKill()) - addRegWithSubRegs(KillRegs, Reg); + addRegUnits(KillRegUnits, Reg); } else { assert(MO.isDef()); if (!isPred && MO.isDead()) - addRegWithSubRegs(KillRegs, Reg); + addRegUnits(KillRegUnits, Reg); else - addRegWithSubRegs(DefRegs, Reg); + addRegUnits(DefRegUnits, Reg); } } } @@ -158,8 +155,8 @@ void RegScavenger::unprocess() { determineKillsAndDefs(); // Commit the changes. 
- setUsed(KillRegs); - setUnused(DefRegs); + setUsed(KillRegUnits); + setUnused(DefRegUnits); } if (MBBI == MBB->begin()) { @@ -208,7 +205,7 @@ void RegScavenger::forward() { if (MO.isUse()) { if (MO.isUndef()) continue; - if (!isUsed(Reg)) { + if (!isRegUsed(Reg)) { // Check if it's partial live: e.g. // D0 = insert_subreg D0<undef>, S0 // ... D0 @@ -219,15 +216,23 @@ void RegScavenger::forward() { // insert_subreg around causes both correctness and performance issues. bool SubUsed = false; for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - if (isUsed(*SubRegs)) { + if (isRegUsed(*SubRegs)) { SubUsed = true; break; } - if (!SubUsed) { + bool SuperUsed = false; + for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) { + if (isRegUsed(*SR)) { + SuperUsed = true; + break; + } + } + if (!SubUsed && !SuperUsed) { MBB->getParent()->verify(nullptr, "In Register Scavenger"); llvm_unreachable("Using an undefined register!"); } (void)SubUsed; + (void)SuperUsed; } } else { assert(MO.isDef()); @@ -243,23 +248,23 @@ void RegScavenger::forward() { #endif // NDEBUG // Commit the changes. - setUnused(KillRegs); - setUsed(DefRegs); + setUnused(KillRegUnits); + setUsed(DefRegUnits); } -void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) { - used = RegsAvailable; - used.flip(); - if (includeReserved) - used |= MRI->getReservedRegs(); - else - used.reset(MRI->getReservedRegs()); +bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const { + if (includeReserved && isReserved(Reg)) + return true; + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) + if (!RegUnitsAvailable.test(*RUI)) + return true; + return false; } unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) - if (!isAliasUsed(*I)) { + if (!isRegUsed(*I)) { DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(*I) << "\n"); return *I; @@ -273,13 +278,13 @@ BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) { BitVector Mask(TRI->getNumRegs()); for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I != E; ++I) - if (!isAliasUsed(*I)) + if (!isRegUsed(*I)) Mask.set(*I); return Mask; } /// findSurvivorReg - Return the candidate register that is unused for the -/// longest after StargMII. UseMI is set to the instruction where the search +/// longest after StartMII. UseMI is set to the instruction where the search /// stopped. /// /// No more than InstrLimit instructions are inspected. @@ -375,9 +380,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, } // Try to find a register that's unused if there is one, as then we won't - // have to spill. Search explicitly rather than masking out based on - // RegsAvailable, as RegsAvailable does not take aliases into account. - // That's what getRegsAvailable() is for. + // have to spill. BitVector Available = getRegsAvailable(RC); Available &= Candidates; if (Available.any()) @@ -388,7 +391,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); // If we found an unused register there is no reason to spill it. 
-  if (!isAliasUsed(SReg)) {
+  if (!isRegUsed(SReg)) {
     DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
     return SReg;
   }
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 6a2a08096461..6f8b337c6512 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 using namespace llvm;
@@ -35,11 +36,9 @@ static cl::opt<bool> StressSchedOpt(
 void SchedulingPriorityQueue::anchor() { }
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
-  : TM(mf.getTarget()),
-    TII(TM.getInstrInfo()),
-    TRI(TM.getRegisterInfo()),
-    MF(mf), MRI(mf.getRegInfo()),
-    EntrySU(), ExitSU() {
+    : TM(mf.getTarget()), TII(TM.getSubtargetImpl()->getInstrInfo()),
+      TRI(TM.getSubtargetImpl()->getRegisterInfo()), MF(mf),
+      MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
 #ifndef NDEBUG
   StressSched = StressSchedOpt;
 #endif
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 0f8b21c1c1f3..409e7041907f 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -44,18 +44,17 @@ using namespace llvm;
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
     cl::ZeroOrMore, cl::init(false),
-    cl::desc("Enable use of AA during MI GAD construction"));
+    cl::desc("Enable use of AA during MI DAG construction"));
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
-    cl::init(true), cl::desc("Enable use of TBAA during MI GAD construction"));
+    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
-                                     const MachineLoopInfo &mli,
-                                     const MachineDominatorTree &mdt,
+                                     const MachineLoopInfo *mli,
                                      bool IsPostRAFlag, bool RemoveKillFlags,
                                      LiveIntervals *lis)
-  : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), LIS(lis),
+  : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
     IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
     CanHandleTerminators(false), FirstDbgValue(nullptr) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
@@ -64,7 +63,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
          "Virtual registers must be removed prior to PostRA scheduling");
   const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
-  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
 }
 /// getUnderlyingObjectFromInt - This is the function that does the work of
@@ -110,7 +109,7 @@ static void getUnderlyingObjects(const Value *V,
   for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
        I != IE; ++I) {
     V = *I;
-    if (!Visited.insert(V))
+    if (!Visited.insert(V).second)
       continue;
     if (Operator::getOpcode(V) == Instruction::IntToPtr) {
       const Value *O =
@@ -512,9 +511,18 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI,
 static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                              MachineInstr *MIa,
                              MachineInstr *MIb) {
+  const MachineFunction *MF = MIa->getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
   // Cover a trivial case - no edge is needed to itself.
   if (MIa == MIb)
     return false;
+
+  // Let the target decide if memory accesses cannot possibly overlap.
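+  // (A target hook can prove this from opcode-specific facts that a generic
+  // alias query cannot see, e.g. two fixed-width accesses off the same base
+  // register whose immediate offset ranges cannot overlap.)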
+  if ((MIa->mayLoad() || MIa->mayStore()) &&
+      (MIb->mayLoad() || MIb->mayStore()))
+    if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
+      return false;
 
   // FIXME: Need to handle multiple memory operands to support all targets.
   if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
@@ -563,9 +571,9 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
   AliasAnalysis::AliasResult AAResult = AA->alias(
       AliasAnalysis::Location(MMOa->getValue(), Overlapa,
-                              UseTBAA ? MMOa->getTBAAInfo() : nullptr),
+                              UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
       AliasAnalysis::Location(MMOb->getValue(), Overlapb,
-                              UseTBAA ? MMOb->getTBAAInfo() : nullptr));
+                              UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
 
   return (AAResult != AliasAnalysis::NoAlias);
 }
@@ -575,12 +583,12 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
 static unsigned
 iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                  SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
-                 SmallPtrSet<const SUnit*, 16> &Visited) {
+                 SmallPtrSetImpl<const SUnit*> &Visited) {
   if (!SUa || !SUb || SUb == ExitSU)
     return *Depth;
 
   // Remember visited nodes.
-  if (!Visited.insert(SUb))
+  if (!Visited.insert(SUb).second)
     return *Depth;
   // If there is _some_ dependency already in place, do not
   // descend any further.
@@ -606,10 +614,10 @@ iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
   }
   // Track current depth.
   (*Depth)++;
-  // Iterate over chain dependencies only.
+  // Iterate over memory dependencies only.
   for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
        I != E; ++I)
-    if (I->isCtrl())
+    if (I->isNormalMemoryOrBarrier())
       iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
   return *Depth;
 }
@@ -636,11 +644,12 @@ static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
         Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
         (*I)->addPred(Dep);
       }
-      // Now go through all the chain successors and iterate from them.
-      // Keep track of visited nodes.
+
+      // Iterate recursively over all previously added memory chain
+      // successors. Keep track of visited nodes.
       for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
            JE = (*I)->Succs.end(); J != JE; ++J)
-        if (J->isCtrl())
+        if (J->isNormalMemoryOrBarrier())
           iterateChainSucc (AA, MFI, SU, J->getSUnit(),
                             ExitSU, &Depth, Visited);
     }
@@ -656,7 +665,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
                          bool isNormalMemory = false) {
   // If this is a false dependency,
   // do not add the edge, but remember the rejected node.
-  if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+  if (MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
     SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
     Dep.setLatency(TrueMemOrderLatency);
     SUb->addPred(Dep);
@@ -883,7 +892,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
     // fall-through
     new_alias_chain:
-      // Chain all possibly aliasing memory references though SU.
+      // Chain all possibly aliasing memory references through SU.
       if (AliasChain) {
         unsigned ChainLatency = 0;
         if (AliasChain->getInstr()->mayLoad())
@@ -912,6 +921,13 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       AliasMemDefs.clear();
       AliasMemUses.clear();
     } else if (MI->mayStore()) {
+      // Add dependence on barrier chain, if needed.
+      // There is no point to check aliasing on barrier event. Even if
+      // SU and barrier _could_ be reordered, they should not.
In addition, + // we have lost all RejectMemNodes below barrier. + if (BarrierChain) + BarrierChain->addPred(SDep(SU, SDep::Barrier)); + UnderlyingObjectsVector Objs; getUnderlyingObjectsForInstr(MI, MFI, Objs); @@ -981,12 +997,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); } - // Add dependence on barrier chain, if needed. - // There is no point to check aliasing on barrier event. Even if - // SU and barrier _could_ be reordered, they should not. In addition, - // we have lost all RejectMemNodes below barrier. - if (BarrierChain) - BarrierChain->addPred(SDep(SU, SDep::Barrier)); } else if (MI->mayLoad()) { bool MayAlias = true; if (MI->isInvariantLoad(AA)) { diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp index f59c6cfd8284..b2e4617720ff 100644 --- a/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -21,7 +21,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include <fstream> using namespace llvm; diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 004c685bf39a..38833a4165a3 100644 --- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -78,7 +78,7 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II, DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n"); else { // A nonempty itinerary must have a SchedModel. - IssueWidth = ItinData->SchedModel->IssueWidth; + IssueWidth = ItinData->SchedModel.IssueWidth; DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = " << ScoreboardDepth << '\n'); } diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index 75e816720f57..fbedf2c1d17a 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -19,6 +19,7 @@ add_llvm_library(LLVMSelectionDAG SelectionDAGDumper.cpp SelectionDAGISel.cpp SelectionDAGPrinter.cpp + StatepointLowering.cpp ScheduleDAGVLIW.cpp TargetLowering.cpp TargetSelectionDAGInfo.cpp diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2abcdd524512..5145731f6231 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17,6 +17,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -32,7 +34,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -76,6 +77,10 @@ namespace { "slicing"), cl::init(false)); + static cl::opt<bool> + MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), + cl::desc("DAG combiner may split indexing from loads")); + //------------------------------ DAGCombiner ---------------------------------// class DAGCombiner { @@ -87,61 +92,70 @@ namespace { bool LegalTypes; bool ForCodeSize; - // Worklist of all of the nodes that 
need to be simplified. - // - // This has the semantics that when adding to the worklist, - // the item added must be next to be processed. It should - // also only appear once. The naive approach to this takes - // linear time. - // - // To reduce the insert/remove time to logarithmic, we use - // a set and a vector to maintain our worklist. - // - // The set contains the items on the worklist, but does not - // maintain the order they should be visited. - // - // The vector maintains the order nodes should be visited, but may - // contain duplicate or removed nodes. When choosing a node to - // visit, we pop off the order stack until we find an item that is - // also in the contents set. All operations are O(log N). - SmallPtrSet<SDNode*, 64> WorklistContents; - SmallVector<SDNode*, 64> WorklistOrder; + /// \brief Worklist of all of the nodes that need to be simplified. + /// + /// This must behave as a stack -- new nodes to process are pushed onto the + /// back and when processing we pop off of the back. + /// + /// The worklist will not contain duplicates but may contain null entries + /// due to nodes being deleted from the underlying DAG. + SmallVector<SDNode *, 64> Worklist; + + /// \brief Mapping from an SDNode to its position on the worklist. + /// + /// This is used to find and remove nodes from the worklist (by nulling + /// them) when they are deleted from the underlying DAG. It relies on + /// stable indices of nodes within the worklist. + DenseMap<SDNode *, unsigned> WorklistMap; + + /// \brief Set of nodes which have been combined (at least once). + /// + /// This is used to allow us to reliably add any operands of a DAG node + /// which have not yet been combined to the worklist. + SmallPtrSet<SDNode *, 64> CombinedNodes; // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; - /// AddUsersToWorklist - When an instruction is simplified, add all users of - /// the instruction to the work lists because they might get more simplified - /// now. - /// + /// When an instruction is simplified, add all users of the instruction to + /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { for (SDNode *Node : N->uses()) AddToWorklist(Node); } - /// visit - call the node-specific routine that knows how to fold each - /// particular type of node. + /// Call the node-specific routine that folds each particular type of node. SDValue visit(SDNode *N); public: - /// AddToWorklist - Add to the work list making sure its instance is at the - /// back (next to be processed.) + /// Add to the worklist making sure its instance is at the back (next to be + /// processed.) void AddToWorklist(SDNode *N) { // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) return; - WorklistContents.insert(N); - WorklistOrder.push_back(N); + if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) + Worklist.push_back(N); } - /// removeFromWorklist - remove all instances of N from the worklist. - /// + /// Remove all instances of N from the worklist. void removeFromWorklist(SDNode *N) { - WorklistContents.erase(N); + CombinedNodes.erase(N); + + auto It = WorklistMap.find(N); + if (It == WorklistMap.end()) + return; // Not in the worklist. + + // Null out the entry rather than erasing it to avoid a linear operation. 
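+ // (The pop loop in Run() skips these null entries when it takes nodes off + // the back of the worklist.)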
+ Worklist[It->second] = nullptr; + WorklistMap.erase(It); } + void deleteAndRecombine(SDNode *N); + bool recursivelyDeleteUnusedNodes(SDNode *N); + SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); @@ -159,9 +173,9 @@ namespace { private: - /// SimplifyDemandedBits - Check the specified integer node value to see if - /// it can be simplified or if things it uses can be simplified by bit - /// propagation. If so, return true. + /// Check the specified integer node value to see if it can be simplified or + /// if things it uses can be simplified by bit propagation. + /// If so, return true. bool SimplifyDemandedBits(SDValue Op) { unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); APInt Demanded = APInt::getAllOnesValue(BitWidth); @@ -172,8 +186,19 @@ namespace { bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); + SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); + /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed + /// load. + /// + /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. + /// \param InVecVT type of the input vector to EVE with bitcasts resolved. + /// \param EltNo index of the vector element to load. + /// \param OriginalLoad load that EVE came from to be replaced. + /// \returns EVE on success, SDValue() on failure. + SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); @@ -187,7 +212,7 @@ namespace { SDValue Trunc, SDValue ExtLoad, SDLoc DL, ISD::NodeType ExtType); - /// combine - call the node-specific routine that knows how to fold each + /// Call the node-specific routine that knows how to fold each /// particular type of node. If that doesn't do anything, try the /// target-specific DAG combines.
SDValue combine(SDNode *N); @@ -251,6 +276,7 @@ namespace { SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); + SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); @@ -264,6 +290,8 @@ namespace { SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); + SDValue visitFMINNUM(SDNode *N); + SDValue visitFMAXNUM(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); @@ -275,6 +303,8 @@ namespace { SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); + SDValue visitMLOAD(SDNode *N); + SDValue visitMSTORE(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); @@ -299,7 +329,12 @@ namespace { SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); + SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); + SDValue BuildReciprocalEstimate(SDValue Op); + SDValue BuildRsqrtEstimate(SDValue Op); + SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations); + SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -316,17 +351,16 @@ namespace { SDValue GetDemandedBits(SDValue V, const APInt &Mask); - /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes, + /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl<SDValue> &Aliases); - /// isAlias - Return true if there is any possibility that the two addresses - /// overlap. + /// Return true if there is any possibility that the two addresses overlap. bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const; - /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, - /// looking for a better chain (aliasing node.) + /// Walk up chain skipping non-aliasing memory nodes, looking for a better + /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); /// Merge consecutive store operations into a wide store. @@ -354,13 +388,13 @@ namespace { FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); } - /// Run - runs the dag combiner on all nodes in the work list + /// Runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); SelectionDAG &getDAG() const { return DAG; } - /// getShiftAmountTy - Returns a type large enough to hold any valid - /// shift amount - before type legalization these can be huge. + /// Returns a type large enough to hold any valid shift amount - before type + /// legalization these can be huge. EVT getShiftAmountTy(EVT LHSTy) { assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); if (LHSTy.isVector()) @@ -369,15 +403,14 @@ namespace { : TLI.getPointerTy(); } - /// isTypeLegal - This method returns true if we are running before type - /// legalization or if the specified VT is legal. + /// This method returns true if we are running before type legalization or + /// if the specified VT is legal. 
bool isTypeLegal(const EVT &VT) { if (!LegalTypes) return true; return TLI.isTypeLegal(VT); } - /// getSetCCResultType - Convenience wrapper around - /// TargetLowering::getSetCCResultType + /// Convenience wrapper around TargetLowering::getSetCCResultType EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(*DAG.getContext(), VT); } @@ -386,7 +419,7 @@ namespace { namespace { -/// WorklistRemover - This class is a DAGUpdateListener that removes any deleted +/// This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; @@ -437,9 +470,24 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Helper Functions //===----------------------------------------------------------------------===// -/// isNegatibleForFree - Return 1 if we can compute the negated form of the -/// specified expression for the same cost as the expression itself, or 2 if we -/// can compute the negated form more cheaply than the expression itself. +void DAGCombiner::deleteAndRecombine(SDNode *N) { + removeFromWorklist(N); + + // If the operands of this node are only used by the node, they will now be + // dead. Make sure to re-visit them and recursively delete dead nodes. + for (const SDValue &Op : N->ops()) + // For an operand generating multiple values, one of the values may + // become dead allowing further simplification (e.g. split index + // arithmetic from an indexed load). + if (Op->hasOneUse() || Op->getNumValues() > 1) + AddToWorklist(Op.getNode()); + + DAG.DeleteNode(N); +} + +/// Return 1 if we can compute the negated form of the specified expression for +/// the same cost as the expression itself, or 2 if we can compute the negated +/// form more cheaply than the expression itself. static char isNegatibleForFree(SDValue Op, bool LegalOperations, const TargetLowering &TLI, const TargetOptions *Options, @@ -502,10 +550,10 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, } } -/// GetNegatedExpression - If isNegatibleForFree returns true, this function -/// returns the newly negated expression. +/// If isNegatibleForFree returns true, return the newly negated expression. static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, unsigned Depth = 0) { + const TargetOptions &Options = DAG.getTarget().Options; // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0); @@ -522,12 +570,11 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FADD: // FIXME: determine better conditions for this xform. - assert(DAG.getTarget().Options.UnsafeFPMath); + assert(Options.UnsafeFPMath); // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), - &DAG.getTarget().Options, Depth+1)) + DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), @@ -539,7 +586,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, Op.getOperand(0)); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. 
- assert(DAG.getTarget().Options.UnsafeFPMath); + assert(Options.UnsafeFPMath); // fold (fneg (fsub 0, B)) -> B if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0))) @@ -552,12 +599,11 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, case ISD::FMUL: case ISD::FDIV: - assert(!DAG.getTarget().Options.HonorSignDependentRoundingFPMath()); + assert(!Options.HonorSignDependentRoundingFPMath()); // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), - &DAG.getTarget().Options, Depth+1)) + DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), @@ -582,7 +628,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, } } -// isSetCCEquivalent - Return true if this node is a setcc, or is a select_cc +// Return true if this node is a setcc, or is a select_cc // that selects between the target values used for true and false, making it // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to // the appropriate nodes based on the type of node we are checking. This @@ -601,15 +647,19 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, !TLI.isConstFalseVal(N.getOperand(3).getNode())) return false; + if (TLI.getBooleanContents(N.getValueType()) == + TargetLowering::UndefinedBooleanContent) + return false; + LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(4); return true; } -// isOneUseSetCC - Return true if this is a SetCC-equivalent operation with only -// one use. If this is true, it allows the users to invert the operation for -// free when it is profitable to do so. +/// Return true if this is a SetCC-equivalent operation with only one use. +/// If this is true, it allows the users to invert the operation for free when +/// it is profitable to do so. bool DAGCombiner::isOneUseSetCC(SDValue N) const { SDValue N0, N1, N2; if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) @@ -617,7 +667,7 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const { return false; } -/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose +/// Returns true if N is a BUILD_VECTOR node whose /// elements are all the same constant or undefined. static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) { BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N); @@ -638,7 +688,7 @@ static SDNode *isConstantBuildVectorOrConstantInt(SDValue N) { if (isa<ConstantSDNode>(N)) return N.getNode(); BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N); - if(BV && BV->isConstant()) + if (BV && BV->isConstant()) return BV; return nullptr; } @@ -664,6 +714,23 @@ static ConstantSDNode *isConstOrConstSplat(SDValue N) { return nullptr; } +// \brief Returns the SDNode if it is a constant splat BuildVector or constant +// float. 
+static ConstantFPSDNode *isConstOrConstSplatFP(SDValue N) { + if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N)) + return CN; + + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) { + BitVector UndefElements; + ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements); + + if (CN && UndefElements.none()) + return CN; + } + + return nullptr; +} + SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); @@ -671,10 +738,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, if (SDNode *L = isConstantBuildVectorOrConstantInt(N0.getOperand(1))) { if (SDNode *R = isConstantBuildVectorOrConstantInt(N1)) { // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2)) - SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R); - if (!OpNode.getNode()) - return SDValue(); - return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); + if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R)) + return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); + return SDValue(); } if (N0.hasOneUse()) { // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one @@ -692,10 +758,9 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL, if (SDNode *R = isConstantBuildVectorOrConstantInt(N1.getOperand(1))) { if (SDNode *L = isConstantBuildVectorOrConstantInt(N0)) { // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2)) - SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L); - if (!OpNode.getNode()) - return SDValue(); - return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); + if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L)) + return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); + return SDValue(); } if (N1.hasOneUse()) { // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one @@ -720,11 +785,12 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, N->dump(&DAG); dbgs() << "\nWith: "; To[0].getNode()->dump(&DAG); - dbgs() << " and " << NumTo-1 << " other values\n"; - for (unsigned i = 0, e = NumTo; i != e; ++i) - assert((!To[i].getNode() || - N->getValueType(i) == To[i].getValueType()) && - "Cannot combine value to value of different type!")); + dbgs() << " and " << NumTo-1 << " other values\n"); + for (unsigned i = 0, e = NumTo; i != e; ++i) + assert((!To[i].getNode() || + N->getValueType(i) == To[i].getValueType()) && + "Cannot combine value to value of different type!"); + WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesWith(N, To); if (AddTo) { @@ -740,14 +806,8 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. - if (N->use_empty()) { - // Nodes can be reintroduced into the worklist. Make sure we do not - // process a node that has been replaced. - removeFromWorklist(N); - - // Finally, since the node is now dead, remove it from the graph. - DAG.DeleteNode(N); - } + if (N->use_empty()) + deleteAndRecombine(N); return SDValue(N, 0); } @@ -765,22 +825,12 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. 
- if (TLO.Old.getNode()->use_empty()) { - removeFromWorklist(TLO.Old.getNode()); - - // If the operands of this node are only used by the node, they will now - // be dead. Make sure to visit them first to delete dead nodes early. - for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) - if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse()) - AddToWorklist(TLO.Old.getNode()->getOperand(i).getNode()); - - DAG.DeleteNode(TLO.Old.getNode()); - } + if (TLO.Old.getNode()->use_empty()) + deleteAndRecombine(TLO.Old.getNode()); } -/// SimplifyDemandedBits - Check the specified integer node value to see if -/// it can be simplified or if things it uses can be simplified by bit -/// propagation. If so, return true. +/// Check the specified integer node value to see if it can be simplified or if +/// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); APInt KnownZero, KnownOne; @@ -815,8 +865,7 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); - removeFromWorklist(Load); - DAG.DeleteNode(Load); + deleteAndRecombine(Load); AddToWorklist(Trunc.getNode()); } @@ -826,8 +875,8 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) { EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) - ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD - : ISD::EXTLOAD) + ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD + : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; return DAG.getExtLoad(ExtType, dl, PVT, @@ -889,9 +938,9 @@ SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { return DAG.getZeroExtendInReg(NewOp, dl, OldVT); } -/// PromoteIntBinOp - Promote the specified integer binary operation if the -/// target indicates it is beneficial. e.g. On x86, it's usually better to -/// promote i16 operations to i32 since i16 instructions are longer. +/// Promote the specified integer binary operation if the target indicates it is +/// beneficial. e.g. On x86, it's usually better to promote i16 operations to +/// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (!LegalOperations) return SDValue(); @@ -947,9 +996,9 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { return SDValue(); } -/// PromoteIntShiftOp - Promote the specified integer shift operation if the -/// target indicates it is beneficial. e.g. On x86, it's usually better to -/// promote i16 operations to i32 since i16 instructions are longer. +/// Promote the specified integer shift operation if the target indicates it is +/// beneficial. e.g. On x86, it's usually better to promote i16 operations to +/// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (!LegalOperations) return SDValue(); @@ -1048,8 +1097,8 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { LoadSDNode *LD = cast<LoadSDNode>(N); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) - ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD - : ISD::EXTLOAD) + ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? 
ISD::ZEXTLOAD + : ISD::EXTLOAD) : LD->getExtensionType(); SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT, LD->getChain(), LD->getBasePtr(), @@ -1064,14 +1113,42 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); - removeFromWorklist(N); - DAG.DeleteNode(N); + deleteAndRecombine(N); AddToWorklist(Result.getNode()); return true; } return false; } +/// \brief Recursively delete a node which has no uses and any operands for +/// which it is the only use. +/// +/// Note that this both deletes the nodes and removes them from the worklist. +/// It also adds any nodes that have had a user deleted to the worklist as they +/// may now have only one use and be subject to other combines. +bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { + if (!N->use_empty()) + return false; + + SmallSetVector<SDNode *, 16> Nodes; + Nodes.insert(N); + do { + N = Nodes.pop_back_val(); + if (!N) + continue; + + if (N->use_empty()) { + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + Nodes.insert(N->getOperand(i).getNode()); + + removeFromWorklist(N); + DAG.DeleteNode(N); + } else { + AddToWorklist(N); + } + } while (!Nodes.empty()); + return true; +} //===----------------------------------------------------------------------===// // Main DAG Combiner implementation @@ -1083,6 +1160,13 @@ void DAGCombiner::Run(CombineLevel AtLevel) { LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; + // Early exit if this basic block is in an optnone function. + AttributeSet FnAttrs = + DAG.getMachineFunction().getFunction()->getAttributes(); + if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeNone)) + return; + // Add all the dag nodes to the worklist. for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) @@ -1093,34 +1177,52 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // changes of the root. HandleSDNode Dummy(DAG.getRoot()); - // The root of the dag may dangle to deleted nodes until the dag combiner is - // done. Set it to null to avoid confusion. - DAG.setRoot(SDValue()); - // while the worklist isn't empty, find a node and // try and combine it. - while (!WorklistContents.empty()) { + while (!WorklistMap.empty()) { SDNode *N; - // The WorklistOrder holds the SDNodes in order, but it may contain - // duplicates. - // In order to avoid a linear scan, we use a set (O(log N)) to hold what the - // worklist *should* contain, and check the node we want to visit is should - // actually be visited. + // The Worklist holds the SDNodes in order, but it may contain null entries. do { - N = WorklistOrder.pop_back_val(); - } while (!WorklistContents.erase(N)); + N = Worklist.pop_back_val(); + } while (!N); + + bool GoodWorklistEntry = WorklistMap.erase(N); + (void)GoodWorklistEntry; + assert(GoodWorklistEntry && + "Found a worklist entry without a corresponding map entry!"); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms.
- if (N->use_empty()) { - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorklist(N->getOperand(i).getNode()); - - DAG.DeleteNode(N); + if (recursivelyDeleteUnusedNodes(N)) continue; + + WorklistRemover DeadNodes(*this); + + // If this combine is running after legalizing the DAG, re-legalize any + // nodes pulled off the worklist. + if (Level == AfterLegalizeDAG) { + SmallSetVector<SDNode *, 16> UpdatedNodes; + bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); + + for (SDNode *LN : UpdatedNodes) { + AddToWorklist(LN); + AddUsersToWorklist(LN); + } + if (!NIsValid) + continue; } + DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG)); + + // Add any operands of the new node which have not yet been combined to the + // worklist as well. Because the worklist uniques things already, this + // won't repeatedly process the same operand. + CombinedNodes.insert(N); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + if (!CombinedNodes.count(N->getOperand(i).getNode())) + AddToWorklist(N->getOperand(i).getNode()); + SDValue RV = combine(N); if (!RV.getNode()) @@ -1139,15 +1241,11 @@ void DAGCombiner::Run(CombineLevel AtLevel) { RV.getNode()->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); - DEBUG(dbgs() << "\nReplacing.3 "; - N->dump(&DAG); - dbgs() << "\nWith: "; - RV.getNode()->dump(&DAG); - dbgs() << '\n'); + DEBUG(dbgs() << " ... into: "; + RV.getNode()->dump(&DAG)); // Transfer debug value. DAG.TransferDbgValues(SDValue(N, 0), RV); - WorklistRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { @@ -1161,23 +1259,11 @@ void DAGCombiner::Run(CombineLevel AtLevel) { AddToWorklist(RV.getNode()); AddUsersToWorklist(RV.getNode()); - // Add any uses of the old node to the worklist in case this node is the - // last one that uses them. They may become dead after this node is - // deleted. - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorklist(N->getOperand(i).getNode()); - // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to - // something else needing this node. - if (N->use_empty()) { - // Nodes can be reintroduced into the worklist. Make sure we do not - // process a node that has been replaced. - removeFromWorklist(N); - - // Finally, since the node is now dead, remove it from the graph. - DAG.DeleteNode(N); - } + // something else needing this node. This will also take care of adding any + // operands which have lost a user to the worklist. + recursivelyDeleteUnusedNodes(N); } // If the root changed (e.g. it was a dead load), update the root.
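The worklist changes above follow one pattern that is easier to see in isolation: a vector gives LIFO order, a map gives uniquing and O(1) removal by nulling the vector slot, and the pop loop skips the nulls. Below is a minimal standalone sketch of that pattern, for illustration only; Item and the standard containers stand in for SDNode and the LLVM ADTs, and the member names merely mirror the patch.

  #include <cassert>
  #include <cstddef>
  #include <unordered_map>
  #include <vector>

  struct Item;

  // A uniquing LIFO worklist: push-if-absent, remove-by-nulling, pop-skip-nulls.
  struct UniquedWorklist {
    std::vector<Item *> Order;               // stack order; may hold nulls
    std::unordered_map<Item *, size_t> Map;  // item -> index into Order

    void add(Item *N) {                      // cf. AddToWorklist
      if (Map.insert({N, Order.size()}).second)
        Order.push_back(N);                  // pushed only if not already present
    }
    void remove(Item *N) {                   // cf. removeFromWorklist
      auto It = Map.find(N);
      if (It == Map.end())
        return;
      Order[It->second] = nullptr;           // null the slot; no linear erase
      Map.erase(It);
    }
    Item *pop() {                            // cf. the pop loop in Run()
      assert(!empty() && "pop on empty worklist");
      Item *N;
      do {
        N = Order.back();
        Order.pop_back();
      } while (!N);                          // skip nulled-out entries
      bool Good = Map.erase(N) != 0;
      (void)Good;
      assert(Good && "worklist entry without a corresponding map entry");
      return N;
    }
    bool empty() const { return Map.empty(); }
  };

Every operation is amortized O(1); the cost of nulled slots is paid once, when pop() walks past them.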
@@ -1239,6 +1325,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); + case ISD::FSQRT: return visitFSQRT(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); @@ -1250,6 +1337,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); + case ISD::FMINNUM: return visitFMINNUM(N); + case ISD::FMAXNUM: return visitFMAXNUM(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); @@ -1263,6 +1352,8 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); + case ISD::MLOAD: return visitMLOAD(N); + case ISD::MSTORE: return visitMSTORE(N); } return SDValue(); } @@ -1342,8 +1433,8 @@ SDValue DAGCombiner::combine(SDNode *N) { return RV; } -/// getInputChainForNode - Given a node, return its input chain if it has one, -/// otherwise return a null sd operand. +/// Given a node, return its input chain if it has one, otherwise return a null +/// sd operand. static SDValue getInputChainForNode(SDNode *N) { if (unsigned NumOps = N->getNumOperands()) { if (N->getOperand(0).getValueType() == MVT::Other) @@ -1405,7 +1496,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { default: // Only add if it isn't already in the list. - if (SeenOps.insert(Op.getNode())) + if (SeenOps.insert(Op.getNode()).second) Ops.push_back(Op); else Changed = true; @@ -1446,33 +1537,10 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); - removeFromWorklist(N); - DAG.DeleteNode(N); + deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} -static -SDValue combineShlAddConstant(SDLoc DL, SDValue N0, SDValue N1, - SelectionDAG &DAG) { - EVT VT = N0.getValueType(); - SDValue N00 = N0.getOperand(0); - SDValue N01 = N0.getOperand(1); - ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N01); - - if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() && - isa<ConstantSDNode>(N00.getOperand(1))) { - // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), ) - N0 = DAG.getNode(ISD::ADD, SDLoc(N0), VT, - DAG.getNode(ISD::SHL, SDLoc(N00), VT, - N00.getOperand(0), N01), - DAG.getNode(ISD::SHL, SDLoc(N01), VT, - N00.getOperand(1), N01)); - return DAG.getNode(ISD::ADD, DL, VT, N0, N1); - } - - return SDValue(); -} - SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1589,16 +1657,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) { } } - // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), ) - if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) { - SDValue Result = combineShlAddConstant(SDLoc(N), N0, N1, DAG); - if (Result.getNode()) return Result; - } - if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) { - SDValue Result = combineShlAddConstant(SDLoc(N), N1, N0, DAG); - if (Result.getNode()) return Result; - } - // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB) @@ -1642,6 +1700,17 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt); } + // add X, (sextinreg Y i1) -> sub X, (and Y 1) + if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { + VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1)); + if (TN->getVT() == MVT::i1) { + SDLoc DL(N); + SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt); + } + } + return SDValue(); } @@ -1807,6 +1876,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { VT); } + // sub X, (sextinreg Y i1) -> add X, (and Y 1) + if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { + VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1)); + if (TN->getVT() == MVT::i1) { + SDLoc DL(N); + SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); + } + } + return SDValue(); } @@ -2011,9 +2091,14 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { (-N1C->getAPIntValue()).isPowerOf2())) { // If dividing by powers of two is cheap, then don't perform the following // fold. - if (TLI.isPow2DivCheap()) + if (TLI.isPow2SDivCheap()) return SDValue(); + // Target-specific implementation of sdiv x, pow2. + SDValue Res = BuildSDIVPow2(N); + if (Res.getNode()) + return Res; + unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); // Splat the sign bit into the register @@ -2281,10 +2366,9 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { return SDValue(); } -/// SimplifyNodeWithTwoResults - Perform optimizations common to nodes that -/// compute two values. LoOp and HiOp give the opcodes for the two computations -/// that are being performed. Return true if a simplification was made. -/// +/// Perform optimizations common to nodes that compute two values. LoOp and HiOp +/// give the opcodes for the two computations that are being performed. Return +/// the simplified node if a simplification was made. SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp) { // If the high half is not needed, just compute the low half.
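The two sign_extend_inreg folds introduced above (in visitADD and visitSUB) rest on a one-line identity: sign-extending the low bit of Y yields 0 or -1, which is exactly -(Y & 1), so X + sext_i1(Y) == X - (Y & 1) and X - sext_i1(Y) == X + (Y & 1). A quick self-contained check in plain C++; sext_i1 here is a stand-in for SIGN_EXTEND_INREG with VT == i1, not an LLVM API:

  #include <cassert>
  #include <cstdint>

  // What SIGN_EXTEND_INREG(Y, i1) computes: replicate bit 0 across the word.
  static int32_t sext_i1(uint32_t Y) { return (Y & 1) ? -1 : 0; }

  int main() {
    for (uint32_t Y : {0u, 1u, 2u, 3u, 0xFFFFFFFFu})
      for (int32_t X : {-7, 0, 42}) {
        int32_t LowBit = (int32_t)(Y & 1);
        assert(X + sext_i1(Y) == X - LowBit); // add X, (sextinreg Y i1) -> sub X, (and Y 1)
        assert(X - sext_i1(Y) == X + LowBit); // sub X, (sextinreg Y i1) -> add X, (and Y 1)
      }
    return 0;
  }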
@@ -2292,8 +2376,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, if (!HiExists && (!LegalOperations || TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { - SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), - ArrayRef<SDUse>(N->op_begin(), N->op_end())); + SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); return CombineTo(N, Res, Res); } @@ -2302,8 +2385,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, if (!LoExists && (!LegalOperations || TLI.isOperationLegal(HiOp, N->getValueType(1)))) { - SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), - ArrayRef<SDUse>(N->op_begin(), N->op_end())); + SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); return CombineTo(N, Res, Res); } @@ -2313,8 +2395,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, // If the two computed results can be simplified separately, separate them. if (LoExists) { - SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), - ArrayRef<SDUse>(N->op_begin(), N->op_end())); + SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); AddToWorklist(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && @@ -2324,8 +2405,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, } if (HiExists) { - SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), - ArrayRef<SDUse>(N->op_begin(), N->op_end())); + SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); AddToWorklist(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && @@ -2431,8 +2511,8 @@ SDValue DAGCombiner::visitUDIVREM(SDNode *N) { return SDValue(); } -/// SimplifyBinOpWithSameOpcodeHands - If this is a binary operator with -/// two operands of the same opcode, try to simplify it. +/// If this is a binary operator with two operands of the same opcode, try to +/// simplify it. SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N0.getValueType(); @@ -2445,6 +2525,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) + // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y)) // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine @@ -2452,6 +2533,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { EVT Op0VT = N0.getOperand(0).getValueType(); if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || + N0.getOpcode() == ISD::BSWAP || // Avoid infinite looping with PromoteIntBinOp. 
(N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || @@ -2598,9 +2680,17 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // fold (and x, 0) -> 0, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) - return N0; + // do not return N0, because an undef node may exist in N0 + return DAG.getConstant( + APInt::getNullValue( + N0.getValueType().getScalarType().getSizeInBits()), + N0.getValueType()); if (ISD::isBuildVectorAllZeros(N1.getNode())) - return N1; + // do not return N1, because an undef node may exist in N1 + return DAG.getConstant( + APInt::getNullValue( + N1.getValueType().getScalarType().getSizeInBits()), + N1.getValueType()); // fold (and x, -1) -> x, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) @@ -2708,6 +2798,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // actually legal and isn't going to get expanded, else this is a false // optimisation. bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, + Load->getValueType(0), Load->getMemoryVT()); // Resize the constant to the same size as the original memory access before @@ -2834,7 +2925,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -2854,7 +2945,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -2880,10 +2971,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT LoadedVT = LN0->getMemoryVT(); + EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; if (ExtVT == LoadedVT && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) { - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; + (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, + ExtVT))) { SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, @@ -2898,7 +2990,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) { + (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, + ExtVT))) { EVT PtrType = LN0->getOperand(1).getValueType(); unsigned Alignment = LN0->getAlignment(); @@ -2918,13 +3011,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { AddToWorklist(NewPtr.getNode()); - EVT LoadResultTy = HasAnyExt ?
LN0->getValueType(0) : VT; SDValue Load = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr, LN0->getPointerInfo(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), - Alignment, LN0->getTBAAInfo()); + LN0->isInvariant(), Alignment, LN0->getAAInfo()); AddToWorklist(N); CombineTo(LN0, Load, Load.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! @@ -2971,8 +3063,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return SDValue(); } -/// MatchBSwapHWord - Match (a >> 8) | (a << 8) as (bswap a) >> 16 -/// +/// Match (a >> 8) | (a << 8) as (bswap a) >> 16. SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits) { if (!LegalOperations) @@ -3077,10 +3168,13 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, return Res; } -/// isBSwapHWordElement - Return true if the specified node is an element -/// that makes up a 32-bit packed halfword byteswap. i.e. -/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8) -static bool isBSwapHWordElement(SDValue N, SmallVectorImpl<SDNode *> &Parts) { +/// Return true if the specified node is an element that makes up a 32-bit +/// packed halfword byteswap. +/// ((x & 0x000000ff) << 8) | +/// ((x & 0x0000ff00) >> 8) | +/// ((x & 0x00ff0000) << 8) | +/// ((x & 0xff000000) >> 8) +static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) { if (!N.getNode()->hasOneUse()) return false; @@ -3147,8 +3241,11 @@ static bool isBSwapHWordElement(SDValue N, SmallVectorImpl<SDNode *> &Parts) { return true; } -/// MatchBSwapHWord - Match a 32-bit packed halfword bswap. That is -/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8) +/// Match a 32-bit packed halfword bswap. 
That is +/// ((x & 0x000000ff) << 8) | +/// ((x & 0x0000ff00) >> 8) | +/// ((x & 0x00ff0000) << 8) | +/// ((x & 0xff000000) >> 8) /// => (rotl (bswap x), 16) SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!LegalOperations) @@ -3160,7 +3257,6 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); - SmallVector<SDNode*,4> Parts(4, (SDNode*)nullptr); // Look for either // (or (or (and), (and)), (or (and), (and))) // (or (or (or (and), (and)), (and)), (and)) @@ -3168,6 +3264,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); + SDNode *Parts[4] = {}; if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { @@ -3241,9 +3338,17 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) - return N0; + // do not return N0, because an undef node may exist in N0 + return DAG.getConstant( + APInt::getAllOnesValue( + N0.getValueType().getScalarType().getSizeInBits()), + N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) - return N1; + // do not return N1, because an undef node may exist in N1 + return DAG.getConstant( + APInt::getAllOnesValue( + N1.getValueType().getScalarType().getSizeInBits()), + N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1) // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2) @@ -3342,12 +3447,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { isa<ConstantSDNode>(N0.getOperand(1))) { ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1)); if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { - SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1); - if (!COR.getNode()) - return SDValue(); - return DAG.getNode(ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, - N0.getOperand(0), N1), COR); + if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1)) + return DAG.getNode( + ISD::AND, SDLoc(N), VT, + DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); + return SDValue(); } } // fold (or (setcc x), (setcc y)) -> (setcc (or x, y)) @@ -3435,7 +3539,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return SDValue(); } -/// MatchRotateHalf - Match "(X shl/srl V1) & V2" where V2 may not be present. +/// Match "(X shl/srl V1) & V2" where V2 may not be present. static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { if (isa<ConstantSDNode>(Op.getOperand(1))) { @@ -3732,7 +3836,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return RXOR; // fold !(x cc y) -> (x !cc y) - if (N1C && N1C->getAPIntValue() == 1 && isSetCCEquivalent(N0, LHS, RHS, CC)) { + if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { bool isInt = LHS.getValueType().isInteger(); ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), isInt); @@ -3825,8 +3929,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return SDValue(); } -/// visitShiftByConstant - Handle transforms common to the three shifts, when -/// the shift amount is a constant. +/// Handle transforms common to the three shifts, when the shift amount is a +/// constant. SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { // We can't and shouldn't fold opaque constants.
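// (Opaque constants are deliberately hidden from the folders, e.g. so that // constant hoisting can materialize an expensive constant once; folding one // here would defeat that.)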
if (Amt->isOpaque()) @@ -3963,8 +4067,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && TLI.getBooleanContents(N00.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) { - SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV); - if (C.getNode()) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); } } else { @@ -4098,6 +4201,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { HiBitsMask); } + // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) + // Variant of version done on multiply, except mul by a power of 2 is turned + // into a shift. + APInt Val; + if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && + (isa<ConstantSDNode>(N0.getOperand(1)) || + isConstantSplatVector(N0.getOperand(1).getNode(), Val))) { + SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); + SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1); + } + if (N1C) { SDValue NewSHL = visitShiftByConstant(N, N1C); if (NewSHL.getNode()) @@ -4498,6 +4613,43 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) { return SDValue(); } + +/// \brief Generate Min/Max node +static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS, + SDValue True, SDValue False, + ISD::CondCode CC, const TargetLowering &TLI, + SelectionDAG &DAG) { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + switch (CC) { + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: { + unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; + if (TLI.isOperationLegal(Opcode, VT)) + return DAG.getNode(Opcode, DL, VT, LHS, RHS); + return SDValue(); + } + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: { + unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; + if (TLI.isOperationLegal(Opcode, VT)) + return DAG.getNode(Opcode, DL, VT, LHS, RHS); + return SDValue(); + } + default: + return SDValue(); + } +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4577,9 +4729,31 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { + // select x, y (fcmp lt x, y) -> fminnum x, y + // select x, y (fcmp gt x, y) -> fmaxnum x, y + // + // This is OK if we don't care about what happens if either operand is a + // NaN. + // + + // FIXME: Instead of testing for UnsafeFPMath, this should be checking for + // no signed zeros as well as no nans. 
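+ // (ISD::FMINNUM/FMAXNUM implement IEEE-754 minNum/maxNum: when exactly one + // input is NaN they return the other input, which a setcc+select pair would + // not, hence the isKnownNeverNaN checks below.)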
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.UnsafeFPMath && + VT.isFloatingPoint() && N0.hasOneUse() && + DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + + SDValue FMinMax = + combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), + N1, N2, CC, TLI, DAG); + if (FMinMax) + return FMinMax; + } + if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || - TLI.isOperationLegal(ISD::SELECT_CC, VT)) + TLI.isOperationLegal(ISD::SELECT_CC, VT)) return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1, N2, N0.getOperand(2)); @@ -4613,12 +4787,17 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); - MVT VT = N->getSimpleValueType(0); + EVT VT = N->getValueType(0); int NumElems = VT.getVectorNumElements(); assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getOpcode() == ISD::CONCAT_VECTORS && Cond.getOpcode() == ISD::BUILD_VECTOR); + // CONCAT_VECTORS can take an arbitrary number of arguments. We only care about + // binary ones here. + if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2) + return SDValue(); + // We're sure we have an even number of elements due to the // concat_vectors we have as arguments to vselect. // Skip BV elements until we find one that's not an UNDEF @@ -4656,6 +4835,162 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } +SDValue DAGCombiner::visitMSTORE(SDNode *N) { + + if (Level >= AfterLegalizeTypes) + return SDValue(); + + MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N); + SDValue Mask = MST->getMask(); + SDValue Data = MST->getData(); + SDLoc DL(N); + + // If the MSTORE data type requires splitting and the mask is provided by a + // SETCC, then split both nodes and its operands before legalization. This + // prevents the type legalizer from unrolling SETCC into scalar comparisons + // and enables future optimizations (e.g. min/max pattern matching on X86). + if (Mask.getOpcode() == ISD::SETCC) { + + // Check if any splitting is required. + if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != + TargetLowering::TypeSplitVector) + return SDValue(); + + SDValue MaskLo, MaskHi, Lo, Hi; + std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0)); + + SDValue Chain = MST->getChain(); + SDValue Ptr = MST->getBasePtr(); + + EVT MemoryVT = MST->getMemoryVT(); + unsigned Alignment = MST->getOriginalAlignment(); + + // If Alignment is equal to the vector size, + // use half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == Data->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue DataLo, DataHi; + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + + MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MST->getPointerInfo(), + MachineMemOperand::MOStore, LoMemVT.getStoreSize(), + Alignment, MST->getAAInfo(), MST->getRanges()); + + Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MST->getPointerInfo(), + MachineMemOperand::MOStore, HiMemVT.getStoreSize(), + SecondHalfAlignment, MST->getAAInfo(), + MST->getRanges()); + + Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO); + + AddToWorklist(Lo.getNode()); + AddToWorklist(Hi.getNode()); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + } + return SDValue(); +} + +SDValue DAGCombiner::visitMLOAD(SDNode *N) { + + if (Level >= AfterLegalizeTypes) + return SDValue(); + + MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N); + SDValue Mask = MLD->getMask(); + SDLoc DL(N); + + // If the MLOAD result requires splitting and the mask is provided by a + // SETCC, then split both nodes and its operands before legalization. This + // prevents the type legalizer from unrolling SETCC into scalar comparisons + // and enables future optimizations (e.g. min/max pattern matching on X86). + + if (Mask.getOpcode() == ISD::SETCC) { + EVT VT = N->getValueType(0); + + // Check if any splitting is required. + if (TLI.getTypeAction(*DAG.getContext(), VT) != + TargetLowering::TypeSplitVector) + return SDValue(); + + SDValue MaskLo, MaskHi, Lo, Hi; + std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); + + SDValue Src0 = MLD->getSrc0(); + SDValue Src0Lo, Src0Hi; + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); + + SDValue Chain = MLD->getChain(); + SDValue Ptr = MLD->getBasePtr(); + EVT MemoryVT = MLD->getMemoryVT(); + unsigned Alignment = MLD->getOriginalAlignment(); + + // If Alignment is equal to the vector size, + // use half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), + Alignment, MLD->getAAInfo(), MLD->getRanges()); + + Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), + SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); + + Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO); + + AddToWorklist(Lo.getNode()); + AddToWorklist(Hi.getNode()); + + // Build a factor node to remember that this load is independent of the + // other one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalized the chain result - switch anything that used the old chain to + // use the new one.
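+ // (Result number 1 of the masked load is its chain output.)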
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); + + SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + + SDValue RetOps[] = { LoadRes, Chain }; + return DAG.getMergeValues(RetOps, DL); + } + return SDValue(); +} + SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4765,13 +5100,16 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { return N2; // cond always true -> true val else return N3; // cond always false -> false val - } - - // Fold to a simpler select_cc - if (SCC.getOpcode() == ISD::SETCC) + } else if (SCC->getOpcode() == ISD::UNDEF) { + // When the condition is UNDEF, just return the first operand. This is + // consistent with DAG creation; no setcc node is created in this case + return N2; + } else if (SCC.getOpcode() == ISD::SETCC) { + // Fold to a simpler select_cc return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); + } } // If we can fold this based on the true/false value, do so. @@ -5004,7 +5342,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector<SDNode*, 4> SetCCs; if (!N0.hasOneUse()) @@ -5032,7 +5370,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, @@ -5052,7 +5390,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { N0.getOpcode() == ISD::XOR) && isa<LoadSDNode>(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && - TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) { @@ -5131,14 +5469,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, SetCCVT)) { SDLoc DL(N); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - SDValue SetCC = DAG.getSetCC(DL, - SetCCVT, + SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC); - EVT SelectVT = getSetCCResultType(VT); - return DAG.getSelect(DL, VT, - DAG.getSExtOrTrunc(SetCC, DL, SelectVT), + return DAG.getSelect(DL, VT, SetCC, NegOne, DAG.getConstant(0, VT)); - } } } @@ -5298,7 +5632,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector<SDNode*, 4> SetCCs; if (!N0.hasOneUse()) @@ -5326,7 +5660,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getOpcode() == ISD::XOR) && isa<LoadSDNode>(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant
&& - TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { @@ -5363,7 +5697,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, @@ -5525,7 +5859,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { + TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector<SDNode*, 4> SetCCs; if (!N0.hasOneUse()) @@ -5555,7 +5889,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); - if (!LegalOperations || TLI.isLoadExtLegal(ExtType, MemVT)) { + if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -5610,9 +5944,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return SDValue(); } -/// GetDemandedBits - See if the specified operand can be simplified with the -/// knowledge that only the bits specified by Mask are used. If so, return the -/// simpler operand, otherwise return a null SDValue. +/// See if the specified operand can be simplified with the knowledge that only +/// the bits specified by Mask are used. If so, return the simpler operand, +/// otherwise return a null SDValue. SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { switch (V.getOpcode()) { default: break; @@ -5653,11 +5987,11 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { return SDValue(); } -/// ReduceLoadWidth - If the result of a wider load is shifted to right of N -/// bits and then truncated to a narrower type and where N is a multiple -/// of number of bits of the narrower type, transform it to a narrower load -/// from address + N / num of bits of new type. If the result is to be -/// extended, also fold the extension to form a extending load. +/// If the result of a wider load is shifted to right of N bits and then +/// truncated to a narrower type and where N is a multiple of number of bits of +/// the narrower type, transform it to a narrower load from address + N / num of +/// bits of new type. If the result is to be extended, also fold the extension +/// to form a extending load. 
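A recurring change across the extend visitors above threads the result type into TLI.isLoadExtLegal, so extending-load legality is keyed on the (result VT, memory VT) pair rather than the memory type alone. A minimal sketch of why the pair matters, using toy value types and a hypothetical legality table rather than LLVM's real API:

#include <cstdio>
#include <map>
#include <utility>

enum ToyVT { i8, i16, i32, i64 };

// Hypothetical target: sign-extending loads to i32 are legal from i8 and
// i16, but a sign-extending load to i64 is only legal from i32.
static const std::map<std::pair<ToyVT, ToyVT>, bool> SExtLoadLegal = {
    {{i32, i8}, true}, {{i32, i16}, true}, {{i64, i32}, true}};

static bool isLoadExtLegal(ToyVT ResVT, ToyVT MemVT) {
  return SExtLoadLegal.count({ResVT, MemVT}) != 0;
}

int main() {
  // Keyed on the memory type alone, an i8 source would look legal for
  // every result width; with the pair, i64 <- i8 is correctly rejected.
  std::printf("sextload i32 <- i8: %d\n", isLoadExtLegal(i32, i8)); // 1
  std::printf("sextload i64 <- i8: %d\n", isLoadExtLegal(i64, i8)); // 0
  return 0;
}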
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); @@ -5684,7 +6018,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ExtVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() - N01->getZExtValue()); } - if (LegalOperations && !TLI.isLoadExtLegal(ExtType, ExtVT)) + if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT)) return SDValue(); unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5763,6 +6097,9 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt) return SDValue(); + if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT)) + return SDValue(); + EVT PtrType = N0.getOperand(1).getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) @@ -5789,12 +6126,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), LN0->isVolatile(), LN0->isNonTemporal(), - LN0->isInvariant(), NewAlign, LN0->getTBAAInfo()); + LN0->isInvariant(), NewAlign, LN0->getAAInfo()); else Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), - NewAlign, LN0->getTBAAInfo()); + LN0->isInvariant(), NewAlign, LN0->getAAInfo()); // Replace the old load's chain with the new load's chain. WorklistRemover DeadNodes(*this); @@ -5888,7 +6225,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast<LoadSDNode>(N0)->getMemoryVT() && ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -5904,7 +6241,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { N0.hasOneUse() && EVT == cast<LoadSDNode>(N0)->getMemoryVT() && ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -6152,7 +6489,7 @@ static SDNode *getBuildPairElt(SDNode *N, unsigned i) { return Elt.getOperand(Elt.getResNo()).getNode(); } -/// CombineConsecutiveLoads - build_pair (load, load) -> load +/// build_pair (load, load) -> load /// if load locations are consecutive. SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); @@ -6218,7 +6555,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // Ideally this won't happen very often, because instcombine // and the earlier dagcombine runs (where illegal nodes are // permitted) should have folded most of them already. 
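ReduceLoadWidth, now also guarded by the shouldReduceLoadWidth target hook added above, relies on the identity in the reflowed comment: a wide load shifted right by N bits and then truncated equals a narrow load at address + N/8. A self-contained demo of that arithmetic on a little-endian host (the DAG code adjusts the offset for big-endian targets):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  unsigned char Mem[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};

  // Wide form: load i64, srl by 16, truncate to i16.
  uint64_t Wide;
  std::memcpy(&Wide, Mem, 8);
  uint16_t A = (uint16_t)(Wide >> 16);

  // Narrow form: load i16 directly at offset 16 / 8 == 2.
  uint16_t B;
  std::memcpy(&B, Mem + 2, 2);

  std::printf("0x%04x 0x%04x equal=%d\n", A, B, A == B); // equal on LE
  return 0;
}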
- DAG.DeleteNode(Res.getNode()); + deleteAndRecombine(Res.getNode()); } } @@ -6247,12 +6584,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { LN0->getBasePtr(), LN0->getPointerInfo(), LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(), OrigAlign, - LN0->getTBAAInfo()); - AddToWorklist(N); - CombineTo(N0.getNode(), - DAG.getNode(ISD::BITCAST, SDLoc(N0), - N0.getValueType(), Load), - Load.getValue(1)); + LN0->getAAInfo()); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; } } @@ -6337,9 +6670,8 @@ SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { return CombineConsecutiveLoads(N, VT); } -/// ConstantFoldBITCASTofBUILD_VECTOR - We know that BV is a build_vector -/// node with Constant, ConstantFP or Undef operands. DstEltVT indicates the -/// destination element value type. +/// We know that BV is a build_vector node with Constant, ConstantFP or Undef +/// operands. DstEltVT indicates the destination element value type. SDValue DAGCombiner:: ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); @@ -6383,7 +6715,6 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { if (SrcEltVT.isFloatingPoint()) { // Convert the input float vector to a int vector where the elements are the // same sizes. - assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!"); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); SrcEltVT = IntVT; @@ -6392,7 +6723,6 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { // Now we know the input is an integer vector. If the output is a FP type, // convert to integer first, then to FP of the right size. if (DstEltVT.isFloatingPoint()) { - assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!"); EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); @@ -6475,6 +6805,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); + const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { @@ -6485,195 +6816,197 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N1); + // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N0); - // fold (fadd A, 0) -> A - if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && - N1CFP->getValueAPF().isZero()) - return N0; + // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2) + isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); + // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2) + isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N1, GetNegatedExpression(N0, DAG, LegalOperations)); - // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) - if 
(DAG.getTarget().Options.UnsafeFPMath && N1CFP && - N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && - isa<ConstantFPSDNode>(N0.getOperand(1))) - return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FADD, SDLoc(N), VT, - N0.getOperand(1), N1)); - - // No FP constant should be created after legalization as Instruction - // Selection pass has hard time in dealing with FP constant. - // - // We don't need test this condition for transformation like following, as - // the DAG being transformed implies it is legal to take FP constant as - // operand. - // - // (fadd (fmul c, x), x) -> (fmul c+1, x) - // - bool AllowNewFpConst = (Level < AfterLegalizeDAG); - - // If allow, fold (fadd (fneg x), x) -> 0.0 - if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath && - N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) - return DAG.getConstantFP(0.0, VT); + // If 'unsafe math' is enabled, fold lots of things. + if (Options.UnsafeFPMath) { + // No FP constant should be created after legalization as Instruction + // Selection pass has a hard time dealing with FP constants. + bool AllowNewConst = (Level < AfterLegalizeDAG); - // If allow, fold (fadd x, (fneg x)) -> 0.0 - if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath && - N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) - return DAG.getConstantFP(0.0, VT); - - // In unsafe math mode, we can fold chains of FADD's of the same value - // into multiplications. This transform is not safe in general because - // we are reducing the number of rounding steps. - if (DAG.getTarget().Options.UnsafeFPMath && - TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && - !N0CFP && !N1CFP) { - if (N0.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); - ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); - - // (fadd (fmul c, x), x) -> (fmul x, c+1) - if (CFP00 && !CFP01 && N0.getOperand(1) == N1) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP00, 0), - DAG.getConstantFP(1.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N1, NewCFP); - } + // fold (fadd A, 0) -> A + if (N1CFP && N1CFP->getValueAPF().isZero()) + return N0; - // (fadd (fmul x, c), x) -> (fmul x, c+1) - if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP01, 0), - DAG.getConstantFP(1.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N1, NewCFP); - } + // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) + if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && + isa<ConstantFPSDNode>(N0.getOperand(1))) + return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FADD, SDLoc(N), VT, + N0.getOperand(1), N1)); + + // If allowed, fold (fadd (fneg x), x) -> 0.0 + if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) + return DAG.getConstantFP(0.0, VT); + + // If allowed, fold (fadd x, (fneg x)) -> 0.0 + if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) + return DAG.getConstantFP(0.0, VT); + + // We can fold chains of FADD's of the same value into multiplications. + // This transform is not safe in general because we are reducing the number + // of rounding steps. 
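The consolidated Options.UnsafeFPMath block above gathers folds such as (fadd (fmul x, c), x) -> (fmul x, c+1); they are unsafe precisely because, as the comment says, they reduce the number of rounding steps. A quick host check with ordinary IEEE doubles:

#include <cstdio>

int main() {
  double x = 0.1, c = 3.0;
  double before = x * c + x;     // (fadd (fmul x, c), x): rounds twice
  double after  = x * (c + 1.0); // (fmul x, c+1):         rounds once
  std::printf("before = %.17g\nafter  = %.17g\nequal = %d\n",
              before, after, before == after);
  // The two are close but not guaranteed bit-identical, which is why the
  // combine is only done under unsafe FP math.
  return 0;
}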
+ if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { + if (N0.getOpcode() == ISD::FMUL) { + ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); + ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); + + // (fadd (fmul x, c), x) -> (fmul x, c+1) + if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { + SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, + SDValue(CFP01, 0), + DAG.getConstantFP(1.0, VT)); + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, NewCFP); + } - // (fadd (fmul c, x), (fadd x, x)) -> (fmul x, c+2) - if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD && - N1.getOperand(0) == N1.getOperand(1) && - N0.getOperand(1) == N1.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP00, 0), - DAG.getConstantFP(2.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0.getOperand(1), NewCFP); + // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) + if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && + N1.getOperand(0) == N1.getOperand(1) && + N0.getOperand(0) == N1.getOperand(0)) { + SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, + SDValue(CFP01, 0), + DAG.getConstantFP(2.0, VT)); + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, + N0.getOperand(0), NewCFP); + } } - // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) - if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && - N1.getOperand(0) == N1.getOperand(1) && - N0.getOperand(0) == N1.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP01, 0), - DAG.getConstantFP(2.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0.getOperand(0), NewCFP); - } - } + if (N1.getOpcode() == ISD::FMUL) { + ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); + ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1)); - if (N1.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); - ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1)); + // (fadd x, (fmul x, c)) -> (fmul x, c+1) + if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { + SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, + SDValue(CFP11, 0), + DAG.getConstantFP(1.0, VT)); + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, NewCFP); + } - // (fadd x, (fmul c, x)) -> (fmul x, c+1) - if (CFP10 && !CFP11 && N1.getOperand(1) == N0) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP10, 0), - DAG.getConstantFP(1.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0, NewCFP); + // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) + if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && + N0.getOperand(0) == N0.getOperand(1) && + N1.getOperand(0) == N0.getOperand(0)) { + SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, + SDValue(CFP11, 0), + DAG.getConstantFP(2.0, VT)); + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1.getOperand(0), NewCFP); + } } - // (fadd x, (fmul x, c)) -> (fmul x, c+1) - if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP11, 0), - DAG.getConstantFP(1.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0, NewCFP); + if (N0.getOpcode() == ISD::FADD && AllowNewConst) { + ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); + // (fadd (fadd x, x), x) -> (fmul x, 3.0) + if (!CFP && N0.getOperand(0) == N0.getOperand(1) && + (N0.getOperand(0) == N1)) + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, + N1, DAG.getConstantFP(3.0, VT)); } - - 
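Unlike the c+1 folds, the (fadd (fadd x, x), x) -> (fmul x, 3.0) collapse just above (and the 4.0 variant in the next hunk) is exact in IEEE arithmetic: x + x scales by two without rounding, so both forms round the single exact value 3x once. It still sits inside the unsafe-math block, behind AllowNewConst, because it materializes a fresh FP constant after legalization. A host check, assuming ordinary IEEE doubles:

#include <cstdio>

int main() {
  double x = 0.1;
  bool mismatch = false;
  for (int i = 0; i < 800; ++i, x *= 1.37)
    if ((x + x) + x != x * 3.0)
      mismatch = true;
  std::printf("mismatch: %s\n", mismatch ? "yes" : "no"); // expect "no"
  return 0;
}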
// (fadd (fadd x, x), (fmul c, x)) -> (fmul x, c+2) - if (CFP10 && !CFP11 && N0.getOpcode() == ISD::FADD && - N0.getOperand(0) == N0.getOperand(1) && - N1.getOperand(1) == N0.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP10, 0), - DAG.getConstantFP(2.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N1.getOperand(1), NewCFP); + if (N1.getOpcode() == ISD::FADD && AllowNewConst) { + ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); + // (fadd x, (fadd x, x)) -> (fmul x, 3.0) + if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && + N1.getOperand(0) == N0) + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, + N0, DAG.getConstantFP(3.0, VT)); } - // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) - if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && + // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) + if (AllowNewConst && + N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && - N1.getOperand(0) == N0.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT, - SDValue(CFP11, 0), - DAG.getConstantFP(2.0, VT)); - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N1.getOperand(0), NewCFP); - } - } - - if (N0.getOpcode() == ISD::FADD && AllowNewFpConst) { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); - // (fadd (fadd x, x), x) -> (fmul x, 3.0) - if (!CFP && N0.getOperand(0) == N0.getOperand(1) && - (N0.getOperand(0) == N1)) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N1, DAG.getConstantFP(3.0, VT)); - } - - if (N1.getOpcode() == ISD::FADD && AllowNewFpConst) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); - // (fadd x, (fadd x, x)) -> (fmul x, 3.0) - if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && - N1.getOperand(0) == N0) + N1.getOperand(0) == N1.getOperand(1) && + N0.getOperand(0) == N1.getOperand(0)) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0, DAG.getConstantFP(3.0, VT)); + N0.getOperand(0), DAG.getConstantFP(4.0, VT)); } - - // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) - if (AllowNewFpConst && - N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && - N0.getOperand(0) == N0.getOperand(1) && - N1.getOperand(0) == N1.getOperand(1) && - N0.getOperand(0) == N1.getOperand(0)) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0.getOperand(0), - DAG.getConstantFP(4.0, VT)); - } + } // enable-unsafe-fp-math // FADD -> FMA combines: - if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || - DAG.getTarget().Options.UnsafeFPMath) && - DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) && + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) - if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) + if (N0.getOpcode() == ISD::FMUL && + (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), N1); // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. 
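The FADD -> FMA combines, now also taken for multi-use multiplies when TLI.enableAggressiveFMAFusion(VT) says so, are gated on FPOpFusion::Fast or unsafe math because an fma rounds once where fmul followed by fadd rounds twice. A host demo via the C library fma (assuming the usual IEEE doubles):

#include <cmath>
#include <cstdio>

int main() {
  double e = std::ldexp(1.0, -27);  // 2^-27
  double x = 1.0 + e, y = 1.0 + e, z = -1.0;
  double separate = x * y + z;      // (fadd (fmul x, y), z): two roundings
  double fused = std::fma(x, y, z); // (fma x, y, z): one rounding
  std::printf("separate = %a\nfused    = %a\n", separate, fused);
  // fused keeps the low product bits (2^-26 + 2^-54); separate loses them
  // when the double-width product is rounded before the add.
  return 0;
}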
- if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) + if (N1.getOpcode() == ISD::FMUL && + (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1.getOperand(0), N1.getOperand(1), N0); + + // When FP_EXTEND nodes are free on the target, and there is an opportunity + // to combine into FMA, arrange such nodes accordingly. + if (TLI.isFPExtFree(VT)) { + + // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(1)), N1); + } + + // fold (fadd x, (fpext (fmul y, z)), z) -> (fma (fpext y), (fpext z), x) + // Note: Commutes FADD operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(1)), N0); + } + } + + // More folding opportunities when target permits. + if (TLI.enableAggressiveFMAFusion(VT)) { + + // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) + if (N0.getOpcode() == ISD::FMA && + N0.getOperand(2).getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + N1)); + + // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) + if (N1->getOpcode() == ISD::FMA && + N1.getOperand(2).getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N1.getOperand(0), N1.getOperand(1), + DAG.getNode(ISD::FMA, SDLoc(N), VT, + N1.getOperand(2).getOperand(0), + N1.getOperand(2).getOperand(1), + N0)); + } } return SDValue(); @@ -6682,10 +7015,11 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc dl(N); + const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { @@ -6696,60 +7030,60 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, N1); - // fold (fsub A, 0) -> A - if (DAG.getTarget().Options.UnsafeFPMath && - N1CFP && N1CFP->getValueAPF().isZero()) - return N0; - // fold (fsub 0, B) -> -B - if (DAG.getTarget().Options.UnsafeFPMath && - N0CFP && N0CFP->getValueAPF().isZero()) { - if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) - return GetNegatedExpression(N1, DAG, LegalOperations); - if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) - return DAG.getNode(ISD::FNEG, dl, VT, N1); - } + // fold (fsub A, (fneg B)) -> (fadd A, B) - if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) + if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, dl, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); - // If 'unsafe math' is enabled, fold - // (fsub x, x) 
-> 0.0 & - // (fsub x, (fadd x, y)) -> (fneg y) & - // (fsub x, (fadd y, x)) -> (fneg y) - if (DAG.getTarget().Options.UnsafeFPMath) { + // If 'unsafe math' is enabled, fold lots of things. + if (Options.UnsafeFPMath) { + // (fsub A, 0) -> A + if (N1CFP && N1CFP->getValueAPF().isZero()) + return N0; + + // (fsub 0, B) -> -B + if (N0CFP && N0CFP->getValueAPF().isZero()) { + if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) + return GetNegatedExpression(N1, DAG, LegalOperations); + if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) + return DAG.getNode(ISD::FNEG, dl, VT, N1); + } + + // (fsub x, x) -> 0.0 if (N0 == N1) return DAG.getConstantFP(0.0f, VT); + // (fsub x, (fadd x, y)) -> (fneg y) + // (fsub x, (fadd y, x)) -> (fneg y) if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); - if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, - &DAG.getTarget().Options)) + if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options)) return GetNegatedExpression(N11, DAG, LegalOperations); - if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, - &DAG.getTarget().Options)) + if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options)) return GetNegatedExpression(N10, DAG, LegalOperations); } } // FSUB -> FMA combines: - if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || - DAG.getTarget().Options.UnsafeFPMath) && - DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) && + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && + TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) - if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) + if (N0.getOpcode() == ISD::FMUL && + (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, dl, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, dl, VT, N1)); // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. - if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) + if (N1.getOpcode() == ISD::FMUL && + (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT))) return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, N1.getOperand(0)), @@ -6758,13 +7092,115 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0).getOpcode() == ISD::FMUL && - N0->hasOneUse() && N0.getOperand(0).hasOneUse()) { + ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) || + TLI.enableAggressiveFMAFusion(VT))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(ISD::FMA, dl, VT, DAG.getNode(ISD::FNEG, dl, VT, N00), N01, DAG.getNode(ISD::FNEG, dl, VT, N1)); } + + // When FP_EXTEND nodes are free on the target, and there is an opportunity + // to combine into FMA, arrange such nodes accordingly. 
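The unsafe-math FSUB folds above ((fsub A, 0) -> A and (fsub x, x) -> 0.0) are gated for concrete IEEE reasons: isZero() matches -0.0 as well as +0.0, and -0.0 - (-0.0) is +0.0, so dropping the subtraction can flip a zero's sign; likewise x - x is NaN, not zero, for infinite or NaN x. A quick host check of those corner cases (plain C++, not DAG code):

#include <cmath>
#include <cstdio>

int main() {
  double a = -0.0, c = -0.0;  // c would match isZero()
  std::printf("a - c: signbit %d, folded A: signbit %d\n",
              std::signbit(a - c), std::signbit(a)); // +0.0 vs -0.0
  double x = INFINITY;
  std::printf("x - x = %g, folded constant = %g\n", x - x, 0.0); // nan vs 0
  return 0;
}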
+ if (TLI.isFPExtFree(VT)) { + + // fold (fsub (fpext (fmul x, y)), z) + // -> (fma (fpext x), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(0)), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N00.getOperand(1)), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1)); + } + + // fold (fsub x, (fpext (fmul y, z))) + // -> (fma (fneg (fpext y)), (fpext z), x) + // Note: Commutes FSUB operands. + if (N1.getOpcode() == ISD::FP_EXTEND) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N10.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N10.getOperand(1)), + N0); + } + + // fold (fsub (fpext (fneg (fmul, x, y))), z) + // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FP_EXTEND) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FNEG) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N000.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N000.getOperand(1)), + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } + } + } + + // fold (fsub (fneg (fpext (fmul, x, y))), z) + // -> (fma (fneg (fpext x)), (fpext y), (fneg z)) + if (N0.getOpcode() == ISD::FNEG) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::FP_EXTEND) { + SDValue N000 = N00.getOperand(0); + if (N000.getOpcode() == ISD::FMUL) { + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), + VT, N000.getOperand(0))), + DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, + N000.getOperand(1)), + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } + } + } + } + + // More folding opportunities when target permits. 
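The isFPExtFree(VT) arrangement above (visitFADD gained the analogous block earlier) converts e.g. (fsub (fpext (fmul x, y)), z) into (fma (fpext x), (fpext y), (fneg z)). Besides assuming free extends, this moves the multiply's rounding from the narrow type into the wide fma, so results can change in the low bits. A float-to-double host illustration:

#include <cmath>
#include <cstdio>

int main() {
  float e = std::ldexp(1.0f, -12);  // 2^-12
  float x = 1.0f + e, y = 1.0f + e;
  double z = 1.0;
  double before = (double)(x * y) - z;                // fmul in float, fpext
  double after = std::fma((double)x, (double)y, -z);  // combined form
  std::printf("before = %a\nafter  = %a\n", before, after);
  // The float product rounds away the 2^-24 term that the double-width
  // fma preserves.
  return 0;
}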
+ if (TLI.enableAggressiveFMAFusion(VT)) { + + // fold (fsub (fma x, y, (fmul u, v)), z) + // -> (fma x, y (fma u, v, (fneg z))) + if (N0.getOpcode() == ISD::FMA && + N0.getOperand(2).getOpcode() == ISD::FMUL) + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N1))); + + // fold (fsub x, (fma y, z, (fmul u, v))) + // -> (fma (fneg y), z, (fma (fneg u), v, x)) + if (N1.getOpcode() == ISD::FMA && + N1.getOperand(2).getOpcode() == ISD::FMUL) { + SDValue N20 = N1.getOperand(2).getOperand(0); + SDValue N21 = N1.getOperand(2).getOperand(1); + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N1.getOperand(0)), + N1.getOperand(1), + DAG.getNode(ISD::FMA, SDLoc(N), VT, + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N20), + N21, N0)); + } + } } return SDValue(); @@ -6773,47 +7209,82 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { + // This just handles C1 * C2 for vectors. Other vector folds are below. SDValue FoldedVOp = SimplifyVBinOp(N); - if (FoldedVOp.getNode()) return FoldedVOp; + if (FoldedVOp.getNode()) + return FoldedVOp; + // Canonicalize vector constant to RHS. + if (N0.getOpcode() == ISD::BUILD_VECTOR && + N1.getOpcode() != ISD::BUILD_VECTOR) + if (auto *BV0 = dyn_cast<BuildVectorSDNode>(N0)) + if (BV0->isConstant()) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); } // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, N1); + // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, N0); - // fold (fmul A, 0) -> 0 - if (DAG.getTarget().Options.UnsafeFPMath && - N1CFP && N1CFP->getValueAPF().isZero()) - return N1; - // fold (fmul A, 0) -> 0, vector edition. - if (DAG.getTarget().Options.UnsafeFPMath && - ISD::isBuildVectorAllZeros(N1.getNode())) - return N1; + // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; + + if (Options.UnsafeFPMath) { + // fold (fmul A, 0) -> 0 + if (N1CFP && N1CFP->getValueAPF().isZero()) + return N1; + + // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) + if (N0.getOpcode() == ISD::FMUL) { + // Fold scalars or any vector constants (not just splats). + // This fold is done in general by InstCombine, but extra fmul insts + // may have been generated during lowering. 
+ SDValue N01 = N0.getOperand(1); + auto *BV1 = dyn_cast<BuildVectorSDNode>(N1); + auto *BV01 = dyn_cast<BuildVectorSDNode>(N01); + if ((N1CFP && isConstOrConstSplatFP(N01)) || + (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { + SDLoc SL(N); + SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1); + return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts); + } + } + + // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) + // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs + // during an early run of DAGCombiner can prevent folding with fmuls + // inserted during lowering. + if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) { + SDLoc SL(N); + const SDValue Two = DAG.getConstantFP(2.0, VT); + SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, Two, N1); + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), MulConsts); + } + } + // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N0); + // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, - &DAG.getTarget().Options)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, - &DAG.getTarget().Options)) { + if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { + if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) @@ -6823,14 +7294,6 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { } } - // If allowed, fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) - if (DAG.getTarget().Options.UnsafeFPMath && - N1CFP && N0.getOpcode() == ISD::FMUL && - N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1))) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0.getOperand(1), N1)); - return SDValue(); } @@ -6842,8 +7305,16 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDLoc dl(N); + const TargetOptions &Options = DAG.getTarget().Options; - if (DAG.getTarget().Options.UnsafeFPMath) { + // Constant fold FMA. 
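visitFMUL draws the safe/unsafe line the same way: fmul X, 2.0 -> fadd X, X is exact and stays outside the Options.UnsafeFPMath block, while the (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) reassociation above moved inside it. A host comparison of the two:

#include <cstdio>

int main() {
  double x = 0.7;
  std::printf("x*2.0 == x+x: %d\n", x * 2.0 == x + x); // always 1: exact
  double c1 = 0.1, c2 = 0.3;
  std::printf("(x*c1)*c2 == x*(c1*c2): %d\n",
              (x * c1) * c2 == x * (c1 * c2)); // not guaranteed: rounding
  return 0;
}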
+ if (isa<ConstantFPSDNode>(N0) && + isa<ConstantFPSDNode>(N1) && + isa<ConstantFPSDNode>(N2)) { + return DAG.getNode(ISD::FMA, dl, VT, N0, N1, N2); + } + + if (Options.UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) @@ -6859,7 +7330,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) - if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && + if (Options.UnsafeFPMath && N1CFP && N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && N2.getOperand(1).getOpcode() == ISD::ConstantFP) { @@ -6869,7 +7340,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) - if (DAG.getTarget().Options.UnsafeFPMath && + if (Options.UnsafeFPMath && N0.getOpcode() == ISD::FMUL && N1CFP && N0.getOperand(1).getOpcode() == ISD::ConstantFP) { return DAG.getNode(ISD::FMA, dl, VT, @@ -6893,13 +7364,13 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { } // (fma x, c, x) -> (fmul x, (c+1)) - if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2) + if (Options.UnsafeFPMath && N1CFP && N0 == N2) return DAG.getNode(ISD::FMUL, dl, VT, N0, DAG.getNode(ISD::FADD, dl, VT, N1, DAG.getConstantFP(1.0, VT))); // (fma x, c, (fneg x)) -> (fmul x, (c-1)) - if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && + if (Options.UnsafeFPMath && N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) return DAG.getNode(ISD::FMUL, dl, VT, N0, DAG.getNode(ISD::FADD, dl, VT, @@ -6915,7 +7386,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); + const TargetOptions &Options = DAG.getTarget().Options; // fold vector ops if (VT.isVector()) { @@ -6927,30 +7399,79 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); - // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. - if (N1CFP && DAG.getTarget().Options.UnsafeFPMath) { - // Compute the reciprocal 1.0 / c2. - APFloat N1APF = N1CFP->getValueAPF(); - APFloat Recip(N1APF.getSemantics(), 1); // 1.0 - APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); - // Only do the transform if the reciprocal is a legal fp immediate that - // isn't too nasty (eg NaN, denormal, ...). - if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty - (!LegalOperations || - // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM - // backend)... we should handle this gracefully after Legalize. - // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) || - TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || - TLI.isFPImmLegal(Recip, VT))) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, - DAG.getConstantFP(Recip, VT)); + if (Options.UnsafeFPMath) { + // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. + if (N1CFP) { + // Compute the reciprocal 1.0 / c2. + APFloat N1APF = N1CFP->getValueAPF(); + APFloat Recip(N1APF.getSemantics(), 1); // 1.0 + APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); + // Only do the transform if the reciprocal is a legal fp immediate that + // isn't too nasty (eg NaN, denormal, ...). 
+ if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty + (!LegalOperations || + // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM + // backend)... we should handle this gracefully after Legalize. + // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) || + TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || + TLI.isFPImmLegal(Recip, VT))) + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, + DAG.getConstantFP(Recip, VT)); + } + + // If this FDIV is part of a reciprocal square root, it may be folded + // into a target-specific square root estimate instruction. + if (N1.getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } else if (N1.getOpcode() == ISD::FP_EXTEND && + N1.getOperand(0).getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } else if (N1.getOpcode() == ISD::FP_ROUND && + N1.getOperand(0).getOpcode() == ISD::FSQRT) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } else if (N1.getOpcode() == ISD::FMUL) { + // Look through an FMUL. Even though this won't remove the FDIV directly, + // it's still worthwhile to get rid of the FSQRT if possible. + SDValue SqrtOp; + SDValue OtherOp; + if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { + SqrtOp = N1.getOperand(0); + OtherOp = N1.getOperand(1); + } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { + SqrtOp = N1.getOperand(1); + OtherOp = N1.getOperand(0); + } + if (SqrtOp.getNode()) { + // We found a FSQRT, so try to make this fold: + // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) + if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) { + RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp); + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } + } + } + + // Fold into a reciprocal estimate and multiply instead of a real divide. + if (SDValue RV = BuildReciprocalEstimate(N1)) { + AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + } } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, - &DAG.getTarget().Options)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, - &DAG.getTarget().Options)) { + if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { + if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) @@ -6960,6 +7481,44 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } } + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal. + // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) + // Notice that this is not always beneficial. One reason is different target + // may have different costs for FDIV and FMUL, so sometimes the cost of two + // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason + // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". + if (Options.UnsafeFPMath) { + // Skip if current node is a reciprocal. 
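BuildRsqrtEstimate and BuildReciprocalEstimate, used throughout this block to turn fdiv and fdiv-by-sqrt into multiplies, expand on opting-in targets to a hardware estimate refined by Newton-Raphson steps. A host-side sketch of the refinement math only (the seed values are arbitrary stand-ins for a target's estimate instruction):

#include <cstdio>

int main() {
  double d = 3.0;

  // Reciprocal: e' = e * (2 - d*e) converges quadratically to 1/d.
  double e = 0.3;
  for (int i = 0; i < 4; ++i)
    e = e * (2.0 - d * e);
  std::printf("1/3       ~= %.17g\n", e);

  // Reciprocal square root: r' = r * (1.5 - 0.5*d*r*r) -> 1/sqrt(d).
  double r = 0.5;
  for (int i = 0; i < 5; ++i)
    r = r * (1.5 - 0.5 * d * r * r);
  std::printf("1/sqrt(3) ~= %.17g\n", r);
  return 0;
}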
+ if (N0CFP && N0CFP->isExactlyValue(1.0)) + return SDValue(); + + SmallVector<SDNode *, 4> Users; + // Find all FDIV users of the same divisor. + for (SDNode::use_iterator UI = N1.getNode()->use_begin(), + UE = N1.getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = UI.getUse().getUser(); + if (User->getOpcode() == ISD::FDIV && User->getOperand(1) == N1) + Users.push_back(User); + } + + if (TLI.combineRepeatedFPDivisors(Users.size())) { + SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0 + SDValue Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, N1); + + // Dividend / Divisor -> Dividend * Reciprocal + for (auto I = Users.begin(), E = Users.end(); I != E; ++I) { + if ((*I)->getOperand(0) != FPOne) { + SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(*I), VT, + (*I)->getOperand(0), Reciprocal); + DAG.ReplaceAllUsesWith(*I, NewNode.getNode()); + } + } + return SDValue(); + } + } + return SDValue(); } @@ -6977,6 +7536,32 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFSQRT(SDNode *N) { + if (DAG.getTarget().Options.UnsafeFPMath && + !TLI.isFsqrtCheap()) { + // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) + if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { + EVT VT = RV.getValueType(); + RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV); + AddToWorklist(RV.getNode()); + + // Unfortunately, RV is now NaN if the input was exactly 0. + // Select out this case and force the answer to 0. + SDValue Zero = DAG.getConstantFP(0.0, VT); + SDValue ZeroCmp = + DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT), + N->getOperand(0), Zero, ISD::SETEQ); + AddToWorklist(ZeroCmp.getNode()); + AddToWorklist(RV.getNode()); + + RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, + SDLoc(N), VT, ZeroCmp, Zero, RV); + return RV; + } + } + return SDValue(); +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -7222,7 +7807,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) { + TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), @@ -7239,6 +7824,43 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFCEIL(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (fceil c1) -> fceil(c1) + if (N0CFP) + return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); + + return SDValue(); +} + +SDValue DAGCombiner::visitFTRUNC(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (ftrunc c1) -> ftrunc(c1) + if (N0CFP) + return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); + + return SDValue(); +} + +SDValue DAGCombiner::visitFFLOOR(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (ffloor c1) -> ffloor(c1) + if (N0CFP) + return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); + + return SDValue(); +} + +// FIXME: FNEG and FABS have a lot in common; refactor. 
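The new visitFSQRT computes sqrt(X) as X * rsqrt(X), and the select against zero exists because that product is 0 * inf, i.e. NaN, at X == 0. The corner case on a host:

#include <cmath>
#include <cstdio>

int main() {
  double x = 0.0;
  double rsqrt = 1.0 / std::sqrt(x);     // +inf at x == 0
  double raw = x * rsqrt;                // 0 * inf == NaN
  double fixed = (x == 0.0) ? 0.0 : raw; // the SELECT the combine inserts
  std::printf("raw = %g, fixed = %g\n", raw, fixed);
  return 0;
}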
SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7248,26 +7870,36 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (FoldedVOp.getNode()) return FoldedVOp; } + // Constant fold FNEG. + if (isa<ConstantFPSDNode>(N0)) + return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N->getOperand(0)); + if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); - // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading + // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. - // TODO: We can also optimize for vectors here, but we need to make sure - // that the sign mask is created properly for each vector element. - if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && - !VT.isVector() && - N0.getNode()->hasOneUse() && - N0.getOperand(0).getValueType().isInteger()) { + if (!TLI.isFNegFree(VT) && + N0.getOpcode() == ISD::BITCAST && + N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { + APInt SignMask; + if (N0.getValueType().isVector()) { + // For a vector, get a mask such as 0x80... per scalar element + // and splat it. + SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits()); + SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); + } else { + // For a scalar, just generate 0x80... + SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + } Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int, - DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT)); + DAG.getConstant(SignMask, IntVT)); AddToWorklist(Int.getNode()); - return DAG.getNode(ISD::BITCAST, SDLoc(N), - VT, Int); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int); } } @@ -7289,45 +7921,50 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitFCEIL(SDNode *N) { +SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - EVT VT = N->getValueType(0); - - // fold (fceil c1) -> fceil(c1) - if (N0CFP) - return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); - - return SDValue(); -} + SDValue N1 = N->getOperand(1); + const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); -SDValue DAGCombiner::visitFTRUNC(SDNode *N) { - SDValue N0 = N->getOperand(0); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - EVT VT = N->getValueType(0); + if (N0CFP && N1CFP) { + const APFloat &C0 = N0CFP->getValueAPF(); + const APFloat &C1 = N1CFP->getValueAPF(); + return DAG.getConstantFP(minnum(C0, C1), N->getValueType(0)); + } - // fold (ftrunc c1) -> ftrunc(c1) - if (N0CFP) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); + if (N0CFP) { + EVT VT = N->getValueType(0); + // Canonicalize to constant on RHS. 
+ return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); + } return SDValue(); } -SDValue DAGCombiner::visitFFLOOR(SDNode *N) { +SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - EVT VT = N->getValueType(0); + SDValue N1 = N->getOperand(1); + const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); - // fold (ffloor c1) -> ffloor(c1) - if (N0CFP) - return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); + if (N0CFP && N1CFP) { + const APFloat &C0 = N0CFP->getValueAPF(); + const APFloat &C1 = N1CFP->getValueAPF(); + return DAG.getConstantFP(maxnum(C0, C1), N->getValueType(0)); + } + + if (N0CFP) { + EVT VT = N->getValueType(0); + // Canonicalize to constant on RHS. + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); + } return SDValue(); } SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); EVT VT = N->getValueType(0); if (VT.isVector()) { @@ -7336,32 +7973,40 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { } // fold (fabs c1) -> fabs(c1) - if (N0CFP) + if (isa<ConstantFPSDNode>(N0)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); + // fold (fabs (fabs x)) -> (fabs x) if (N0.getOpcode() == ISD::FABS) return N->getOperand(0); + // fold (fabs (fneg x)) -> (fabs x) // fold (fabs (fcopysign x, y)) -> (fabs x) if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); - // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading + // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading // constant pool values. - // TODO: We can also optimize for vectors here, but we need to make sure - // that the sign mask is created properly for each vector element. if (!TLI.isFAbsFree(VT) && - N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() && - N0.getOperand(0).getValueType().isInteger() && - !VT.isVector()) { + N0.getOpcode() == ISD::BITCAST && + N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { + APInt SignMask; + if (N0.getValueType().isVector()) { + // For a vector, get a mask such as 0x7f... per scalar element + // and splat it. + SignMask = ~APInt::getSignBit(N0.getValueType().getScalarSizeInBits()); + SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); + } else { + // For a scalar, just generate 0x7f... + SignMask = ~APInt::getSignBit(IntVT.getSizeInBits()); + } Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int, - DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT)); + DAG.getConstant(SignMask, IntVT)); AddToWorklist(Int.getNode()); - return DAG.getNode(ISD::BITCAST, SDLoc(N), - N->getValueType(0), Int); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int); } } @@ -7441,15 +8086,12 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { // will convert it back to (X & C1) >> C2. CombineTo(N, NewBRCond, false); // Truncate is dead. - if (Trunc) { - removeFromWorklist(Trunc); - DAG.DeleteNode(Trunc); - } + if (Trunc) + deleteAndRecombine(Trunc); // Replace the uses of SRL with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); - removeFromWorklist(N1.getNode()); - DAG.DeleteNode(N1.getNode()); + deleteAndRecombine(N1.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! 
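The visitFNEG/visitFABS rewrites above generalize the bitcast trick to vectors by splatting a per-element sign mask with APInt::getSplat: fneg becomes an integer xor with the sign bit, fabs an and with its complement, avoiding a constant-pool FP load. A scalar host model of both masks:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double v = -1.5;
  uint64_t bits, sign = 1ull << 63;
  std::memcpy(&bits, &v, sizeof v);

  uint64_t negBits = bits ^ sign;  // fneg: flip the IEEE sign bit
  uint64_t absBits = bits & ~sign; // fabs: clear the IEEE sign bit

  double neg, abs;
  std::memcpy(&neg, &negBits, sizeof neg);
  std::memcpy(&abs, &absBits, sizeof abs);
  std::printf("fneg(%g) = %g, fabs(%g) = %g\n", v, neg, v, abs);
  return 0;
}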
} } @@ -7478,8 +8120,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, Tmp); - removeFromWorklist(TheXor); - DAG.DeleteNode(TheXor); + deleteAndRecombine(TheXor); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, Tmp, N2); } @@ -7509,8 +8150,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { // Replace the uses of XOR with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); - removeFromWorklist(N1.getNode()); - DAG.DeleteNode(N1.getNode()); + deleteAndRecombine(N1.getNode()); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, SetCC, N2); } @@ -7547,9 +8187,8 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) { return SDValue(); } -/// canFoldInAddressingMode - Return true if 'Use' is a load or a store that -/// uses N as its base pointer and that N may be folded in the load / store -/// addressing mode. +/// Return true if 'Use' is a load or a store that uses N as its base pointer +/// and that N may be folded in the load / store addressing mode. static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, const TargetLowering &TLI) { @@ -7588,12 +8227,11 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext())); } -/// CombineToPreIndexedLoadStore - Try turning a load / store into a -/// pre-indexed load / store when the base pointer is an add or subtract -/// and it has other uses besides the load / store. After the -/// transformation, the new indexed load / store has effectively folded -/// the add / subtract in and all of its other uses are redirected to the -/// new load / store. +/// Try turning a load/store into a pre-indexed load/store when the base +/// pointer is an add or subtract and it has other uses besides the load/store. +/// After the transformation, the new indexed load/store has effectively folded +/// the add/subtract in and all of its other uses are redirected to the +/// new load/store. bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; @@ -7754,7 +8392,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { } // Finally, since the node is now dead, remove it from the graph. - DAG.DeleteNode(N); + deleteAndRecombine(N); if (Swapped) std::swap(BasePtr, Offset); @@ -7804,23 +8442,20 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { SDLoc(OtherUses[i]), OtherUses[i]->getValueType(0), NewOp1, NewOp2); DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); - removeFromWorklist(OtherUses[i]); - DAG.DeleteNode(OtherUses[i]); + deleteAndRecombine(OtherUses[i]); } // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); - removeFromWorklist(Ptr.getNode()); - DAG.DeleteNode(Ptr.getNode()); + deleteAndRecombine(Ptr.getNode()); return true; } -/// CombineToPostIndexedLoadStore - Try to combine a load / store with a -/// add / sub of the base pointer node into a post-indexed load / store. -/// The transformation folded the add / subtract into the new indexed -/// load / store effectively and all of its uses are redirected to the -/// new load / store. +/// Try to combine a load/store with a add/sub of the base pointer node into a +/// post-indexed load/store. 
The transformation folded the add/subtract into the +/// new indexed load/store effectively and all of its uses are redirected to the +/// new load/store. bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; @@ -7924,13 +8559,12 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { } // Finally, since the node is now dead, remove it from the graph. - DAG.DeleteNode(N); + deleteAndRecombine(N); // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), Result.getValue(isLoad ? 1 : 0)); - removeFromWorklist(Op); - DAG.DeleteNode(Op); + deleteAndRecombine(Op); return true; } } @@ -7939,6 +8573,30 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { return false; } +/// \brief Return the base-pointer arithmetic from an indexed \p LD. +SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + assert(AM != ISD::UNINDEXED); + SDValue BP = LD->getOperand(1); + SDValue Inc = LD->getOperand(2); + + // Some backends use TargetConstants for load offsets, but don't expect + // TargetConstants in general ADD nodes. We can convert these constants into + // regular Constants (if the constant is not opaque). + assert((Inc.getOpcode() != ISD::TargetConstant || + !cast<ConstantSDNode>(Inc)->isOpaque()) && + "Cannot split out indexing using opaque target constants"); + if (Inc.getOpcode() == ISD::TargetConstant) { + ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc); + Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), + ConstInc->getValueType(0)); + } + + unsigned Opc = + (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); + return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); +} + SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(N); SDValue Chain = LD->getChain(); @@ -7965,18 +8623,33 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); - if (N->use_empty()) { - removeFromWorklist(N); - DAG.DeleteNode(N); - } + if (N->use_empty()) + deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); - if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) { + + // If this load has an opaque TargetConstant offset, then we cannot split + // the indexing into an add/sub directly (that TargetConstant may not be + // valid for a different type of node, and we cannot convert an opaque + // target constant into a regular constant). + bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant && + cast<ConstantSDNode>(LD->getOperand(2))->isOpaque(); + + if (!N->hasAnyUseOfValue(0) && + ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); + SDValue Index; + if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) { + Index = SplitIndexingFromLoad(LD); + // Try to fold the base pointer arithmetic into subsequent loads and + // stores. 
+ AddUsersToWorklist(N); + } else + Index = DAG.getUNDEF(N->getValueType(1)); DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; @@ -7984,11 +8657,9 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), - DAG.getUNDEF(N->getValueType(1))); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); - removeFromWorklist(N); - DAG.DeleteNode(N); + deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -8016,15 +8687,15 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { LD->getValueType(0), Chain, Ptr, LD->getPointerInfo(), LD->getMemoryVT(), - LD->isVolatile(), LD->isNonTemporal(), Align, - LD->getTBAAInfo()); + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), Align, LD->getAAInfo()); return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true); } } } - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA : - TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA(); + bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA + : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) @@ -8354,7 +9025,7 @@ struct LoadedSlice { // At this point, we know that we perform a cross-register-bank copy. // Check if it is expensive. - const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); // Assume bitcasts are cheap, unless both register classes do not // explicitly share a common sub class. if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) @@ -8618,9 +9289,9 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { return true; } -/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the -/// load is having specific bytes cleared out. If so, return the byte size -/// being masked out and the shift amount. +/// Check to see if V is (and load (ptr), imm), where the load is having +/// specific bytes cleared out. If so, return the byte size being masked out +/// and the shift amount. static std::pair<unsigned, unsigned> CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { std::pair<unsigned, unsigned> Result(0, 0); @@ -8693,9 +9364,9 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { } -/// ShrinkLoadReplaceStoreWithStore - Check to see if IVal is something that -/// provides a value as specified by MaskInfo. If so, replace the specified -/// store with a narrower store of truncated IVal. +/// Check to see if IVal is something that provides a value as specified by +/// MaskInfo. If so, replace the specified store with a narrower store of +/// truncated IVal. static SDNode * ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, SDValue IVal, StoreSDNode *St, @@ -8750,10 +9421,10 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, } -/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is -/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some -/// of the loaded bits, try narrowing the load and store if it would end up -/// being a win for performance or code size. +/// Look for sequence of load / op / store where op is one of 'or', 'xor', and +/// 'and' of immediates. 
If 'op' is only touching some of the loaded bits, try +/// narrowing the load and store if it would end up being a win for performance +/// or code size. SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { StoreSDNode *ST = cast<StoreSDNode>(N); if (ST->isVolatile()) @@ -8853,7 +9524,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { LD->getPointerInfo().getWithOffset(PtrOff), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), NewAlign, - LD->getTBAAInfo()); + LD->getAAInfo()); SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, DAG.getConstant(NewImm, NewVT)); SDValue NewST = DAG.getStore(Chain, SDLoc(N), @@ -8874,10 +9545,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { return SDValue(); } -/// TransformFPLoadStorePair - For a given floating point load / store pair, -/// if the load value isn't used by any other operations, then consider -/// transforming the pair to integer load / store operations if the target -/// deems the transformation profitable. +/// For a given floating point load / store pair, if the load value isn't used +/// by any other operations, then consider transforming the pair to integer +/// load / store operations if the target deems the transformation profitable. SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { StoreSDNode *ST = cast<StoreSDNode>(N); SDValue Chain = ST->getChain(); @@ -9051,7 +9721,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { return false; // Only look at ends of store sequences. - SDValue Chain = SDValue(St, 1); + SDValue Chain = SDValue(St, 0); if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE) return false; @@ -9082,7 +9752,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { StoreSDNode *Index = St; while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. - if (Index != St && !SDValue(Index, 1)->hasOneUse()) + if (Index != St && !SDValue(Index, 0)->hasOneUse()) break; // Find the base pointer and offset for this memory node. @@ -9279,7 +9949,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW); } else { - assert(false && "Invalid constant element type"); + llvm_unreachable("Invalid constant element type"); } } @@ -9313,8 +9983,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Since we know that St is redundant, just iterate. while (!St->use_empty()) DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - removeFromWorklist(St); - DAG.DeleteNode(St); + deleteAndRecombine(St); } return true; @@ -9373,6 +10042,13 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (LoadNodes.size() < 2) return false; + // If we have load/store pair instructions and we only have two values, + // don't bother. + unsigned RequiredAlignment; + if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && + St->getAlignment() >= RequiredAlignment) + return false; + // Scan the memory operations on the chain and find the first non-consecutive // load memory address. These variables hold the index in the store node // array. 
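A minimal standalone sketch of the consecutiveness scan described above (invented types, not LLVM's real base/offset analysis): starting from candidate memory operations ordered by address, count how many form one base+offset run and are therefore mergeable.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for the (base pointer, offset) pair the real
    // scan derives from each memory node.
    struct MemAddr { const void *Base; int64_t Offset; };

    // Returns the number of leading operations that form one consecutive
    // run, i.e. same base with offsets advancing by exactly ElementBytes.
    static std::size_t consecutiveRun(const std::vector<MemAddr> &Ops,
                                      int64_t ElementBytes) {
      std::size_t i = 1;
      for (; i < Ops.size(); ++i)
        if (Ops[i].Base != Ops[0].Base ||
            Ops[i].Offset != Ops[i - 1].Offset + ElementBytes)
          break;
      return i; // Ops[0..i) can be merged into a single wide access.
    }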
@@ -9408,9 +10084,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, StoreTy) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, StoreTy)) + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy)) LastLegalIntegerType = i+1; } } @@ -9488,8 +10164,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { continue; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain()); - removeFromWorklist(St); - DAG.DeleteNode(St); + deleteAndRecombine(St); } return true; @@ -9515,7 +10190,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getPointerInfo(), ST->isVolatile(), ST->isNonTemporal(), OrigAlign, - ST->getTBAAInfo()); + ST->getAAInfo()); } // Turn 'store undef, Ptr' -> nothing. @@ -9569,19 +10244,19 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, - ST->getAlignment(), TBAAInfo); + ST->getAlignment(), AAInfo); Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr, DAG.getConstant(4, Ptr.getValueType())); Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, - Alignment, TBAAInfo); + Alignment, AAInfo); return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, St0, St1); } @@ -9598,7 +10273,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), ST->getMemoryVT(), ST->isVolatile(), ST->isNonTemporal(), Align, - ST->getTBAAInfo()); + ST->getAAInfo()); } } @@ -9608,8 +10283,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (NewST.getNode()) return NewST; - bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA : - TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA(); + bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA + : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) @@ -9686,6 +10361,17 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } + // If this is a store followed by a store with the same value to the same + // location, then the store is dead/noop. + if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { + if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() && + ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() && + ST1->isUnindexed() && !ST1->isVolatile()) { + // The store is dead, remove it. + return Chain; + } + } + // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. 
if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) @@ -9791,6 +10477,87 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } +SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( + SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { + EVT ResultVT = EVE->getValueType(0); + EVT VecEltVT = InVecVT.getVectorElementType(); + unsigned Align = OriginalLoad->getAlignment(); + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + VecEltVT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) + return SDValue(); + + Align = NewAlign; + + SDValue NewPtr = OriginalLoad->getBasePtr(); + SDValue Offset; + EVT PtrType = NewPtr.getValueType(); + MachinePointerInfo MPI; + if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) { + int Elt = ConstEltNo->getZExtValue(); + unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; + if (TLI.isBigEndian()) + PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff; + Offset = DAG.getConstant(PtrOff, PtrType); + MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); + } else { + Offset = DAG.getNode( + ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo, + DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType())); + if (TLI.isBigEndian()) + Offset = DAG.getNode( + ISD::SUB, SDLoc(EVE), EltNo.getValueType(), + DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset); + MPI = OriginalLoad->getPointerInfo(); + } + NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset); + + // The replacement we need to do here is a little tricky: we need to + // replace an extractelement of a load with a load. + // Use ReplaceAllUsesOfValuesWith to do the replacement. + // Note that this replacement assumes that the extractvalue is the only + // use of the load; that's okay because we don't want to perform this + // transformation in other cases anyway. + SDValue Load; + SDValue Chain; + if (ResultVT.bitsGT(VecEltVT)) { + // If the result type of vextract is wider than the load, then issue an + // extending load instead. + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, + VecEltVT) + ? ISD::ZEXTLOAD + : ISD::EXTLOAD; + Load = DAG.getExtLoad( + ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI, + VecEltVT, OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), + OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo()); + Chain = Load.getValue(1); + } else { + Load = DAG.getLoad( + VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, + OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(), + OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo()); + Chain = Load.getValue(1); + if (ResultVT.bitsLT(VecEltVT)) + Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); + else + Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load); + } + WorklistRemover DeadNodes(*this); + SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; + SDValue To[] = { Load, Chain }; + DAG.ReplaceAllUsesOfValuesWith(From, To, 2); + // Since we're explicitly calling ReplaceAllUses, add the new node to the + // worklist explicitly as well. + AddToWorklist(Load.getNode()); + AddUsersToWorklist(Load.getNode()); // Add users too + // Make sure to revisit this node to clean it up; it will usually be dead. 
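The constant-index path in ReplaceExtractVectorEltOfLoadWithNarrowedLoad above computes the byte offset of the extracted lane and, on big-endian targets, mirrors it within the vector. A standalone sketch of that arithmetic (illustrative helper, not LLVM's API):

    #include <cstdint>

    // Byte offset of lane Elt for EltBits-bit elements in a vector of
    // VecBits total bits, mirroring the computation in the hunk above.
    static uint64_t extractLaneByteOffset(uint64_t EltBits, uint64_t VecBits,
                                          uint64_t Elt, bool BigEndian) {
      uint64_t Off = EltBits * Elt / 8;
      if (BigEndian)
        Off = VecBits / 8 - Off; // flip for big-endian lane numbering
      return Off;
    }
    // e.g. lane 2 of a v4i32 on a little-endian target sits
    // 32 * 2 / 8 == 8 bytes past the load's base pointer.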
+ AddToWorklist(EVE); + ++OpsNarrowed; + return SDValue(EVE, 0); +} + SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); @@ -9859,6 +10626,39 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } } + bool BCNumEltsChanged = false; + EVT ExtVT = VT.getVectorElementType(); + EVT LVT = ExtVT; + + // If the result of load has to be truncated, then it's not necessarily + // profitable. + if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) + return SDValue(); + + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + + EVT BCVT = InVec.getOperand(0).getValueType(); + if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) + return SDValue(); + if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) + BCNumEltsChanged = true; + InVec = InVec.getOperand(0); + ExtVT = BCVT.getVectorElementType(); + } + + // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) + if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && + ISD::isNormalLoad(InVec.getNode()) && + !N->getOperand(1)->hasPredecessor(InVec.getNode())) { + SDValue Index = N->getOperand(1); + if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, + OrigLoad); + } + // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); @@ -9869,30 +10669,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ConstEltNo) { int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - bool NewLoad = false; - bool BCNumEltsChanged = false; - EVT ExtVT = VT.getVectorElementType(); - EVT LVT = ExtVT; - - // If the result of load has to be truncated, then it's not necessarily - // profitable. - if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) - return SDValue(); - - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) - return SDValue(); - if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) - BCNumEltsChanged = true; - InVec = InVec.getOperand(0); - ExtVT = BCVT.getVectorElementType(); - NewLoad = true; - } LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; @@ -9935,6 +10711,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast<LoadSDNode>(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; + EltNo = DAG.getConstant(Elt, EltNo.getValueType()); } } @@ -9947,72 +10724,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { if (Elt == -1) return DAG.getUNDEF(LVT); - unsigned Align = LN0->getAlignment(); - if (NewLoad) { - // Check the resultant load doesn't need a higher alignment than the - // original load. 
- unsigned NewAlign = - TLI.getDataLayout() - ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT)) - return SDValue(); - - Align = NewAlign; - } - - SDValue NewPtr = LN0->getBasePtr(); - unsigned PtrOff = 0; - - if (Elt) { - PtrOff = LVT.getSizeInBits() * Elt / 8; - EVT PtrType = NewPtr.getValueType(); - if (TLI.isBigEndian()) - PtrOff = VT.getSizeInBits() / 8 - PtrOff; - NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr, - DAG.getConstant(PtrOff, PtrType)); - } - - // The replacement we need to do here is a little tricky: we need to - // replace an extractelement of a load with a load. - // Use ReplaceAllUsesOfValuesWith to do the replacement. - // Note that this replacement assumes that the extractvalue is the only - // use of the load; that's okay because we don't want to perform this - // transformation in other cases anyway. - SDValue Load; - SDValue Chain; - if (NVT.bitsGT(LVT)) { - // If the result type of vextract is wider than the load, then issue an - // extending load instead. - ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT) - ? ISD::ZEXTLOAD : ISD::EXTLOAD; - Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(), - NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), - LVT, LN0->isVolatile(), LN0->isNonTemporal(), - Align, LN0->getTBAAInfo()); - Chain = Load.getValue(1); - } else { - Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr, - LN0->getPointerInfo().getWithOffset(PtrOff), - LN0->isVolatile(), LN0->isNonTemporal(), - LN0->isInvariant(), Align, LN0->getTBAAInfo()); - Chain = Load.getValue(1); - if (NVT.bitsLT(LVT)) - Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load); - else - Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load); - } - WorklistRemover DeadNodes(*this); - SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) }; - SDValue To[] = { Load, Chain }; - DAG.ReplaceAllUsesOfValuesWith(From, To, 2); - // Since we're explcitly calling ReplaceAllUses, add the new node to the - // worklist explicitly as well. - AddToWorklist(Load.getNode()); - AddUsersToWorklist(Load.getNode()); // Add users too - // Make sure to revisit this node to clean it up; it will usually be dead. - AddToWorklist(N); - return SDValue(N, 0); + return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); } return SDValue(); @@ -10215,32 +10927,46 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from // at most two distinct vectors, turn this into a shuffle node. + // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. + if (!isTypeLegal(VT)) + return SDValue(); + // May only combine to shuffle after legalize if shuffle is legal. - if (LegalOperations && - !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT)) + if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) return SDValue(); SDValue VecIn1, VecIn2; + bool UsesZeroVector = false; for (unsigned i = 0; i != NumInScalars; ++i) { + SDValue Op = N->getOperand(i); // Ignore undef inputs. - if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (Op.getOpcode() == ISD::UNDEF) continue; + + // See if we can combine this build_vector into a blend with a zero vector. 
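For the blend-with-zero path introduced in this hunk, zero operands select their lane from a zero vector placed in the second shuffle input. Assuming, for illustration only, that every non-zero operand extracts lane i of the same source vector, the combined mask comes out as below (hypothetical helper, not LLVM's API):

    #include <vector>

    // build_vector(e0, 0, e2, 0) with e[i] = extractelt(V, i)
    //   -> vector_shuffle(V, zero_vector, <0, 5, 2, 7>)
    // Zero operands pick lane NumInScalars + i, as in the hunk above.
    static std::vector<int> blendWithZeroMask(const std::vector<bool> &IsZero) {
      const int N = static_cast<int>(IsZero.size());
      std::vector<int> Mask;
      for (int i = 0; i != N; ++i)
        Mask.push_back(IsZero[i] ? N + i : i);
      return Mask;
    }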
+ if (!VecIn2.getNode() && ((Op.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(Op.getNode())->isNullValue()) || + (Op.getOpcode() == ISD::ConstantFP && + cast<ConstantFPSDNode>(Op.getNode())->getValueAPF().isZero()))) { + UsesZeroVector = true; + continue; + } // If this input is something other than a EXTRACT_VECTOR_ELT with a // constant index, bail out. - if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) { + if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op.getOperand(1))) { VecIn1 = VecIn2 = SDValue(nullptr, 0); break; } // We allow up to two distinct input vectors. - SDValue ExtractedFromVec = N->getOperand(i).getOperand(0); + SDValue ExtractedFromVec = Op.getOperand(0); if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2) continue; if (!VecIn1.getNode()) { VecIn1 = ExtractedFromVec; - } else if (!VecIn2.getNode()) { + } else if (!VecIn2.getNode() && !UsesZeroVector) { VecIn2 = ExtractedFromVec; } else { // Too many inputs. @@ -10251,55 +10977,93 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // If everything is good, we can make a shuffle operation. if (VecIn1.getNode()) { + unsigned InNumElements = VecIn1.getValueType().getVectorNumElements(); SmallVector<int, 8> Mask; for (unsigned i = 0; i != NumInScalars; ++i) { - if (N->getOperand(i).getOpcode() == ISD::UNDEF) { + unsigned Opcode = N->getOperand(i).getOpcode(); + if (Opcode == ISD::UNDEF) { Mask.push_back(-1); continue; } + // Operands can also be zero. + if (Opcode != ISD::EXTRACT_VECTOR_ELT) { + assert(UsesZeroVector && + (Opcode == ISD::Constant || Opcode == ISD::ConstantFP) && + "Unexpected node found!"); + Mask.push_back(NumInScalars+i); + continue; + } + // If extracting from the first vector, just use the index directly. SDValue Extract = N->getOperand(i); SDValue ExtVal = Extract.getOperand(1); + unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue(); if (Extract.getOperand(0) == VecIn1) { - unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue(); - if (ExtIndex > VT.getVectorNumElements()) - return SDValue(); - Mask.push_back(ExtIndex); continue; } - // Otherwise, use InIdx + VecSize - unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue(); - Mask.push_back(Idx+NumInScalars); + // Otherwise, use InIdx + InputVecSize + Mask.push_back(InNumElements + ExtIndex); } + // Avoid introducing illegal shuffles with zero. + if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT)) + return SDValue(); + // We can't generate a shuffle node with mismatched input and output types. // Attempt to transform a single input vector to the correct type. if ((VT != VecIn1.getValueType())) { - // We don't support shuffeling between TWO values of different types. - if (VecIn2.getNode()) + // If the input vector type has a different base type to the output + // vector type, bail out. + EVT VTElemType = VT.getVectorElementType(); + if ((VecIn1.getValueType().getVectorElementType() != VTElemType) || + (VecIn2.getNode() && + (VecIn2.getValueType().getVectorElementType() != VTElemType))) return SDValue(); + // If the input vector is too small, widen it. // We only support widening of vectors which are half the size of the // output registers. For example XMM->YMM widening on X86 with AVX. 
- if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits()) - return SDValue(); + EVT VecInT = VecIn1.getValueType(); + if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) { + // If we only have one small input, widen it by adding undef values. + if (!VecIn2.getNode()) + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, + DAG.getUNDEF(VecIn1.getValueType())); + else if (VecIn1.getValueType() == VecIn2.getValueType()) { + // If we have two small inputs of the same type, try to concat them. + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2); + VecIn2 = SDValue(nullptr, 0); + } else + return SDValue(); + } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) { + // If the input vector is too large, try to split it. + // We don't support having two input vectors that are too large. + if (VecIn2.getNode()) + return SDValue(); - // If the input vector type has a different base type to the output - // vector type, bail out. - if (VecIn1.getValueType().getVectorElementType() != - VT.getVectorElementType()) + if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements())) + return SDValue(); + + // Try to replace VecIn1 with two extract_subvectors + // No need to update the masks, they should still be correct. + VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, + DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy())); + VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, + DAG.getConstant(0, TLI.getVectorIdxTy())); + UsesZeroVector = false; + } else return SDValue(); - - // Widen the input vector by adding undef values. - VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - VecIn1, DAG.getUNDEF(VecIn1.getValueType())); } - // If VecIn2 is unused then change it to undef. - VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + if (UsesZeroVector) + VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) : + DAG.getConstantFP(0.0, VT); + else + // If VecIn2 is unused then change it to undef. + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); // Check that we were able to transform all incoming values to the same // type. @@ -10307,10 +11071,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { VecIn1.getValueType() != VT) return SDValue(); - // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. - if (!isTypeLegal(VT)) - return SDValue(); - // Return the new VECTOR_SHUFFLE node. SDValue Ops[2]; Ops[0] = VecIn1; @@ -10501,6 +11261,92 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { return SDValue(); } +static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements, + SDValue V, SelectionDAG &DAG) { + SDLoc DL(V); + EVT VT = V.getValueType(); + + switch (V.getOpcode()) { + default: + return V; + + case ISD::CONCAT_VECTORS: { + EVT OpVT = V->getOperand(0).getValueType(); + int OpSize = OpVT.getVectorNumElements(); + SmallBitVector OpUsedElements(OpSize, false); + bool FoundSimplification = false; + SmallVector<SDValue, 4> NewOps; + NewOps.reserve(V->getNumOperands()); + for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) { + SDValue Op = V->getOperand(i); + bool OpUsed = false; + for (int j = 0; j < OpSize; ++j) + if (UsedElements[i * OpSize + j]) { + OpUsedElements[j] = true; + OpUsed = true; + } + NewOps.push_back( + OpUsed ? 
simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG) + : DAG.getUNDEF(OpVT)); + FoundSimplification |= Op == NewOps.back(); + OpUsedElements.reset(); + } + if (FoundSimplification) + V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps); + return V; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue BaseV = V->getOperand(0); + SDValue SubV = V->getOperand(1); + auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2)); + if (!IdxN) + return V; + + int SubSize = SubV.getValueType().getVectorNumElements(); + int Idx = IdxN->getZExtValue(); + bool SubVectorUsed = false; + SmallBitVector SubUsedElements(SubSize, false); + for (int i = 0; i < SubSize; ++i) + if (UsedElements[i + Idx]) { + SubVectorUsed = true; + SubUsedElements[i] = true; + UsedElements[i + Idx] = false; + } + + // Now recurse on both the base and sub vectors. + SDValue SimplifiedSubV = + SubVectorUsed + ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG) + : DAG.getUNDEF(SubV.getValueType()); + SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG); + if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV) + V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + SimplifiedBaseV, SimplifiedSubV, V->getOperand(2)); + return V; + } + } +} + +static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, + SDValue N1, SelectionDAG &DAG) { + EVT VT = SVN->getValueType(0); + int NumElts = VT.getVectorNumElements(); + SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false); + for (int M : SVN->getMask()) + if (M >= 0 && M < NumElts) + N0UsedElements[M] = true; + else if (M >= NumElts) + N1UsedElements[M - NumElts] = true; + + SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG); + SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG); + if (S0 == N0 && S1 == N1) + return SDValue(); + + return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); +} + // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -10653,6 +11499,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } + // There are various patterns used to build up a vector from smaller vectors, + // subvectors, or elements. Scan chains of these and replace unused insertions + // or components with undef. + if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) + return S; + if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.getOpcode() == ISD::UNDEF || @@ -10664,99 +11516,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return V; } - // If this shuffle node is simply a swizzle of another shuffle node, - // then try to simplify it. - if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - N1.getOpcode() == ISD::UNDEF) { - - ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); - - // The incoming shuffle must be of the same type as the result of the - // current shuffle. - assert(OtherSV->getOperand(0).getValueType() == VT && - "Shuffle types don't match"); - - SmallVector<int, 4> Mask; - // Compute the combined shuffle mask. - for (unsigned i = 0; i != NumElts; ++i) { - int Idx = SVN->getMaskElt(i); - assert(Idx < (int)NumElts && "Index references undef operand"); - // Next, this index comes from the first value, which is the incoming - // shuffle. Adopt the incoming index. 
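Both the swizzle combine being removed here and its generalized replacement further down compose two shuffle masks: an outer lane that refers to the inner shuffle's result adopts the inner mask's entry. A standalone sketch for the simple case the old code handled, where the outer shuffle's second operand is undef so every defined outer index points into the inner shuffle (invented helper, not LLVM's API):

    #include <vector>

    // NewMask[i] = Inner[Outer[i]]; undef lanes (-1) propagate unchanged.
    static std::vector<int> composeMasks(const std::vector<int> &Outer,
                                         const std::vector<int> &Inner) {
      std::vector<int> Mask;
      for (int Idx : Outer)
        Mask.push_back(Idx < 0 ? -1 : Inner[Idx]);
      return Mask;
    }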
- if (Idx >= 0) - Idx = OtherSV->getMaskElt(Idx); - Mask.push_back(Idx); - } - - bool CommuteOperands = false; - if (N0.getOperand(1).getOpcode() != ISD::UNDEF) { - // To be valid, the combine shuffle mask should only reference elements - // from one of the two vectors in input to the inner shufflevector. - bool IsValidMask = true; - for (unsigned i = 0; i != NumElts && IsValidMask; ++i) - // See if the combined mask only reference undefs or elements coming - // from the first shufflevector operand. - IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts; - - if (!IsValidMask) { - IsValidMask = true; - for (unsigned i = 0; i != NumElts && IsValidMask; ++i) - // Check that all the elements come from the second shuffle operand. - IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts; - CommuteOperands = IsValidMask; - } - - // Early exit if the combined shuffle mask is not valid. - if (!IsValidMask) - return SDValue(); - } - - // See if this pair of shuffles can be safely folded according to either - // of the following rules: - // shuffle(shuffle(x, y), undef) -> x - // shuffle(shuffle(x, undef), undef) -> x - // shuffle(shuffle(x, y), undef) -> y - bool IsIdentityMask = true; - unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0; - for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) { - // Skip Undefs. - if (Mask[i] < 0) - continue; - - // The combined shuffle must map each index to itself. - IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex; - } - - if (IsIdentityMask) { - if (CommuteOperands) - // optimize shuffle(shuffle(x, y), undef) -> y. - return OtherSV->getOperand(1); - - // optimize shuffle(shuffle(x, undef), undef) -> x - // optimize shuffle(shuffle(x, y), undef) -> x - return OtherSV->getOperand(0); - } - - // It may still be beneficial to combine the two shuffles if the - // resulting shuffle is legal. - if (TLI.isTypeLegal(VT) && TLI.isShuffleMaskLegal(Mask, VT)) { - if (!CommuteOperands) - // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). - // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3) - return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, - &Mask[0]); - - // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3) - return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1), - &Mask[0]); - } - } - // Canonicalize shuffles according to rules: // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) - if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::UNDEF && + if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { // The incoming shuffle must be of the same type as the result of the @@ -10775,13 +11539,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // Try to fold according to rules: - // shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // Don't try to fold shuffles with illegal type. 
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && - N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) { + TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); // The incoming shuffle must be of the same type as the result of the @@ -10789,14 +11552,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { assert(OtherSV->getOperand(0).getValueType() == VT && "Shuffle types don't match"); - SDValue SV0 = OtherSV->getOperand(0); - SDValue SV1 = OtherSV->getOperand(1); - bool HasSameOp0 = N1 == SV0; - bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF; - if (!HasSameOp0 && !IsSV1Undef && N1 != SV1) - // Early exit. - return SDValue(); - + SDValue SV0, SV1; SmallVector<int, 4> Mask; // Compute the combined shuffle mask for a shuffle with SV0 as the first // operand, and SV1 as the second operand. @@ -10808,24 +11564,90 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { continue; } + SDValue CurrentVec; if (Idx < (int)NumElts) { + // This shuffle index refers to the inner shuffle N0. Lookup the inner + // shuffle mask to identify which vector is actually referenced. Idx = OtherSV->getMaskElt(Idx); - if (IsSV1Undef && Idx >= (int) NumElts) - Idx = -1; // Propagate Undef. - } else - Idx = HasSameOp0 ? Idx - NumElts : Idx; + if (Idx < 0) { + // Propagate Undef. + Mask.push_back(Idx); + continue; + } + + CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) + : OtherSV->getOperand(1); + } else { + // This shuffle index references an element within N1. + CurrentVec = N1; + } + + // Simple case where 'CurrentVec' is UNDEF. + if (CurrentVec.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + // Canonicalize the shuffle index. We don't know yet if CurrentVec + // will be the first or second operand of the combined shuffle. + Idx = Idx % NumElts; + if (!SV0.getNode() || SV0 == CurrentVec) { + // Ok. CurrentVec is the left hand side. + // Update the mask accordingly. + SV0 = CurrentVec; + Mask.push_back(Idx); + continue; + } + + // Bail out if we cannot convert the shuffle pair into a single shuffle. + if (SV1.getNode() && SV1 != CurrentVec) + return SDValue(); - Mask.push_back(Idx); + // Ok. CurrentVec is the right hand side. + // Update the mask accordingly. + SV1 = CurrentVec; + Mask.push_back(Idx + NumElts); } + // Check if all indices in Mask are Undef. In case, propagate Undef. + bool isUndefMask = true; + for (unsigned i = 0; i != NumElts && isUndefMask; ++i) + isUndefMask &= Mask[i] < 0; + + if (isUndefMask) + return DAG.getUNDEF(VT); + + if (!SV0.getNode()) + SV0 = DAG.getUNDEF(VT); + if (!SV1.getNode()) + SV1 = DAG.getUNDEF(VT); + // Avoid introducing shuffles with illegal mask. - if (TLI.isShuffleMaskLegal(Mask, VT)) { - if (IsSV1Undef) - // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2) - // shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2) - return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]); - return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); + if (!TLI.isShuffleMaskLegal(Mask, VT)) { + // Compute the commuted shuffle mask and test again. 
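Commuting the mask, as this fallback does before retrying legality, simply swaps the roles of the two shuffle inputs: every defined index moves to the other half of the index space. A standalone sketch (not LLVM's API):

    #include <vector>

    static void commuteShuffleMask(std::vector<int> &Mask, int NumElts) {
      for (int &Idx : Mask) {
        if (Idx < 0)
          continue; // undef lanes stay undef
        Idx = Idx < NumElts ? Idx + NumElts : Idx - NumElts;
      }
    }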
+ for (unsigned i = 0; i != NumElts; ++i) { + int idx = Mask[i]; + if (idx < 0) + continue; + else if (idx < (int)NumElts) + Mask[i] = idx + NumElts; + else + Mask[i] = idx - NumElts; + } + + if (!TLI.isShuffleMaskLegal(Mask, VT)) + return SDValue(); + + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) + std::swap(SV0, SV1); } + + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) + // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) + return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]); } return SDValue(); @@ -10858,8 +11680,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { return SDValue(); } -/// XformToShuffleWithZero - Returns a vector_shuffle if it able to transform -/// an AND to a vector_shuffle with the destination vector and a zero vector. +/// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle +/// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> /// vector_shuffle V, Zero, <0, 4, 2, 4> SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { @@ -10881,7 +11703,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { if (cast<ConstantSDNode>(Elt)->isAllOnesValue()) Indices.push_back(i); else if (cast<ConstantSDNode>(Elt)->isNullValue()) - Indices.push_back(NumElts); + Indices.push_back(NumElts+i); else return SDValue(); } @@ -10905,7 +11727,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { return SDValue(); } -/// SimplifyVBinOp - Visit a binary vector operation, like ADD. +/// Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVBinOp only works on vectors!"); @@ -10991,7 +11813,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { return SDValue(); } -/// SimplifyVUnaryOp - Visit a binary vector operation, like FABS/FNEG. +/// Visit a binary vector operation, like FABS/FNEG. SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVUnaryOp only works on vectors!"); @@ -11042,8 +11864,8 @@ SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0, SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4)); AddToWorklist(SETCC.getNode()); - return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), - SCC.getOperand(2), SCC.getOperand(3), SETCC); + return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, + SCC.getOperand(2), SCC.getOperand(3)); } return SCC; @@ -11051,12 +11873,11 @@ SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0, return SDValue(); } -/// SimplifySelectOps - Given a SELECT or a SELECT_CC node, where LHS and RHS -/// are the two values being selected between, see if we can simplify the -/// select. Callers of this should assume that TheSelect is deleted if this -/// returns true. As such, they should return the appropriate thing (e.g. the -/// node) back to the top-level of the DAG combiner loop to avoid it being -/// looked at. +/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values +/// being selected between, see if we can simplify the select. Callers of this +/// should assume that TheSelect is deleted if this returns true. As such, they +/// should return the appropriate thing (e.g. the node) back to the top-level of +/// the DAG combiner loop to avoid it being looked at. 
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, SDValue RHS) { @@ -11135,22 +11956,27 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, } SDValue Load; + // It is safe to replace the two loads if they have different alignments, + // but the new load must be the minimum (most restrictive) alignment of the + // inputs. + bool isInvariant = LLD->isInvariant() & RLD->isInvariant(); + unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); if (LLD->getExtensionType() == ISD::NON_EXTLOAD) { Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect), - // FIXME: Discards pointer and TBAA info. + // FIXME: Discards pointer and AA info. LLD->getChain(), Addr, MachinePointerInfo(), LLD->isVolatile(), LLD->isNonTemporal(), - LLD->isInvariant(), LLD->getAlignment()); + isInvariant, Alignment); } else { Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType() : LLD->getExtensionType(), SDLoc(TheSelect), TheSelect->getValueType(0), - // FIXME: Discards pointer and TBAA info. + // FIXME: Discards pointer and AA info. LLD->getChain(), Addr, MachinePointerInfo(), LLD->getMemoryVT(), LLD->isVolatile(), - LLD->isNonTemporal(), LLD->getAlignment()); + LLD->isNonTemporal(), isInvariant, Alignment); } // Users of the select now use the result of the load. @@ -11166,7 +11992,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, return false; } -/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3 +/// Simplify an expression of the form (N0 cond N1) ? N2 : N3 /// where 'cond' is the comparison specified by CC. SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, @@ -11458,7 +12284,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, return SDValue(); } -/// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC. +/// This is a stub for TargetLowering::SimplifySetCC. SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, SDLoc DL, bool foldBooleans) { @@ -11467,10 +12293,10 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); } -/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant, -/// return a DAG expression to select that will generate the same value by -/// multiplying by a magic number. See: -/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html> +/// Given an ISD::SDIV node expressing a divide by constant, return +/// a DAG expression to select that will generate the same value by multiplying +/// by a magic number. +/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildSDIV(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) @@ -11489,10 +12315,29 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) { return S; } -/// BuildUDIV - Given an ISD::UDIV node expressing a divide by constant, -/// return a DAG expression to select that will generate the same value by -/// multiplying by a magic number. See: -/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html> +/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a +/// DAG expression that will generate the same value by right shifting. +SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { + ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); + if (!C) + return SDValue(); + + // Avoid division by zero. 
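The kind of shift sequence a target's BuildSDIVPow2 hook typically emits is easy to check in scalar code: signed division by 2^K truncates toward zero, so negative dividends need a bias of 2^K - 1 before the arithmetic right shift. A standalone sketch, assuming an arithmetic right shift of signed values (true on mainstream compilers) and K in [1, 31]:

    #include <cstdint>

    static int32_t sdivPow2(int32_t X, unsigned K) {
      int32_t Sign = X >> 31;                     // 0 or -1
      uint32_t Bias = uint32_t(Sign) >> (32 - K); // 0 or 2^K - 1
      return (X + int32_t(Bias)) >> K;            // rounds toward zero
    }
    // e.g. sdivPow2(-7, 1) == -3, matching C's -7 / 2.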
+ if (!C->getAPIntValue()) + return SDValue(); + + std::vector<SDNode *> Built; + SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, &Built); + + for (SDNode *N : Built) + AddToWorklist(N); + return S; +} + +/// Given an ISD::UDIV node expressing a divide by constant, return a DAG +/// expression that will generate the same value by multiplying by a magic +/// number. +/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildUDIV(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) @@ -11511,9 +12356,141 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return S; } -/// FindBaseOffset - Return true if base is a frame index, which is known not -// to alias with anything but itself. Provides base object and offset as -// results. +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { + if (Level >= AfterLegalizeDAG) + return SDValue(); + + // Expose the DAG combiner to the target combiner implementations. + TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); + + unsigned Iterations = 0; + if (SDValue Est = TLI.getRecipEstimate(Op, DCI, Iterations)) { + if (Iterations) { + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal, we need to find the zero of the function: + // F(X) = A X - 1 [which has a zero at X = 1/A] + // => + // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form + // does not require additional intermediate precision] + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue FPOne = DAG.getConstantFP(1.0, VT); + + AddToWorklist(Est.getNode()); + + // Newton iterations: Est = Est + Est (1 - Arg * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst); + AddToWorklist(Est.getNode()); + } + } + return Est; + } + + return SDValue(); +} + +/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) +/// For the reciprocal sqrt, we need to find the zero of the function: +/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] +/// => +/// X_{i+1} = X_i (1.5 - A X_i^2 / 2) +/// As a result, we precompute A/2 prior to the iteration loop. +SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, + unsigned Iterations) { + EVT VT = Arg.getValueType(); + SDLoc DL(Arg); + SDValue ThreeHalves = DAG.getConstantFP(1.5, VT); + + // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that + // this entire sequence requires only one FP constant. 
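The Newton-Raphson recurrences above are easy to sanity-check in scalar code; each step of the reciprocal iteration Est = Est + Est * (1 - Arg * Est) roughly doubles the number of correct bits. A minimal standalone demo (plain float math, not DAG nodes):

    #include <cstdio>

    static float refineRecip(float Arg, float Est, unsigned Iterations) {
      for (unsigned i = 0; i < Iterations; ++i)
        Est = Est + Est * (1.0f - Arg * Est); // one Newton step
      return Est;
    }

    int main() {
      // A rough hardware-style seed for 1/3 converges in a few steps:
      std::printf("%f\n", refineRecip(3.0f, 0.3f, 3)); // ~0.333333
      return 0;
    }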
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg); + AddToWorklist(HalfArg.getNode()); + + HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg); + AddToWorklist(HalfArg.getNode()); + + // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); + AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst); + AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + AddToWorklist(Est.getNode()); + } + return Est; +} + +/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) +/// For the reciprocal sqrt, we need to find the zero of the function: +/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] +/// => +/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) +SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, + unsigned Iterations) { + EVT VT = Arg.getValueType(); + SDLoc DL(Arg); + SDValue MinusThree = DAG.getConstantFP(-3.0, VT); + SDValue MinusHalf = DAG.getConstantFP(-0.5, VT); + + // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est) + for (unsigned i = 0; i < Iterations; ++i) { + SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); + AddToWorklist(HalfEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + AddToWorklist(Est.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg); + AddToWorklist(Est.getNode()); + + Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); + AddToWorklist(Est.getNode()); + + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); + AddToWorklist(Est.getNode()); + } + return Est; +} + +SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { + if (Level >= AfterLegalizeDAG) + return SDValue(); + + // Expose the DAG combiner to the target combiner implementations. + TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this); + unsigned Iterations = 0; + bool UseOneConstNR = false; + if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) { + AddToWorklist(Est.getNode()); + if (Iterations) { + Est = UseOneConstNR ? + BuildRsqrtNROneConst(Op, Est, Iterations) : + BuildRsqrtNRTwoConst(Op, Est, Iterations); + } + return Est; + } + + return SDValue(); +} + +/// Return true if base is a frame index, which is known not to alias with +/// anything but itself. Provides base object and offset as results. static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, const GlobalValue *&GV, const void *&CV) { // Assume it is a primitive operation. @@ -11549,8 +12526,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, return isa<FrameIndexSDNode>(Base); } -/// isAlias - Return true if there is any possibility that the two addresses -/// overlap. +/// Return true if there is any possibility that the two addresses overlap. bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. if (Op0->getBasePtr() == Op1->getBasePtr()) return true; @@ -11609,8 +12585,9 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { return false; } - bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA : - TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA(); + bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 + ? 
CombinerGlobalAA + : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) @@ -11628,10 +12605,10 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { AliasAnalysis::AliasResult AAResult = AA.alias(AliasAnalysis::Location(Op0->getMemOperand()->getValue(), Overlap1, - UseTBAA ? Op0->getTBAAInfo() : nullptr), + UseTBAA ? Op0->getAAInfo() : AAMDNodes()), AliasAnalysis::Location(Op1->getMemOperand()->getValue(), Overlap2, - UseTBAA ? Op1->getTBAAInfo() : nullptr)); + UseTBAA ? Op1->getAAInfo() : AAMDNodes())); if (AAResult == AliasAnalysis::NoAlias) return false; } @@ -11640,7 +12617,7 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { return true; } -/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes, +/// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl<SDValue> &Aliases) { @@ -11676,7 +12653,7 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, } // Don't bother if we've been before. - if (!Visited.insert(Chain.getNode())) + if (!Visited.insert(Chain.getNode()).second) continue; switch (Chain.getOpcode()) { @@ -11751,10 +12728,9 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, // like register copies will interfere with trivial cases). SmallVector<const SDNode *, 16> Worklist; - for (SmallPtrSet<SDNode *, 16>::iterator I = Visited.begin(), - IE = Visited.end(); I != IE; ++I) - if (*I != OriginalChain.getNode()) - Worklist.push_back(*I); + for (const SDNode *N : Visited) + if (N != OriginalChain.getNode()) + Worklist.push_back(N); while (!Worklist.empty()) { const SDNode *M = Worklist.pop_back_val(); @@ -11765,7 +12741,8 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, for (SDNode::use_iterator UI = M->use_begin(), UIE = M->use_end(); UI != UIE; ++UI) - if (UI.getUse().getValueType() == MVT::Other && Visited.insert(*UI)) { + if (UI.getUse().getValueType() == MVT::Other && + Visited.insert(*UI).second) { if (isa<MemIntrinsicSDNode>(*UI) || isa<MemSDNode>(*UI)) { // We've not visited this use, and we care about it (it could have an // ordering dependency with the original node). @@ -11781,8 +12758,8 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, } } -/// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, looking -/// for a better chain (aliasing node.) +/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain +/// (aliasing node.) SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { SmallVector<SDValue, 8> Aliases; // Ops for replacing token factor. @@ -11801,11 +12778,9 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } -// SelectionDAG::Combine - This is the entry point for the file. -// +/// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, CodeGenOpt::Level OptLevel) { - /// run - This is the main entry point to this class. - /// + /// This is the main entry point to this class. 
DAGCombiner(*this, AA, OptLevel).Run(Level); } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index ad75e916cefa..97fed230c536 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -40,12 +40,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/FastISel.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -65,34 +65,32 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "isel" STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by " - "target-independent selector"); + "target-independent selector"); STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " - "target-specific selector"); + "target-specific selector"); STATISTIC(NumFastIselDead, "Number of dead insts removed on failure"); -/// \brief Set CallLoweringInfo attribute flags based on a call instruction -/// and called function attributes. void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS, unsigned AttrIdx) { - isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); - isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); - isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); - isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); - isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); - isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); - isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); - isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); - Alignment = CS->getParamAlignment(AttrIdx); -} - -/// startNewBlock - Set the current block to which generated machine -/// instructions will be appended, and clear the local CSE map. -/// + IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt); + IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt); + IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg); + IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet); + IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest); + IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal); + IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca); + IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned); + Alignment = CS->getParamAlignment(AttrIdx); +} + +/// Set the current block to which generated machine instructions will be +/// appended, and clear the local CSE map. void FastISel::startNewBlock() { LocalValueMap.clear(); @@ -105,18 +103,19 @@ void FastISel::startNewBlock() { LastLocalValue = EmitStartPt; } -bool FastISel::LowerArguments() { +bool FastISel::lowerArguments() { if (!FuncInfo.CanLowerReturn) // Fallback to SDISel argument lowering code to deal with sret pointer // parameter. return false; - if (!FastLowerArguments()) + if (!fastLowerArguments()) return false; // Enter arguments into ValueMap for uses in non-entry BBs. 
for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(), - E = FuncInfo.Fn->arg_end(); I != E; ++I) { + E = FuncInfo.Fn->arg_end(); + I != E; ++I) { DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I); assert(VI != LocalValueMap.end() && "Missed an argument?"); FuncInfo.ValueMap[I] = VI->second; @@ -128,22 +127,30 @@ void FastISel::flushLocalValueMap() { LocalValueMap.clear(); LastLocalValue = EmitStartPt; recomputeInsertPt(); + SavedInsertPt = FuncInfo.InsertPt; } -bool FastISel::hasTrivialKill(const Value *V) const { +bool FastISel::hasTrivialKill(const Value *V) { // Don't consider constants or arguments to have trivial kills. const Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; // No-op casts are trivially coalesced by fast-isel. - if (const CastInst *Cast = dyn_cast<CastInst>(I)) + if (const auto *Cast = dyn_cast<CastInst>(I)) if (Cast->isNoopCast(DL.getIntPtrType(Cast->getContext())) && !hasTrivialKill(Cast->getOperand(0))) return false; + // Even the value might have only one use in the LLVM IR, it is possible that + // FastISel might fold the use into another instruction and now there is more + // than one use at the Machine Instruction level. + unsigned Reg = lookUpRegForValue(V); + if (Reg && !MRI.use_empty(Reg)) + return false; + // GEPs with all zero indices are trivially coalesced by fast-isel. - if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) + if (const auto *GEP = dyn_cast<GetElementPtrInst>(I)) if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0))) return false; @@ -176,7 +183,7 @@ unsigned FastISel::getRegForValue(const Value *V) { // Look up the value to see if we already have a register for it. unsigned Reg = lookUpRegForValue(V); - if (Reg != 0) + if (Reg) return Reg; // In bottom-up mode, just create the virtual register which will be used @@ -197,29 +204,24 @@ unsigned FastISel::getRegForValue(const Value *V) { return Reg; } -/// materializeRegForValue - Helper for getRegForValue. This function is -/// called when the value isn't already available in a register and must -/// be materialized with new instructions. -unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { +unsigned FastISel::materializeConstant(const Value *V, MVT VT) { unsigned Reg = 0; - - if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { + if (const auto *CI = dyn_cast<ConstantInt>(V)) { if (CI->getValue().getActiveBits() <= 64) - Reg = FastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()); - } else if (isa<AllocaInst>(V)) { - Reg = TargetMaterializeAlloca(cast<AllocaInst>(V)); - } else if (isa<ConstantPointerNull>(V)) { + Reg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()); + } else if (isa<AllocaInst>(V)) + Reg = fastMaterializeAlloca(cast<AllocaInst>(V)); + else if (isa<ConstantPointerNull>(V)) // Translate this as an integer zero so that it can be // local-CSE'd with actual integer zeros. - Reg = - getRegForValue(Constant::getNullValue(DL.getIntPtrType(V->getContext()))); - } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) { - if (CF->isNullValue()) { - Reg = TargetMaterializeFloatZero(CF); - } else { + Reg = getRegForValue( + Constant::getNullValue(DL.getIntPtrType(V->getContext()))); + else if (const auto *CF = dyn_cast<ConstantFP>(V)) { + if (CF->isNullValue()) + Reg = fastMaterializeFloatZero(CF); + else // Try to emit the constant directly. 
- Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF); - } + Reg = fastEmit_f(VT, VT, ISD::ConstantFP, CF); if (!Reg) { // Try to emit the constant by using an integer constant with a cast. @@ -229,22 +231,22 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { uint64_t x[2]; uint32_t IntBitWidth = IntVT.getSizeInBits(); bool isExact; - (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, - APFloat::rmTowardZero, &isExact); + (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, + APFloat::rmTowardZero, &isExact); if (isExact) { APInt IntVal(IntBitWidth, x); unsigned IntegerReg = - getRegForValue(ConstantInt::get(V->getContext(), IntVal)); + getRegForValue(ConstantInt::get(V->getContext(), IntVal)); if (IntegerReg != 0) - Reg = FastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, - IntegerReg, /*Kill=*/false); + Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg, + /*Kill=*/false); } } - } else if (const Operator *Op = dyn_cast<Operator>(V)) { - if (!SelectOperator(Op, Op->getOpcode())) + } else if (const auto *Op = dyn_cast<Operator>(V)) { + if (!selectOperator(Op, Op->getOpcode())) if (!isa<Instruction>(Op) || - !TargetSelectInstruction(cast<Instruction>(Op))) + !fastSelectInstruction(cast<Instruction>(Op))) return 0; Reg = lookUpRegForValue(Op); } else if (isa<UndefValue>(V)) { @@ -252,15 +254,26 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), Reg); } + return Reg; +} + +/// Helper for getRegForValue. This function is called when the value isn't +/// already available in a register and must be materialized with new +/// instructions. +unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { + unsigned Reg = 0; + // Give the target-specific code a try first. + if (isa<Constant>(V)) + Reg = fastMaterializeConstant(cast<Constant>(V)); - // If target-independent code couldn't handle the value, give target-specific - // code a try. - if (!Reg && isa<Constant>(V)) - Reg = TargetMaterializeConstant(cast<Constant>(V)); + // If target-specific code couldn't or didn't want to handle the value, then + // give target-independent code a try. + if (!Reg) + Reg = materializeConstant(V, VT); // Don't cache constant materializations in the general ValueMap. // To do so would require tracking what uses they dominate. - if (Reg != 0) { + if (Reg) { LocalValueMap[V] = Reg; LastLocalValue = MRI.getVRegDef(Reg); } @@ -278,13 +291,7 @@ unsigned FastISel::lookUpRegForValue(const Value *V) { return LocalValueMap[V]; } -/// UpdateValueMap - Update the value map to include the new mapping for this -/// instruction, or insert an extra copy to get the result in a previous -/// determined register. -/// NOTE: This is only necessary because we might select a block that uses -/// a value before we select the block that defines the value. It might be -/// possible to fix this by selecting blocks in reverse postorder. -void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) { +void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) { if (!isa<Instruction>(I)) { LocalValueMap[I] = Reg; return; @@ -297,7 +304,7 @@ void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) { else if (Reg != AssignedReg) { // Arrange for uses of AssignedReg to be replaced by uses of Reg. 
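// Aside: a minimal standalone sketch (plain C++, not the APFloat API; the
// helper name exactToInt64 is made up) of the exactness test used in the
// materializeConstant path above. A floating-point constant may be lowered
// as "integer constant + SINT_TO_FP" only when conversion toward zero is
// exact, i.e. when the round trip reproduces the original value.
#include <cmath>
#include <cstdint>

static bool exactToInt64(double D, int64_t &Out) {
  double T = std::trunc(D);               // APFloat::rmTowardZero analogue
  if (T != D)                             // fractional bits would be lost
    return false;
  if (T < -9223372036854775808.0 || T >= 9223372036854775808.0)
    return false;                         // outside the int64_t range
  Out = static_cast<int64_t>(T);
  return static_cast<double>(Out) == D;   // round trip must be exact
}
// exactToInt64(42.0, I) succeeds, so 42.0 can take the integer+cast path;
// exactToInt64(0.5, I) fails, so 0.5 must be materialized directly as an
// FP constant.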
for (unsigned i = 0; i < NumRegs; i++) - FuncInfo.RegFixups[AssignedReg+i] = Reg+i; + FuncInfo.RegFixups[AssignedReg + i] = Reg + i; AssignedReg = Reg; } @@ -315,13 +322,12 @@ std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) { MVT PtrVT = TLI.getPointerTy(); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { - IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, - IdxN, IdxNIsKill); + IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN, + IdxNIsKill); IdxNIsKill = true; - } - else if (IdxVT.bitsGT(PtrVT)) { - IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, - IdxN, IdxNIsKill); + } else if (IdxVT.bitsGT(PtrVT)) { + IdxN = + fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, IdxN, IdxNIsKill); IdxNIsKill = true; } return std::pair<unsigned, bool>(IdxN, IdxNIsKill); @@ -343,7 +349,7 @@ void FastISel::recomputeInsertPt() { void FastISel::removeDeadCode(MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) { - assert (I && E && std::distance(I, E) > 0 && "Invalid iterator!"); + assert(I && E && std::distance(I, E) > 0 && "Invalid iterator!"); while (I != E) { MachineInstr *Dead = &*I; ++I; @@ -358,7 +364,7 @@ FastISel::SavePoint FastISel::enterLocalValueArea() { DebugLoc OldDL = DbgLoc; recomputeInsertPt(); DbgLoc = DebugLoc(); - SavePoint SP = { OldInsertPt, OldDL }; + SavePoint SP = {OldInsertPt, OldDL}; return SP; } @@ -371,10 +377,7 @@ void FastISel::leaveLocalValueArea(SavePoint OldInsertPt) { DbgLoc = OldInsertPt.DL; } -/// SelectBinaryOp - Select and emit code for a binary operator instruction, -/// which has an opcode which directly corresponds to the given ISD opcode. -/// -bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { +bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) { EVT VT = EVT::getEVT(I->getType(), /*HandleUnknown=*/true); if (VT == MVT::Other || !VT.isSimple()) // Unhandled type. Halt "fast" selection and bail. @@ -387,9 +390,8 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { if (!TLI.isTypeLegal(VT)) { // MVT::i1 is special. Allow AND, OR, or XOR because they // don't require additional zeroing, which makes them easy. - if (VT == MVT::i1 && - (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR || - ISDOpcode == ISD::XOR)) + if (VT == MVT::i1 && (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR || + ISDOpcode == ISD::XOR)) VT = TLI.getTypeToTransformTo(I->getContext(), VT); else return false; @@ -397,38 +399,36 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { // Check if the first operand is a constant, and handle it as "ri". At -O0, // we don't have anything that canonicalizes operand order. - if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(0))) + if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0))) if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) { unsigned Op1 = getRegForValue(I->getOperand(1)); - if (Op1 == 0) return false; - + if (!Op1) + return false; bool Op1IsKill = hasTrivialKill(I->getOperand(1)); - unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, - Op1IsKill, CI->getZExtValue(), - VT.getSimpleVT()); - if (ResultReg == 0) return false; + unsigned ResultReg = + fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, Op1IsKill, + CI->getZExtValue(), VT.getSimpleVT()); + if (!ResultReg) + return false; // We successfully emitted code for the given LLVM Instruction. 
- UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } - unsigned Op0 = getRegForValue(I->getOperand(0)); - if (Op0 == 0) // Unhandled operand. Halt "fast" selection and bail. + if (!Op0) // Unhandled operand. Halt "fast" selection and bail. return false; - bool Op0IsKill = hasTrivialKill(I->getOperand(0)); // Check if the second operand is a constant and handle it appropriately. - if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { uint64_t Imm = CI->getZExtValue(); // Transform "sdiv exact X, 8" -> "sra X, 3". if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) && - cast<BinaryOperator>(I)->isExact() && - isPowerOf2_64(Imm)) { + cast<BinaryOperator>(I)->isExact() && isPowerOf2_64(Imm)) { Imm = Log2_64(Imm); ISDOpcode = ISD::SRA; } @@ -440,54 +440,49 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { ISDOpcode = ISD::AND; } - unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0, + unsigned ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0, Op0IsKill, Imm, VT.getSimpleVT()); - if (ResultReg == 0) return false; + if (!ResultReg) + return false; // We successfully emitted code for the given LLVM Instruction. - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } // Check if the second operand is a constant float. - if (ConstantFP *CF = dyn_cast<ConstantFP>(I->getOperand(1))) { - unsigned ResultReg = FastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(), + if (const auto *CF = dyn_cast<ConstantFP>(I->getOperand(1))) { + unsigned ResultReg = fastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(), ISDOpcode, Op0, Op0IsKill, CF); - if (ResultReg != 0) { + if (ResultReg) { // We successfully emitted code for the given LLVM Instruction. - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } unsigned Op1 = getRegForValue(I->getOperand(1)); - if (Op1 == 0) - // Unhandled operand. Halt "fast" selection and bail. + if (!Op1) // Unhandled operand. Halt "fast" selection and bail. return false; - bool Op1IsKill = hasTrivialKill(I->getOperand(1)); // Now we have both operands in registers. Emit the instruction. - unsigned ResultReg = FastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(), - ISDOpcode, - Op0, Op0IsKill, - Op1, Op1IsKill); - if (ResultReg == 0) + unsigned ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(), + ISDOpcode, Op0, Op0IsKill, Op1, Op1IsKill); + if (!ResultReg) // Target-specific code wasn't able to find a machine opcode for // the given ISD opcode and type. Halt "fast" selection and bail. return false; // We successfully emitted code for the given LLVM Instruction. - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool FastISel::SelectGetElementPtr(const User *I) { +bool FastISel::selectGetElementPtr(const User *I) { unsigned N = getRegForValue(I->getOperand(0)); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. + if (!N) // Unhandled operand. Halt "fast" selection and bail. 
return false; - bool NIsKill = hasTrivialKill(I->getOperand(0)); // Keep a running tab of the total offset to coalesce multiple N = N + Offset @@ -497,18 +492,18 @@ bool FastISel::SelectGetElementPtr(const User *I) { uint64_t MaxOffs = 2048; Type *Ty = I->getOperand(0)->getType(); MVT VT = TLI.getPointerTy(); - for (GetElementPtrInst::const_op_iterator OI = I->op_begin()+1, - E = I->op_end(); OI != E; ++OI) { + for (GetElementPtrInst::const_op_iterator OI = I->op_begin() + 1, + E = I->op_end(); + OI != E; ++OI) { const Value *Idx = *OI; - if (StructType *StTy = dyn_cast<StructType>(Ty)) { + if (auto *StTy = dyn_cast<StructType>(Ty)) { unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); if (Field) { // N = N + Offset TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); if (TotalOffs >= MaxOffs) { - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. + N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; NIsKill = true; TotalOffs = 0; @@ -519,15 +514,15 @@ bool FastISel::SelectGetElementPtr(const User *I) { Ty = cast<SequentialType>(Ty)->getElementType(); // If this is a constant subscript, handle it quickly. - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { - if (CI->isZero()) continue; + if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { + if (CI->isZero()) + continue; // N = N + Offset TotalOffs += - DL.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); + DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue(); if (TotalOffs >= MaxOffs) { - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. + N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; NIsKill = true; TotalOffs = 0; @@ -535,9 +530,8 @@ bool FastISel::SelectGetElementPtr(const User *I) { continue; } if (TotalOffs) { - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. + N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; NIsKill = true; TotalOffs = 0; @@ -548,43 +542,37 @@ bool FastISel::SelectGetElementPtr(const User *I) { std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx); unsigned IdxN = Pair.first; bool IdxNIsKill = Pair.second; - if (IdxN == 0) - // Unhandled operand. Halt "fast" selection and bail. + if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return false; if (ElementSize != 1) { - IdxN = FastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT); - if (IdxN == 0) - // Unhandled operand. Halt "fast" selection and bail. + IdxN = fastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT); + if (!IdxN) // Unhandled operand. Halt "fast" selection and bail. return false; IdxNIsKill = true; } - N = FastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. + N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; } } if (TotalOffs) { - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. 
+ N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; } // We successfully emitted code for the given LLVM Instruction. - UpdateValueMap(I, N); + updateValueMap(I, N); return true; } -/// \brief Add a stackmap or patchpoint intrinsic call's live variable operands -/// to a stackmap or patchpoint machine instruction. bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops, const CallInst *CI, unsigned StartIdx) { for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) { Value *Val = CI->getArgOperand(i); // Check for constants and encode them with a StackMaps::ConstantOp prefix. - if (auto *C = dyn_cast<ConstantInt>(Val)) { + if (const auto *C = dyn_cast<ConstantInt>(Val)) { Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp)); Ops.push_back(MachineOperand::CreateImm(C->getSExtValue())); } else if (isa<ConstantPointerNull>(Val)) { @@ -601,16 +589,15 @@ bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops, return false; } else { unsigned Reg = getRegForValue(Val); - if (Reg == 0) + if (!Reg) return false; Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false)); } } - return true; } -bool FastISel::SelectStackmap(const CallInst *I) { +bool FastISel::selectStackmap(const CallInst *I) { // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>, // [live variables...]) assert(I->getCalledFunction()->getReturnType()->isVoidTy() && @@ -637,7 +624,7 @@ bool FastISel::SelectStackmap(const CallInst *I) { assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) && "Expected a constant integer."); const auto *NumBytes = - cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); + cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue())); // Push live variables for the stack map (skipping the first two arguments @@ -653,13 +640,13 @@ bool FastISel::SelectStackmap(const CallInst *I) { const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); for (unsigned i = 0; ScratchRegs[i]; ++i) Ops.push_back(MachineOperand::CreateReg( - ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, - /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); + ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, + /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(0); + .addImm(0); // Issue STACKMAP. MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -670,7 +657,8 @@ bool FastISel::SelectStackmap(const CallInst *I) { // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(0).addImm(0); + .addImm(0) + .addImm(0); // Inform the Frame Information that we have a stackmap in this function. 
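// Aside: a standalone sketch of the live-operand encoding rule used by
// addStackMapLiveVars above (the SMOperand struct is hypothetical, not the
// real MachineOperand API). Constants are emitted behind a
// StackMaps::ConstantOp marker so the stackmap consumer can tell immediates
// apart from register locations.
#include <vector>

struct SMOperand {
  enum Kind { ConstantMarker, Immediate, Register } K;
  long long Val;
};

static void encodeLiveValue(std::vector<SMOperand> &Ops, bool IsConstant,
                            long long ImmOrReg) {
  if (IsConstant) {
    Ops.push_back({SMOperand::ConstantMarker, 0}); // prefix marker
    Ops.push_back({SMOperand::Immediate, ImmOrReg});
  } else {
    Ops.push_back({SMOperand::Register, ImmOrReg}); // value lives in a reg
  }
}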
FuncInfo.MF->getFrameInfo()->setHasStackMap(); @@ -709,10 +697,10 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx, : CI->getType(); CLI.setCallee(CI->getCallingConv(), RetTy, Callee, std::move(Args), NumArgs); - return LowerCallTo(CLI); + return lowerCallTo(CLI); } -bool FastISel::SelectPatchpoint(const CallInst *I) { +bool FastISel::selectPatchpoint(const CallInst *I) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>, // i32 <numBytes>, // i8* <target>, @@ -728,7 +716,7 @@ bool FastISel::SelectPatchpoint(const CallInst *I) { assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)) && "Expected a constant integer."); const auto *NumArgsVal = - cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)); + cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)); unsigned NumArgs = NumArgsVal->getZExtValue(); // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs> @@ -740,6 +728,7 @@ bool FastISel::SelectPatchpoint(const CallInst *I) { // For AnyRegCC the arguments are lowered later on manually. unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; CallLoweringInfo CLI; + CLI.setIsPatchPoint(); if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI)) return false; @@ -764,12 +753,12 @@ bool FastISel::SelectPatchpoint(const CallInst *I) { assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) && "Expected a constant integer."); const auto *NumBytes = - cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); + cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue())); // Assume that the callee is a constant address or null pointer. // FIXME: handle function symbols in the future. - unsigned CalleeAddr; + uint64_t CalleeAddr; if (const auto *C = dyn_cast<IntToPtrInst>(Callee)) CalleeAddr = cast<ConstantInt>(C->getOperand(0))->getZExtValue(); else if (const auto *C = dyn_cast<ConstantExpr>(Callee)) { @@ -818,8 +807,8 @@ bool FastISel::SelectPatchpoint(const CallInst *I) { const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); for (unsigned i = 0; ScratchRegs[i]; ++i) Ops.push_back(MachineOperand::CreateReg( - ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, - /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); + ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false, + /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true)); // Add implicit defs (return values). for (auto Reg : CLI.InRegs) @@ -842,7 +831,7 @@ bool FastISel::SelectPatchpoint(const CallInst *I) { FuncInfo.MF->getFrameInfo()->setHasPatchPoint(); if (CLI.NumResultRegs) - UpdateValueMap(I, CLI.ResultReg, CLI.NumResultRegs); + updateValueMap(I, CLI.ResultReg, CLI.NumResultRegs); return true; } @@ -861,7 +850,7 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) { Attrs); } -bool FastISel::LowerCallTo(const CallInst *CI, const char *SymName, +bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName, unsigned NumArgs) { ImmutableCallSite CS(CI); @@ -889,10 +878,10 @@ bool FastISel::LowerCallTo(const CallInst *CI, const char *SymName, CallLoweringInfo CLI; CLI.setCallee(RetTy, FTy, SymName, std::move(Args), CS, NumArgs); - return LowerCallTo(CLI); + return lowerCallTo(CLI); } -bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { +bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { // Handle the incoming return values from the call. 
CLI.clearIns(); SmallVector<EVT, 4> RetTys; @@ -901,9 +890,8 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI); - bool CanLowerReturn = TLI.CanLowerReturn(CLI.CallConv, *FuncInfo.MF, - CLI.IsVarArg, Outs, - CLI.RetTy->getContext()); + bool CanLowerReturn = TLI.CanLowerReturn( + CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext()); // FIXME: sret demotion isn't supported yet - bail out. if (!CanLowerReturn) @@ -932,23 +920,23 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { CLI.clearOuts(); for (auto &Arg : CLI.getArgs()) { Type *FinalType = Arg.Ty; - if (Arg.isByVal) + if (Arg.IsByVal) FinalType = cast<PointerType>(Arg.Ty)->getElementType(); bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( - FinalType, CLI.CallConv, CLI.IsVarArg); + FinalType, CLI.CallConv, CLI.IsVarArg); ISD::ArgFlagsTy Flags; - if (Arg.isZExt) + if (Arg.IsZExt) Flags.setZExt(); - if (Arg.isSExt) + if (Arg.IsSExt) Flags.setSExt(); - if (Arg.isInReg) + if (Arg.IsInReg) Flags.setInReg(); - if (Arg.isSRet) + if (Arg.IsSRet) Flags.setSRet(); - if (Arg.isByVal) + if (Arg.IsByVal) Flags.setByVal(); - if (Arg.isInAlloca) { + if (Arg.IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about // inalloca. This way we can know how many bytes we should've allocated @@ -957,7 +945,7 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { // the various CC lowering callbacks. Flags.setByVal(); } - if (Arg.isByVal || Arg.isInAlloca) { + if (Arg.IsByVal || Arg.IsInAlloca) { PointerType *Ty = cast<PointerType>(Arg.Ty); Type *ElementTy = Ty->getElementType(); unsigned FrameSize = DL.getTypeAllocSize(ElementTy); @@ -969,7 +957,7 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { Flags.setByValSize(FrameSize); Flags.setByValAlign(FrameAlign); } - if (Arg.isNest) + if (Arg.IsNest) Flags.setNest(); if (NeedsRegBlock) Flags.setInConsecutiveRegs(); @@ -980,7 +968,7 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { CLI.OutFlags.push_back(Flags); } - if (!FastLowerCall(CLI)) + if (!fastLowerCall(CLI)) return false; // Set all unused physreg defs as dead. @@ -988,12 +976,12 @@ bool FastISel::LowerCallTo(CallLoweringInfo &CLI) { CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI); if (CLI.NumResultRegs && CLI.CS) - UpdateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs); + updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs); return true; } -bool FastISel::LowerCall(const CallInst *CI) { +bool FastISel::lowerCall(const CallInst *CI) { ImmutableCallSite CS(CI); PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); @@ -1021,19 +1009,19 @@ bool FastISel::LowerCall(const CallInst *CI) { } // Check if target-independent constraints permit a tail call here. - // Target-dependent constraints are checked within FastLowerCall. + // Target-dependent constraints are checked within fastLowerCall. bool IsTailCall = CI->isTailCall(); if (IsTailCall && !isInTailCallPosition(CS, TM)) IsTailCall = false; CallLoweringInfo CLI; CLI.setCallee(RetTy, FuncTy, CI->getCalledValue(), std::move(Args), CS) - .setTailCall(IsTailCall); + .setTailCall(IsTailCall); - return LowerCallTo(CLI); + return lowerCallTo(CLI); } -bool FastISel::SelectCall(const User *I) { +bool FastISel::selectCall(const User *I) { const CallInst *Call = cast<CallInst>(I); // Handle simple inline asms. 
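Aside: the hunk above is largely mechanical (lowercased entry points and the renamed Is* flag members), but the loop it rewrites is the one place FastISel translates per-argument IR attributes into the flag word that calling-convention lowering consumes. A hedged standalone illustration of that pattern, in plain C++ with made-up names rather than the real ISD::ArgFlagsTy API:

#include <cstdint>

enum ArgFlagBits : uint32_t {
  FlagSExt = 1u << 0, FlagZExt = 1u << 1, FlagInReg = 1u << 2,
  FlagSRet = 1u << 3, FlagByVal = 1u << 4, FlagNest = 1u << 5,
};

struct ArgEntry { bool IsSExt, IsZExt, IsInReg, IsSRet, IsByVal, IsNest; };

static uint32_t lowerArgFlags(const ArgEntry &Arg) {
  uint32_t Flags = 0;
  if (Arg.IsSExt)  Flags |= FlagSExt;
  if (Arg.IsZExt)  Flags |= FlagZExt;
  if (Arg.IsInReg) Flags |= FlagInReg;
  if (Arg.IsSRet)  Flags |= FlagSRet;
  if (Arg.IsByVal) Flags |= FlagByVal;
  if (Arg.IsNest)  Flags |= FlagNest;
  return Flags;
}

The real loop additionally records byval size and alignment separately, since the CC callbacks need the argument's frame footprint, not just its attribute bits.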
@@ -1055,8 +1043,8 @@ bool FastISel::SelectCall(const User *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::INLINEASM)) - .addExternalSymbol(IA->getAsmString().c_str()) - .addImm(ExtraInfo); + .addExternalSymbol(IA->getAsmString().c_str()) + .addImm(ExtraInfo); return true; } @@ -1065,7 +1053,7 @@ bool FastISel::SelectCall(const User *I) { // Handle intrinsic function calls. if (const auto *II = dyn_cast<IntrinsicInst>(Call)) - return SelectIntrinsicCall(II); + return selectIntrinsicCall(II); // Usually, it does not make sense to initialize a value, // make an unrelated function call and use the value, because @@ -1076,12 +1064,13 @@ bool FastISel::SelectCall(const User *I) { // since they tend to be inlined. flushLocalValueMap(); - return LowerCall(Call); + return lowerCall(Call); } -bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) { +bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { switch (II->getIntrinsicID()) { - default: break; + default: + break; // At -O0 we don't care about the lifetime intrinsics. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: @@ -1106,7 +1095,7 @@ bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) { unsigned Offset = 0; Optional<MachineOperand> Op; - if (const Argument *Arg = dyn_cast<Argument>(Address)) + if (const auto *Arg = dyn_cast<Argument>(Address)) // Some arguments' frame index is recorded during argument lowering. Offset = FuncInfo.getArgumentFrameIndex(Arg); if (Offset) @@ -1137,13 +1126,14 @@ bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) { Op->setIsDebug(true); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0, - DI->getVariable()); + DI->getVariable(), DI->getExpression()); } else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op) - .addImm(0) - .addMetadata(DI->getVariable()); + .addOperand(*Op) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); } else { // We can't yet handle anything else here because it would require // generating code, thus altering codegen because of debug info. @@ -1160,26 +1150,34 @@ bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) { // Currently the optimizer can produce this; insert an undef to // help debugging. Probably the optimizer should not do this. 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(0U).addImm(DI->getOffset()) - .addMetadata(DI->getVariable()); - } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { + .addReg(0U) + .addImm(DI->getOffset()) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); + } else if (const auto *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addCImm(CI).addImm(DI->getOffset()) - .addMetadata(DI->getVariable()); + .addCImm(CI) + .addImm(DI->getOffset()) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addImm(CI->getZExtValue()).addImm(DI->getOffset()) - .addMetadata(DI->getVariable()); - } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) { + .addImm(CI->getZExtValue()) + .addImm(DI->getOffset()) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); + } else if (const auto *CF = dyn_cast<ConstantFP>(V)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addFPImm(CF).addImm(DI->getOffset()) - .addMetadata(DI->getVariable()); + .addFPImm(CF) + .addImm(DI->getOffset()) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); } else if (unsigned Reg = lookUpRegForValue(V)) { // FIXME: This does not handle register-indirect values at offset 0. bool IsIndirect = DI->getOffset() != 0; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect, - Reg, DI->getOffset(), DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect, Reg, + DI->getOffset(), DI->getVariable(), DI->getExpression()); } else { // We can't yet handle anything else here because it would require // generating code, thus altering codegen because of debug info. @@ -1192,34 +1190,34 @@ bool FastISel::SelectIntrinsicCall(const IntrinsicInst *II) { unsigned long long Res = CI->isZero() ? -1ULL : 0; Constant *ResCI = ConstantInt::get(II->getType(), Res); unsigned ResultReg = getRegForValue(ResCI); - if (ResultReg == 0) + if (!ResultReg) return false; - UpdateValueMap(II, ResultReg); + updateValueMap(II, ResultReg); return true; } case Intrinsic::expect: { unsigned ResultReg = getRegForValue(II->getArgOperand(0)); - if (ResultReg == 0) + if (!ResultReg) return false; - UpdateValueMap(II, ResultReg); + updateValueMap(II, ResultReg); return true; } case Intrinsic::experimental_stackmap: - return SelectStackmap(II); + return selectStackmap(II); case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: - return SelectPatchpoint(II); + return selectPatchpoint(II); } - return FastLowerIntrinsicCall(II); + return fastLowerIntrinsicCall(II); } -bool FastISel::SelectCast(const User *I, unsigned Opcode) { +bool FastISel::selectCast(const User *I, unsigned Opcode) { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); - if (SrcVT == MVT::Other || !SrcVT.isSimple() || - DstVT == MVT::Other || !DstVT.isSimple()) + if (SrcVT == MVT::Other || !SrcVT.isSimple() || DstVT == MVT::Other || + !DstVT.isSimple()) // Unhandled type. Halt "fast" selection and bail. 
return false; @@ -1238,24 +1236,22 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) { bool InputRegIsKill = hasTrivialKill(I->getOperand(0)); - unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(), - DstVT.getSimpleVT(), - Opcode, - InputReg, InputRegIsKill); + unsigned ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), + Opcode, InputReg, InputRegIsKill); if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool FastISel::SelectBitCast(const User *I) { +bool FastISel::selectBitCast(const User *I) { // If the bitcast doesn't change the type, just use the operand value. if (I->getType() == I->getOperand(0)->getType()) { unsigned Reg = getRegForValue(I->getOperand(0)); - if (Reg == 0) + if (!Reg) return false; - UpdateValueMap(I, Reg); + updateValueMap(I, Reg); return true; } @@ -1270,17 +1266,15 @@ bool FastISel::SelectBitCast(const User *I) { MVT SrcVT = SrcEVT.getSimpleVT(); MVT DstVT = DstEVT.getSimpleVT(); unsigned Op0 = getRegForValue(I->getOperand(0)); - if (Op0 == 0) - // Unhandled operand. Halt "fast" selection and bail. + if (!Op0) // Unhandled operand. Halt "fast" selection and bail. return false; - bool Op0IsKill = hasTrivialKill(I->getOperand(0)); // First, try to perform the bitcast by inserting a reg-reg copy. unsigned ResultReg = 0; if (SrcVT == DstVT) { - const TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT); - const TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT); + const TargetRegisterClass *SrcClass = TLI.getRegClassFor(SrcVT); + const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT); // Don't attempt a cross-class copy. It will likely fail. if (SrcClass == DstClass) { ResultReg = createResultReg(DstClass); @@ -1291,28 +1285,27 @@ bool FastISel::SelectBitCast(const User *I) { // If the reg-reg copy failed, select a BITCAST opcode. if (!ResultReg) - ResultReg = FastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill); + ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill); if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool -FastISel::SelectInstruction(const Instruction *I) { +bool FastISel::selectInstruction(const Instruction *I) { // Just before the terminator instruction, insert instructions to // feed PHI nodes in successor blocks. if (isa<TerminatorInst>(I)) - if (!HandlePHINodesInSuccessorBlocks(I->getParent())) + if (!handlePHINodesInSuccessorBlocks(I->getParent())) return false; DbgLoc = I->getDebugLoc(); - MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt; + SavedInsertPt = FuncInfo.InsertPt; - if (const CallInst *Call = dyn_cast<CallInst>(I)) { + if (const auto *Call = dyn_cast<CallInst>(I)) { const Function *F = Call->getCalledFunction(); LibFunc::Func Func; @@ -1330,40 +1323,39 @@ FastISel::SelectInstruction(const Instruction *I) { } // First, try doing target-independent selection. - if (SelectOperator(I, I->getOpcode())) { - ++NumFastIselSuccessIndependent; - DbgLoc = DebugLoc(); - return true; - } - // Remove dead code. However, ignore call instructions since we've flushed - // the local value map and recomputed the insert point. - if (!isa<CallInst>(I)) { + if (!SkipTargetIndependentISel) { + if (selectOperator(I, I->getOpcode())) { + ++NumFastIselSuccessIndependent; + DbgLoc = DebugLoc(); + return true; + } + // Remove dead code. 
recomputeInsertPt(); if (SavedInsertPt != FuncInfo.InsertPt) removeDeadCode(FuncInfo.InsertPt, SavedInsertPt); + SavedInsertPt = FuncInfo.InsertPt; } - // Next, try calling the target to attempt to handle the instruction. - SavedInsertPt = FuncInfo.InsertPt; - if (TargetSelectInstruction(I)) { + if (fastSelectInstruction(I)) { ++NumFastIselSuccessTarget; DbgLoc = DebugLoc(); return true; } - // Check for dead code and remove as necessary. + // Remove dead code. recomputeInsertPt(); if (SavedInsertPt != FuncInfo.InsertPt) removeDeadCode(FuncInfo.InsertPt, SavedInsertPt); DbgLoc = DebugLoc(); + // Undo phi node updates, because they will be added again by SelectionDAG. + if (isa<TerminatorInst>(I)) + FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate); return false; } -/// FastEmitBranch - Emit an unconditional branch to the given block, -/// unless it is the immediate (fall-through) successor, and update -/// the CFG. -void -FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { +/// Emit an unconditional branch to the given block, unless it is the immediate +/// (fall-through) successor, and update the CFG. +void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { if (FuncInfo.MBB->getBasicBlock()->size() > 1 && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only instruction @@ -1381,54 +1373,51 @@ FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { FuncInfo.MBB->addSuccessor(MSucc, BranchWeight); } -/// SelectFNeg - Emit an FNeg operation. -/// -bool -FastISel::SelectFNeg(const User *I) { +/// Emit an FNeg operation. +bool FastISel::selectFNeg(const User *I) { unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I)); - if (OpReg == 0) return false; - + if (!OpReg) + return false; bool OpRegIsKill = hasTrivialKill(I); // If the target has ISD::FNEG, use it. EVT VT = TLI.getValueType(I->getType()); - unsigned ResultReg = FastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), - ISD::FNEG, OpReg, OpRegIsKill); - if (ResultReg != 0) { - UpdateValueMap(I, ResultReg); + unsigned ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG, + OpReg, OpRegIsKill); + if (ResultReg) { + updateValueMap(I, ResultReg); return true; } // Bitcast the value to integer, twiddle the sign bit with xor, // and then bitcast it back to floating-point. 
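// Aside: the sign-bit trick described above, in standalone form (a sketch
// assuming IEEE-754 doubles; std::bit_cast needs C++20, memcpy works on
// older standards).
#include <bit>
#include <cstdint>

static double fnegViaXor(double X) {
  uint64_t Bits = std::bit_cast<uint64_t>(X); // bitcast FP -> integer
  Bits ^= UINT64_C(1) << 63;                  // flip only the sign bit
  return std::bit_cast<double>(Bits);         // bitcast back to FP
}
// Unlike "0.0 - X", this also flips the sign of +0.0 and of NaNs, which is
// what makes it a faithful FNEG when the target has no native ISD::FNEG.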
- if (VT.getSizeInBits() > 64) return false; + if (VT.getSizeInBits() > 64) + return false; EVT IntVT = EVT::getIntegerVT(I->getContext(), VT.getSizeInBits()); if (!TLI.isTypeLegal(IntVT)) return false; - unsigned IntReg = FastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(), + unsigned IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(), ISD::BITCAST, OpReg, OpRegIsKill); - if (IntReg == 0) + if (!IntReg) return false; - unsigned IntResultReg = FastEmit_ri_(IntVT.getSimpleVT(), ISD::XOR, - IntReg, /*Kill=*/true, - UINT64_C(1) << (VT.getSizeInBits()-1), - IntVT.getSimpleVT()); - if (IntResultReg == 0) + unsigned IntResultReg = fastEmit_ri_( + IntVT.getSimpleVT(), ISD::XOR, IntReg, /*IsKill=*/true, + UINT64_C(1) << (VT.getSizeInBits() - 1), IntVT.getSimpleVT()); + if (!IntResultReg) return false; - ResultReg = FastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(), - ISD::BITCAST, IntResultReg, /*Kill=*/true); - if (ResultReg == 0) + ResultReg = fastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(), ISD::BITCAST, + IntResultReg, /*IsKill=*/true); + if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool -FastISel::SelectExtractValue(const User *U) { +bool FastISel::selectExtractValue(const User *U) { const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U); if (!EVI) return false; @@ -1464,55 +1453,54 @@ FastISel::SelectExtractValue(const User *U) { for (unsigned i = 0; i < VTIndex; i++) ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]); - UpdateValueMap(EVI, ResultReg); + updateValueMap(EVI, ResultReg); return true; } -bool -FastISel::SelectOperator(const User *I, unsigned Opcode) { +bool FastISel::selectOperator(const User *I, unsigned Opcode) { switch (Opcode) { case Instruction::Add: - return SelectBinaryOp(I, ISD::ADD); + return selectBinaryOp(I, ISD::ADD); case Instruction::FAdd: - return SelectBinaryOp(I, ISD::FADD); + return selectBinaryOp(I, ISD::FADD); case Instruction::Sub: - return SelectBinaryOp(I, ISD::SUB); + return selectBinaryOp(I, ISD::SUB); case Instruction::FSub: // FNeg is currently represented in LLVM IR as a special case of FSub. 
if (BinaryOperator::isFNeg(I)) - return SelectFNeg(I); - return SelectBinaryOp(I, ISD::FSUB); + return selectFNeg(I); + return selectBinaryOp(I, ISD::FSUB); case Instruction::Mul: - return SelectBinaryOp(I, ISD::MUL); + return selectBinaryOp(I, ISD::MUL); case Instruction::FMul: - return SelectBinaryOp(I, ISD::FMUL); + return selectBinaryOp(I, ISD::FMUL); case Instruction::SDiv: - return SelectBinaryOp(I, ISD::SDIV); + return selectBinaryOp(I, ISD::SDIV); case Instruction::UDiv: - return SelectBinaryOp(I, ISD::UDIV); + return selectBinaryOp(I, ISD::UDIV); case Instruction::FDiv: - return SelectBinaryOp(I, ISD::FDIV); + return selectBinaryOp(I, ISD::FDIV); case Instruction::SRem: - return SelectBinaryOp(I, ISD::SREM); + return selectBinaryOp(I, ISD::SREM); case Instruction::URem: - return SelectBinaryOp(I, ISD::UREM); + return selectBinaryOp(I, ISD::UREM); case Instruction::FRem: - return SelectBinaryOp(I, ISD::FREM); + return selectBinaryOp(I, ISD::FREM); case Instruction::Shl: - return SelectBinaryOp(I, ISD::SHL); + return selectBinaryOp(I, ISD::SHL); case Instruction::LShr: - return SelectBinaryOp(I, ISD::SRL); + return selectBinaryOp(I, ISD::SRL); case Instruction::AShr: - return SelectBinaryOp(I, ISD::SRA); + return selectBinaryOp(I, ISD::SRA); case Instruction::And: - return SelectBinaryOp(I, ISD::AND); + return selectBinaryOp(I, ISD::AND); case Instruction::Or: - return SelectBinaryOp(I, ISD::OR); + return selectBinaryOp(I, ISD::OR); case Instruction::Xor: - return SelectBinaryOp(I, ISD::XOR); + return selectBinaryOp(I, ISD::XOR); case Instruction::GetElementPtr: - return SelectGetElementPtr(I); + return selectGetElementPtr(I); case Instruction::Br: { const BranchInst *BI = cast<BranchInst>(I); @@ -1520,7 +1508,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { if (BI->isUnconditional()) { const BasicBlock *LLVMSucc = BI->getSuccessor(0); MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc]; - FastEmitBranch(MSucc, BI->getDebugLoc()); + fastEmitBranch(MSucc, BI->getDebugLoc()); return true; } @@ -1531,7 +1519,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { case Instruction::Unreachable: if (TM.Options.TrapUnreachable) - return FastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0; + return fastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0; else return true; @@ -1544,38 +1532,39 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { return false; case Instruction::Call: - return SelectCall(I); + return selectCall(I); case Instruction::BitCast: - return SelectBitCast(I); + return selectBitCast(I); case Instruction::FPToSI: - return SelectCast(I, ISD::FP_TO_SINT); + return selectCast(I, ISD::FP_TO_SINT); case Instruction::ZExt: - return SelectCast(I, ISD::ZERO_EXTEND); + return selectCast(I, ISD::ZERO_EXTEND); case Instruction::SExt: - return SelectCast(I, ISD::SIGN_EXTEND); + return selectCast(I, ISD::SIGN_EXTEND); case Instruction::Trunc: - return SelectCast(I, ISD::TRUNCATE); + return selectCast(I, ISD::TRUNCATE); case Instruction::SIToFP: - return SelectCast(I, ISD::SINT_TO_FP); + return selectCast(I, ISD::SINT_TO_FP); case Instruction::IntToPtr: // Deliberate fall-through. 
case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); if (DstVT.bitsGT(SrcVT)) - return SelectCast(I, ISD::ZERO_EXTEND); + return selectCast(I, ISD::ZERO_EXTEND); if (DstVT.bitsLT(SrcVT)) - return SelectCast(I, ISD::TRUNCATE); + return selectCast(I, ISD::TRUNCATE); unsigned Reg = getRegForValue(I->getOperand(0)); - if (Reg == 0) return false; - UpdateValueMap(I, Reg); + if (!Reg) + return false; + updateValueMap(I, Reg); return true; } case Instruction::ExtractValue: - return SelectExtractValue(I); + return selectExtractValue(I); case Instruction::PHI: llvm_unreachable("FastISel shouldn't visit PHI nodes!"); @@ -1586,91 +1575,72 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { } } -FastISel::FastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) - : FuncInfo(funcInfo), - MF(funcInfo.MF), - MRI(FuncInfo.MF->getRegInfo()), - MFI(*FuncInfo.MF->getFrameInfo()), - MCP(*FuncInfo.MF->getConstantPool()), - TM(FuncInfo.MF->getTarget()), - DL(*TM.getDataLayout()), - TII(*TM.getInstrInfo()), - TLI(*TM.getTargetLowering()), - TRI(*TM.getRegisterInfo()), - LibInfo(libInfo) { -} +FastISel::FastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo, + bool SkipTargetIndependentISel) + : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()), + MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()), + TM(FuncInfo.MF->getTarget()), DL(*MF->getSubtarget().getDataLayout()), + TII(*MF->getSubtarget().getInstrInfo()), + TLI(*MF->getSubtarget().getTargetLowering()), + TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo), + SkipTargetIndependentISel(SkipTargetIndependentISel) {} FastISel::~FastISel() {} -bool FastISel::FastLowerArguments() { - return false; -} +bool FastISel::fastLowerArguments() { return false; } -bool FastISel::FastLowerCall(CallLoweringInfo &/*CLI*/) { - return false; -} +bool FastISel::fastLowerCall(CallLoweringInfo & /*CLI*/) { return false; } -bool FastISel::FastLowerIntrinsicCall(const IntrinsicInst * /*II*/) { +bool FastISel::fastLowerIntrinsicCall(const IntrinsicInst * /*II*/) { return false; } -unsigned FastISel::FastEmit_(MVT, MVT, - unsigned) { - return 0; -} +unsigned FastISel::fastEmit_(MVT, MVT, unsigned) { return 0; } -unsigned FastISel::FastEmit_r(MVT, MVT, - unsigned, - unsigned /*Op0*/, bool /*Op0IsKill*/) { +unsigned FastISel::fastEmit_r(MVT, MVT, unsigned, unsigned /*Op0*/, + bool /*Op0IsKill*/) { return 0; } -unsigned FastISel::FastEmit_rr(MVT, MVT, - unsigned, - unsigned /*Op0*/, bool /*Op0IsKill*/, - unsigned /*Op1*/, bool /*Op1IsKill*/) { +unsigned FastISel::fastEmit_rr(MVT, MVT, unsigned, unsigned /*Op0*/, + bool /*Op0IsKill*/, unsigned /*Op1*/, + bool /*Op1IsKill*/) { return 0; } -unsigned FastISel::FastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) { +unsigned FastISel::fastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) { return 0; } -unsigned FastISel::FastEmit_f(MVT, MVT, - unsigned, const ConstantFP * /*FPImm*/) { +unsigned FastISel::fastEmit_f(MVT, MVT, unsigned, + const ConstantFP * /*FPImm*/) { return 0; } -unsigned FastISel::FastEmit_ri(MVT, MVT, - unsigned, - unsigned /*Op0*/, bool /*Op0IsKill*/, - uint64_t /*Imm*/) { +unsigned FastISel::fastEmit_ri(MVT, MVT, unsigned, unsigned /*Op0*/, + bool /*Op0IsKill*/, uint64_t /*Imm*/) { return 0; } -unsigned FastISel::FastEmit_rf(MVT, MVT, - unsigned, - unsigned /*Op0*/, bool /*Op0IsKill*/, +unsigned FastISel::fastEmit_rf(MVT, MVT, unsigned, unsigned 
/*Op0*/, + bool /*Op0IsKill*/, const ConstantFP * /*FPImm*/) { return 0; } -unsigned FastISel::FastEmit_rri(MVT, MVT, - unsigned, - unsigned /*Op0*/, bool /*Op0IsKill*/, - unsigned /*Op1*/, bool /*Op1IsKill*/, - uint64_t /*Imm*/) { +unsigned FastISel::fastEmit_rri(MVT, MVT, unsigned, unsigned /*Op0*/, + bool /*Op0IsKill*/, unsigned /*Op1*/, + bool /*Op1IsKill*/, uint64_t /*Imm*/) { return 0; } -/// FastEmit_ri_ - This method is a wrapper of FastEmit_ri. It first tries -/// to emit an instruction with an immediate operand using FastEmit_ri. +/// This method is a wrapper of fastEmit_ri. It first tries to emit an +/// instruction with an immediate operand using fastEmit_ri. /// If that fails, it materializes the immediate into a register and tries -/// FastEmit_rr instead. -unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode, - unsigned Op0, bool Op0IsKill, - uint64_t Imm, MVT ImmType) { +/// fastEmit_rr instead. +unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0, + bool Op0IsKill, uint64_t Imm, MVT ImmType) { // If this is a multiply by a power of two, emit this as a shift left. if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) { Opcode = ISD::SHL; @@ -1688,30 +1658,29 @@ unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode, return 0; // First check if immediate type is legal. If not, we can't use the ri form. - unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm); - if (ResultReg != 0) + unsigned ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm); + if (ResultReg) return ResultReg; - unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm); - if (MaterialReg == 0) { + unsigned MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm); + if (!MaterialReg) { // This is a bit ugly/slow, but failing here means falling out of // fast-isel, which would be very slow.
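// Aside: a standalone sketch of the strength reduction performed at the top
// of fastEmit_ri_ above (plain C++; isPow2 and log2Exact are stand-ins for
// the real isPowerOf2_64/Log2_64 helpers): a multiply by a power of two is
// rewritten as a left shift by its log2.
#include <cstdint>

static bool isPow2(uint64_t X) { return X && (X & (X - 1)) == 0; }

static unsigned log2Exact(uint64_t X) { // X must be a power of two
  unsigned L = 0;
  while (X >>= 1)
    ++L;
  return L;
}

static uint64_t mulPow2(uint64_t V, uint64_t Imm) {
  return isPow2(Imm) ? V << log2Exact(Imm) : V * Imm; // V * 8 == V << 3
}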
- IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(), - VT.getSizeInBits()); + IntegerType *ITy = + IntegerType::get(FuncInfo.Fn->getContext(), VT.getSizeInBits()); MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm)); - assert (MaterialReg != 0 && "Unable to materialize imm."); - if (MaterialReg == 0) return 0; + if (!MaterialReg) + return 0; } - return FastEmit_rr(VT, VT, Opcode, - Op0, Op0IsKill, - MaterialReg, /*Kill=*/true); + return fastEmit_rr(VT, VT, Opcode, Op0, Op0IsKill, MaterialReg, + /*IsKill=*/true); } -unsigned FastISel::createResultReg(const TargetRegisterClass* RC) { +unsigned FastISel::createResultReg(const TargetRegisterClass *RC) { return MRI.createVirtualRegister(RC); } -unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, - unsigned Op, unsigned OpNum) { +unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op, + unsigned OpNum) { if (TargetRegisterInfo::isVirtualRegister(Op)) { const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); @@ -1727,8 +1696,8 @@ unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, return Op; } -unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, - const TargetRegisterClass* RC) { +unsigned FastISel::fastEmitInst_(unsigned MachineInstOpcode, + const TargetRegisterClass *RC) { unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); @@ -1736,9 +1705,9 @@ unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill) { +unsigned FastISel::fastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1746,10 +1715,10 @@ unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } @@ -1757,10 +1726,10 @@ unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill) { +unsigned FastISel::fastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, unsigned Op1, + bool Op1IsKill) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1769,23 +1738,23 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)); 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill, - unsigned Op2, bool Op2IsKill) { +unsigned FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, unsigned Op1, + bool Op1IsKill, unsigned Op2, + bool Op2IsKill) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1795,48 +1764,46 @@ unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addReg(Op2, Op2IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addReg(Op2, getKillRegState(Op2IsKill)); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addReg(Op2, Op2IsKill * RegState::Kill); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addReg(Op2, getKillRegState(Op2IsKill)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - uint64_t Imm) { +unsigned FastISel::fastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, uint64_t Imm) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); - RC = TII.getRegClass(II, II.getNumDefs(), &TRI, *FuncInfo.MF); - MRI.constrainRegClass(Op0, RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Imm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(Imm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Imm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(Imm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_rii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - uint64_t Imm1, uint64_t Imm2) { +unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, uint64_t Imm1, + uint64_t Imm2) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1844,24 +1811,23 @@ unsigned FastISel::FastEmitInst_rii(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Imm1) - .addImm(Imm2); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(Imm1) + .addImm(Imm2); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Imm1) - .addImm(Imm2); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(Imm1) + 
.addImm(Imm2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - const ConstantFP *FPImm) { +unsigned FastISel::fastEmitInst_rf(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, const ConstantFP *FPImm) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1869,23 +1835,22 @@ unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addFPImm(FPImm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addFPImm(FPImm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addFPImm(FPImm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addFPImm(FPImm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill, - uint64_t Imm) { +unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, unsigned Op1, + bool Op1IsKill, uint64_t Imm) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1894,25 +1859,25 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addImm(Imm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addImm(Imm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addImm(Imm); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addImm(Imm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode, +unsigned FastISel::fastEmitInst_rrii(unsigned MachineInstOpcode, const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill, - uint64_t Imm1, uint64_t Imm2) { + unsigned Op0, bool Op0IsKill, unsigned Op1, + bool Op1IsKill, uint64_t Imm1, + uint64_t Imm2) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); @@ -1921,28 +1886,30 @@ unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode, if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addImm(Imm1).addImm(Imm2); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, getKillRegState(Op1IsKill)) + .addImm(Imm1) + .addImm(Imm2); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addReg(Op1, Op1IsKill * RegState::Kill) - .addImm(Imm1).addImm(Imm2); + .addReg(Op0, getKillRegState(Op0IsKill)) + .addReg(Op1, 
getKillRegState(Op1IsKill)) + .addImm(Imm1) + .addImm(Imm2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - uint64_t Imm) { +unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, uint64_t Imm) { unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg).addImm(Imm); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addImm(Imm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1951,41 +1918,41 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::FastEmitInst_ii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - uint64_t Imm1, uint64_t Imm2) { +unsigned FastISel::fastEmitInst_ii(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, uint64_t Imm1, + uint64_t Imm2) { unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addImm(Imm1).addImm(Imm2); + .addImm(Imm1) + .addImm(Imm2); else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1).addImm(Imm2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1) + .addImm(Imm2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } -unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT, - unsigned Op0, bool Op0IsKill, - uint32_t Idx) { +unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, + bool Op0IsKill, uint32_t Idx) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); assert(TargetRegisterInfo::isVirtualRegister(Op0) && "Cannot yet extract from physregs"); const TargetRegisterClass *RC = MRI.getRegClass(Op0); MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill), Idx); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Op0, getKillRegState(Op0IsKill), Idx); return ResultReg; } -/// FastEmitZExtFromI1 - Emit MachineInstrs to compute the value of Op -/// with all but the least significant bit set to zero. -unsigned FastISel::FastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) { - return FastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1); +/// Emit MachineInstrs to compute the value of Op with all but the least +/// significant bit set to zero. +unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) { + return fastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1); } /// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks. @@ -1994,22 +1961,24 @@ unsigned FastISel::FastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) { /// nodes as input. We cannot just directly add them, because expansion /// might result in multiple MBB's for one BB. As such, the start of the /// BB might correspond to a different MBB than the end. 
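// Aside: a standalone sketch (hypothetical types, not the real
// FunctionLoweringInfo API) of the record-and-roll-back pattern the function
// below uses: PHI updates are appended speculatively and truncated back to
// the original count whenever selection of the terminator bails out, so
// SelectionDAG can redo them from scratch.
#include <cstddef>
#include <utility>
#include <vector>

struct PHIUpdateLog {
  std::vector<std::pair<int, unsigned>> Updates; // (machine PHI, vreg)
  std::size_t Mark = 0;                          // OrigNumPHINodesToUpdate

  void begin() { Mark = Updates.size(); }        // before handling a block
  void record(int PHI, unsigned Reg) { Updates.emplace_back(PHI, Reg); }
  void rollback() { Updates.resize(Mark); }      // undo on selection failure
};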
-bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { +bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { const TerminatorInst *TI = LLVMBB->getTerminator(); SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled; - unsigned OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size(); + FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size(); // Check successor nodes' PHI nodes that expect a constant to be available // from this block. for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) { const BasicBlock *SuccBB = TI->getSuccessor(succ); - if (!isa<PHINode>(SuccBB->begin())) continue; + if (!isa<PHINode>(SuccBB->begin())) + continue; MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB]; // If this terminator has multiple identical successors (common for // switches), only handle each succ once. - if (!SuccsHandled.insert(SuccMBB)) continue; + if (!SuccsHandled.insert(SuccMBB).second) + continue; MachineBasicBlock::iterator MBBI = SuccMBB->begin(); @@ -2017,10 +1986,11 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // nodes and Machine PHI nodes, but the incoming operands have not been // emitted yet. for (BasicBlock::const_iterator I = SuccBB->begin(); - const PHINode *PN = dyn_cast<PHINode>(I); ++I) { + const auto *PN = dyn_cast<PHINode>(I); ++I) { // Ignore dead phi's. - if (PN->use_empty()) continue; + if (PN->use_empty()) + continue; // Only handle legal types. Two interesting things to note here. First, // by bailing out early, we may leave behind some dead instructions, @@ -2031,10 +2001,8 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true); if (VT == MVT::Other || !TLI.isTypeLegal(VT)) { // Handle integer promotions, though, because they're common and easy. - if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) - VT = TLI.getTypeToTransformTo(LLVMBB->getContext(), VT); - else { - FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) { + FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate); return false; } } @@ -2044,12 +2012,12 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // Set the DebugLoc for the copy. Prefer the location of the operand // if there is one; use the location of the PHI otherwise. DbgLoc = PN->getDebugLoc(); - if (const Instruction *Inst = dyn_cast<Instruction>(PHIOp)) + if (const auto *Inst = dyn_cast<Instruction>(PHIOp)) DbgLoc = Inst->getDebugLoc(); unsigned Reg = getRegForValue(PHIOp); - if (Reg == 0) { - FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + if (!Reg) { + FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate); return false; } FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg)); @@ -2062,17 +2030,17 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) { assert(LI->hasOneUse() && - "tryToFoldLoad expected a LoadInst with a single use"); + "tryToFoldLoad expected a LoadInst with a single use"); // We know that the load has a single use, but don't know what it is. If it // isn't one of the folded instructions, then we can't succeed here. Handle // this by scanning the single-use users of the load until we get to FoldInst. - unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs. 
+ unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs. const Instruction *TheUser = LI->user_back(); - while (TheUser != FoldInst && // Scan up until we find FoldInst. + while (TheUser != FoldInst && // Scan up until we find FoldInst. // Stay in the right block. TheUser->getParent() == FoldInst->getParent() && - --MaxUsers) { // Don't scan too far. + --MaxUsers) { // Don't scan too far. // If there are multiple or no uses of this instruction, then bail out. if (!TheUser->hasOneUse()) return false; @@ -2094,7 +2062,7 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) { // then there actually was no reference to it. Perhaps the load is referenced // by a dead instruction. unsigned LoadReg = getRegForValue(LI); - if (LoadReg == 0) + if (!LoadReg) return false; // We can't fold if this vreg has no uses or more than one use. Multiple uses @@ -2152,19 +2120,20 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const { Flags = MachineMemOperand::MOStore; Ptr = SI->getPointerOperand(); ValTy = SI->getValueOperand()->getType(); - } else { + } else return nullptr; - } - bool IsNonTemporal = I->getMetadata("nontemporal") != nullptr; - bool IsInvariant = I->getMetadata("invariant.load") != nullptr; - const MDNode *TBAAInfo = I->getMetadata(LLVMContext::MD_tbaa); + bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal) != nullptr; + bool IsInvariant = I->getMetadata(LLVMContext::MD_invariant_load) != nullptr; const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range); - if (Alignment == 0) // Ensure that codegen never sees alignment 0. + AAMDNodes AAInfo; + I->getAAMetadata(AAInfo); + + if (Alignment == 0) // Ensure that codegen never sees alignment 0. Alignment = DL.getABITypeAlignment(ValTy); - unsigned Size = TM.getDataLayout()->getTypeStoreSize(ValTy); + unsigned Size = DL.getTypeStoreSize(ValTy); if (IsVolatile) Flags |= MachineMemOperand::MOVolatile; @@ -2174,5 +2143,45 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const { Flags |= MachineMemOperand::MOInvariant; return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size, - Alignment, TBAAInfo, Ranges); + Alignment, AAInfo, Ranges); +} + +CmpInst::Predicate FastISel::optimizeCmpPredicate(const CmpInst *CI) const { + // If both operands are the same, then try to optimize or fold the cmp. 
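// Illustrative aside (editor's sketch, not part of this patch): with both
// operands identical, the compare either folds to a constant or reduces to a
// NaN test, for example:
//   fcmp oeq %x, %x  ->  fcmp ord %x, %x   (true iff %x is not NaN)
//   fcmp une %x, %x  ->  fcmp uno %x, %x   (true iff %x is NaN)
//   icmp sgt %x, %x  ->  always false      (FCMP_FALSE below)
//   icmp sle %x, %x  ->  always true       (FCMP_TRUE below)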
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (CI->getOperand(0) != CI->getOperand(1))
+ return Predicate;
+
+ switch (Predicate) {
+ default: llvm_unreachable("Invalid predicate!");
+ case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break;
+
+ case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break;
+ }
+
+ return Predicate;
}
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index ae124e852a91..19aca6edd451 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -36,6 +36,7 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
@@ -55,58 +56,71 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
return false;
}
+static ISD::NodeType getPreferredExtendForValue(const Value *V) {
+ // For the users of the source value fed to a compare instruction: if the
+ // number of signed predicates is greater than the number of unsigned
+ // predicates, we prefer to use SIGN_EXTEND.
+ //
+ // With this optimization, we can remove some redundant sign or zero
+ // extension instructions, and eventually expose more machine CSE
+ // opportunities.
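// Illustrative aside (editor's sketch, not part of this patch): given
//   %c1 = icmp sgt i32 %x, %a
//   %c2 = icmp slt i32 %x, %b
// the value %x sees NumOfSigned == 2 and NumOfUnsigned == 0, so a later
// promotion of %x picks SIGN_EXTEND, and both compares can share a single
// sign-extended copy instead of each inserting its own extension.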
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + unsigned NumOfSigned = 0, NumOfUnsigned = 0; + for (const User *U : V->users()) { + if (const auto *CI = dyn_cast<CmpInst>(U)) { + NumOfSigned += CI->isSigned(); + NumOfUnsigned += CI->isUnsigned(); + } + } + if (NumOfSigned > NumOfUnsigned) + ExtendKind = ISD::SIGN_EXTEND; + + return ExtendKind; +} + void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, SelectionDAG *DAG) { - const TargetLowering *TLI = TM.getTargetLowering(); - Fn = &fn; MF = &mf; + TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI); CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF, - Fn->isVarArg(), - Outs, Fn->getContext()); + Fn->isVarArg(), Outs, Fn->getContext()); // Initialize the mapping of values to registers. This is only set up for // instruction values that are used outside of the block that defines // them. Function::const_iterator BB = Fn->begin(), EB = Fn->end(); - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - // Don't fold inalloca allocas or other dynamic allocas into the initial - // stack frame allocation, even if they are in the entry block. - if (!AI->isStaticAlloca()) - continue; - - if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) { - Type *Ty = AI->getAllocatedType(); - uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty); - unsigned Align = - std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), - AI->getAlignment()); - - TySize *= CUI->getZExtValue(); // Get total allocated size. - if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects. - - StaticAllocaMap[AI] = - MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI); - } - } - for (; BB != EB; ++BB) for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - // Look for dynamic allocas. if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - if (!AI->isStaticAlloca()) { + // Static allocas can be folded into the initial stack frame adjustment. + if (AI->isStaticAlloca()) { + const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize()); + Type *Ty = AI->getAllocatedType(); + uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty); + unsigned Align = + std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), + AI->getAlignment()); + + TySize *= CUI->getZExtValue(); // Get total allocated size. + if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects. + + StaticAllocaMap[AI] = + MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI); + + } else { unsigned Align = std::max( (unsigned)TLI->getDataLayout()->getPrefTypeAlignment( AI->getAllocatedType()), AI->getAlignment()); - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF->getSubtarget().getFrameLowering()->getStackAlignment(); if (Align <= StackAlign) Align = 0; // Inform the Frame Information that we have variable-sized objects. @@ -126,9 +140,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (Op.Type == InlineAsm::isClobber) { // Clobbers don't have SDValue operands, hence SDValue(). 
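// Illustrative aside (editor's sketch, not part of this patch; the x86-style
// constraint string below is only an example): a clobber like
//   call void asm sideeffect "", "~{esp}"()
// resolves through getRegForInlineAsmConstraint to the stack pointer
// register, and the setHasInlineAsmWithSPAdjust(true) call below warns frame
// lowering that inline asm may adjust SP behind its back.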
TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
- std::pair<unsigned, const TargetRegisterClass*> PhysReg =
- TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
- Op.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+ TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
+ Op.ConstraintVT);
if (PhysReg.first == SP)
MF->getFrameInfo()->setHasInlineAsmWithSPAdjust(true);
}
@@ -136,6 +150,21 @@
}
}
+ // Look for calls to the @llvm.va_start intrinsic. We can omit some
+ // prologue boilerplate for variadic functions that don't examine their
+ // arguments.
+ if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ MF->getFrameInfo()->setHasVAStart(true);
+ }
+
+ // If we have a musttail call in a variadic function, we need to ensure we
+ // forward implicit register parameters.
+ if (const auto *CI = dyn_cast<CallInst>(I)) {
+ if (CI->isMustTailCall() && Fn->isVarArg())
+ MF->getFrameInfo()->setHasMustTailInVarArgFunc(true);
+ }
+
// Mark values used outside their block as exported, by allocating
// a virtual register for them.
if (isUsedOutsideOfDefiningBlock(I))
@@ -166,13 +195,16 @@ StaticAllocaMap.find(AI);
if (SI != StaticAllocaMap.end()) { // Check for VLAs.
int FI = SI->second;
- MMI.setVariableDbgInfo(DI->getVariable(),
+ MMI.setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
FI, DI->getDebugLoc());
}
}
}
}
}
+
+ // Decide the preferred extend type for a value.
+ PreferredExtendType[I] = getPreferredExtendForValue(I);
}
// Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This
@@ -208,7 +240,7 @@ for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
EVT VT = ValueVTs[vti];
unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
- const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
for (unsigned i = 0; i != NumRegisters; ++i)
BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
PHIReg += NumRegisters;
@@ -241,12 +273,14 @@ void FunctionLoweringInfo::clear() {
ArgDbgValues.clear();
ByValArgFrameIndexMap.clear();
RegFixups.clear();
+ StatepointStackSlots.clear();
+ PreferredExtendType.clear();
}
/// CreateReg - Allocate a single virtual register for the given type.
unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
- return RegInfo->
- createVirtualRegister(TM.getTargetLowering()->getRegClassFor(VT));
+ return RegInfo->createVirtualRegister(
+ MF->getSubtarget().getTargetLowering()->getRegClassFor(VT));
}
/// CreateRegs - Allocate the appropriate number of virtual registers of
@@ -257,7 +291,7 @@ unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
/// will assign registers for each member or element.
/// unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) { - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(*TLI, Ty, ValueVTs); @@ -306,8 +340,6 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { if (!Ty->isIntegerTy() || Ty->isVectorTy()) return; - const TargetLowering *TLI = TM.getTargetLowering(); - SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(*TLI, Ty, ValueVTs); assert(ValueVTs.size() == 1 && @@ -452,7 +484,7 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, // Gather all the type infos for this landing pad and pass them along to // MachineModuleInfo. - std::vector<const GlobalVariable *> TyInfo; + std::vector<const GlobalValue *> TyInfo; unsigned N = I.getNumArgOperands(); for (unsigned i = N - 1; i > 1; --i) { @@ -510,14 +542,14 @@ void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI, Value *Val = I.getClause(i - 1); if (I.isCatch(i - 1)) { MMI.addCatchTypeInfo(MBB, - dyn_cast<GlobalVariable>(Val->stripPointerCasts())); + dyn_cast<GlobalValue>(Val->stripPointerCasts())); } else { // Add filters in a list. Constant *CVal = cast<Constant>(Val); - SmallVector<const GlobalVariable*, 4> FilterList; + SmallVector<const GlobalValue*, 4> FilterList; for (User::op_iterator II = CVal->op_begin(), IE = CVal->op_end(); II != IE; ++II) - FilterList.push_back(cast<GlobalVariable>((*II)->stripPointerCasts())); + FilterList.push_back(cast<GlobalValue>((*II)->stripPointerCasts())); MMI.addFilterTypeInfo(MBB, FilterList); } diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 7c124b8caa91..a65f33e17774 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -27,7 +27,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "instr-emitter" @@ -265,12 +265,16 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MIB.addReg(VRBase, RegState::Define); } - SDValue Op(Node, i); - if (IsClone) - VRBaseMap.erase(Op); - bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second; - (void)isNew; // Silence compiler warning. - assert(isNew && "Node emitted out of order - early"); + // If this def corresponds to a result of the SDNode insert the VRBase into + // the lookup map. + if (i < NumResults) { + SDValue Op(Node, i); + if (IsClone) + VRBaseMap.erase(Op); + bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second; + (void)isNew; // Silence compiler warning. + assert(isNew && "Node emitted out of order - early"); + } } } @@ -402,10 +406,10 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, Type *Type = CP->getType(); // MachineConstantPool wants an explicit alignment. if (Align == 0) { - Align = TM->getDataLayout()->getPrefTypeAlignment(Type); + Align = MF->getSubtarget().getDataLayout()->getPrefTypeAlignment(Type); if (Align == 0) { // Alignment of vector types. FIXME! 
- Align = TM->getDataLayout()->getTypeAllocSize(Type); + Align = MF->getSubtarget().getDataLayout()->getTypeAllocSize(Type); } } @@ -643,14 +647,18 @@ MachineInstr * InstrEmitter::EmitDbgValue(SDDbgValue *SD, DenseMap<SDValue, unsigned> &VRBaseMap) { uint64_t Offset = SD->getOffset(); - MDNode* MDPtr = SD->getMDPtr(); + MDNode *Var = SD->getVariable(); + MDNode *Expr = SD->getExpression(); DebugLoc DL = SD->getDebugLoc(); if (SD->getKind() == SDDbgValue::FRAMEIX) { // Stack address; this needs to be lowered in target-dependent fashion. // EmitTargetCodeForFrameDebugValue is responsible for allocation. return BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE)) - .addFrameIndex(SD->getFrameIx()).addImm(Offset).addMetadata(MDPtr); + .addFrameIndex(SD->getFrameIx()) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); } // Otherwise, we're going to create an instruction here. const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE); @@ -696,7 +704,8 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, MIB.addReg(0U, RegState::Debug); } - MIB.addMetadata(MDPtr); + MIB.addMetadata(Var); + MIB.addMetadata(Expr); return &*MIB; } @@ -859,9 +868,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, MIB->setPhysRegsDeadExcept(UsedRegs, *TRI); // Run post-isel target hook to adjust this instruction if needed. -#ifdef NDEBUG if (II.hasPostISelHook()) -#endif TLI->AdjustInstrPostInstrSelection(MIB, Node); } @@ -1013,11 +1020,8 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, /// at the given position in the given block. InstrEmitter::InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos) - : MF(mbb->getParent()), - MRI(&MF->getRegInfo()), - TM(&MF->getTarget()), - TII(TM->getInstrInfo()), - TRI(TM->getRegisterInfo()), - TLI(TM->getTargetLowering()), - MBB(mbb), InsertPos(insertpos) { -} + : MF(mbb->getParent()), MRI(&MF->getRegInfo()), + TII(MF->getSubtarget().getInstrInfo()), + TRI(MF->getSubtarget().getRegisterInfo()), + TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb), + InsertPos(insertpos) {} diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index 920dda8820f6..7b86f7dd8de0 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef INSTREMITTER_H -#define INSTREMITTER_H +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -29,7 +29,6 @@ class SDDbgValue; class InstrEmitter { MachineFunction *MF; MachineRegisterInfo *MRI; - const TargetMachine *TM; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const TargetLowering *TLI; diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 16c5b4ba7768..e5473e35caed 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -33,10 +34,13 @@ #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace 
llvm; +#define DEBUG_TYPE "legalizedag" + //===----------------------------------------------------------------------===// -/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and +/// This takes an arbitrary SelectionDAG as input and /// hacks on it until the target machine can handle it. This involves /// eliminating value sizes the machine cannot handle (promoting small sizes to /// large sizes or splitting up large values into small values) as well as @@ -48,16 +52,17 @@ using namespace llvm; /// will attempt merge setcc and brc instructions into brcc's. /// namespace { -class SelectionDAGLegalize : public SelectionDAG::DAGUpdateListener { +class SelectionDAGLegalize { const TargetMachine &TM; const TargetLowering &TLI; SelectionDAG &DAG; - /// LegalizePosition - The iterator for walking through the node list. - SelectionDAG::allnodes_iterator LegalizePosition; + /// \brief The set of nodes which have already been legalized. We hold a + /// reference to it in order to update as necessary on node deletion. + SmallPtrSetImpl<SDNode *> &LegalizedNodes; - /// LegalizedNodes - The set of nodes which have already been legalized. - SmallPtrSet<SDNode *, 16> LegalizedNodes; + /// \brief A set of all the nodes updated during legalization. + SmallSetVector<SDNode *, 16> *UpdatedNodes; EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(*DAG.getContext(), VT); @@ -66,20 +71,22 @@ class SelectionDAGLegalize : public SelectionDAG::DAGUpdateListener { // Libcall insertion helpers. public: - explicit SelectionDAGLegalize(SelectionDAG &DAG); - - void LegalizeDAG(); + SelectionDAGLegalize(SelectionDAG &DAG, + SmallPtrSetImpl<SDNode *> &LegalizedNodes, + SmallSetVector<SDNode *, 16> *UpdatedNodes = nullptr) + : TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG), + LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {} -private: - /// LegalizeOp - Legalizes the given operation. + /// \brief Legalizes the given operation. void LegalizeOp(SDNode *Node); +private: SDValue OptimizeFloatStore(StoreSDNode *ST); void LegalizeLoadOps(SDNode *Node); void LegalizeStoreOps(SDNode *Node); - /// PerformInsertVectorEltInMemory - Some target cannot handle a variable + /// Some targets cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform /// the insert there, and then read the result back. @@ -88,7 +95,7 @@ private: SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, SDLoc dl); - /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which + /// Return a vector shuffle operation which /// performs the same shuffe in terms of order or result bytes, but on a type /// whose vector element type is narrower than the original shuffle type. /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> @@ -145,43 +152,55 @@ private: void ExpandNode(SDNode *Node); void PromoteNode(SDNode *Node); - void ForgetNode(SDNode *N) { - LegalizedNodes.erase(N); - if (LegalizePosition == SelectionDAG::allnodes_iterator(N)) - ++LegalizePosition; - } - public: - // DAGUpdateListener implementation. 
- void NodeDeleted(SDNode *N, SDNode *E) override { - ForgetNode(N); - } - void NodeUpdated(SDNode *N) override {} - // Node replacement helpers void ReplacedNode(SDNode *N) { - if (N->use_empty()) { - DAG.RemoveDeadNode(N); - } else { - ForgetNode(N); - } + LegalizedNodes.erase(N); + if (UpdatedNodes) + UpdatedNodes->insert(N); } void ReplaceNode(SDNode *Old, SDNode *New) { + DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); + dbgs() << " with: "; New->dump(&DAG)); + + assert(Old->getNumValues() == New->getNumValues() && + "Replacing one node with another that produces a different number " + "of values!"); DAG.ReplaceAllUsesWith(Old, New); + for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) + DAG.TransferDbgValues(SDValue(Old, i), SDValue(New, i)); + if (UpdatedNodes) + UpdatedNodes->insert(New); ReplacedNode(Old); } void ReplaceNode(SDValue Old, SDValue New) { + DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG); + dbgs() << " with: "; New->dump(&DAG)); + DAG.ReplaceAllUsesWith(Old, New); + DAG.TransferDbgValues(Old, New); + if (UpdatedNodes) + UpdatedNodes->insert(New.getNode()); ReplacedNode(Old.getNode()); } void ReplaceNode(SDNode *Old, const SDValue *New) { + DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG)); + DAG.ReplaceAllUsesWith(Old, New); + for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) { + DEBUG(dbgs() << (i == 0 ? " with: " + : " and: "); + New[i]->dump(&DAG)); + DAG.TransferDbgValues(SDValue(Old, i), New[i]); + if (UpdatedNodes) + UpdatedNodes->insert(New[i].getNode()); + } ReplacedNode(Old); } }; } -/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which +/// Return a vector shuffle operation which /// performs the same shuffe in terms of order or result bytes, but on a type /// whose vector element type is narrower than the original shuffle type. /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> @@ -213,41 +232,7 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl, return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]); } -SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag) - : SelectionDAG::DAGUpdateListener(dag), - TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()), - DAG(dag) { -} - -void SelectionDAGLegalize::LegalizeDAG() { - DAG.AssignTopologicalOrder(); - - // Visit all the nodes. We start in topological order, so that we see - // nodes with their original operands intact. Legalization can produce - // new nodes which may themselves need to be legalized. Iterate until all - // nodes have been legalized. - for (;;) { - bool AnyLegalized = false; - for (LegalizePosition = DAG.allnodes_end(); - LegalizePosition != DAG.allnodes_begin(); ) { - --LegalizePosition; - - SDNode *N = LegalizePosition; - if (LegalizedNodes.insert(N)) { - AnyLegalized = true; - LegalizeOp(N); - } - } - if (!AnyLegalized) - break; - - } - - // Remove dead nodes now. - DAG.RemoveDeadNodes(); -} - -/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or +/// Expands the ConstantFP node to an integer constant or /// a load from the constant pool. SDValue SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { @@ -275,7 +260,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) && // Only do this if the target has a native EXTLOAD instruction from // smaller type. 
- TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) && + TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) && TLI.ShouldShrinkFPConstant(OrigVT)) { Type *SType = SVT.getTypeForEVT(*DAG.getContext()); LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType)); @@ -291,7 +276,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { DAG.getExtLoad(ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - VT, false, false, Alignment); + VT, false, false, false, Alignment); return Result; } SDValue Result = @@ -301,7 +286,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { return Result; } -/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores. +/// Expands an unaligned store to 2 half-size stores. static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, const TargetLowering &TLI, SelectionDAGLegalize *DAGLegalize) { @@ -377,7 +362,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, // Load from the stack slot. SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr, MachinePointerInfo(), - MemVT, false, false, 0); + MemVT, false, false, false, 0); Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, ST->getPointerInfo() @@ -385,7 +370,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, MemVT, ST->isVolatile(), ST->isNonTemporal(), MinAlign(ST->getAlignment(), Offset), - ST->getTBAAInfo())); + ST->getAAInfo())); // The order of the stores doesn't matter - say it with a TokenFactor. SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); DAGLegalize->ReplaceNode(SDValue(ST, 0), Result); @@ -417,14 +402,14 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, ST->isVolatile(), ST->isNonTemporal(), - Alignment, ST->getTBAAInfo()); + Alignment, ST->getAAInfo()); SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); DAGLegalize->ReplaceNode(SDValue(ST, 0), Result); } -/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads. +/// Expands an unaligned load to 2 half-size loads. static void ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, const TargetLowering &TLI, @@ -476,7 +461,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), MinAlign(LD->getAlignment(), Offset), - LD->getTBAAInfo()); + LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr, MachinePointerInfo(), false, false, 0)); @@ -494,8 +479,9 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, LD->getPointerInfo().getWithOffset(Offset), MemVT, LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), MinAlign(LD->getAlignment(), Offset), - LD->getTBAAInfo()); + LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. // On big-endian machines this requires a truncating store to ensure // that the bits end up in the right place. @@ -508,7 +494,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, // Finally, perform the original load only redirected to the stack slot. 
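// Illustrative note (editor's aside, not part of this patch): this is the
// floating-point/vector arm of the expansion. The loop above has already
// copied the misaligned source into an aligned stack temporary using
// register-sized integer loads and stores, so the load of the original type
// below can be issued against that temporary at natural alignment.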
Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase, - MachinePointerInfo(), LoadedVT, false, false, 0); + MachinePointerInfo(), LoadedVT, false,false, false, + 0); // Callers expect a MERGE_VALUES node. ValResult = Load; @@ -538,25 +525,27 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, if (TLI.isLittleEndian()) { Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(), NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), Alignment, LD->getTBAAInfo()); + LD->isNonTemporal(), LD->isInvariant(), Alignment, + LD->getAAInfo()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize), - LD->getTBAAInfo()); + LD->isNonTemporal(),LD->isInvariant(), + MinAlign(Alignment, IncrementSize), LD->getAAInfo()); } else { Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(), NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), Alignment, LD->getTBAAInfo()); + LD->isNonTemporal(), LD->isInvariant(), Alignment, + LD->getAAInfo()); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize), - LD->getTBAAInfo()); + LD->isNonTemporal(), LD->isInvariant(), + MinAlign(Alignment, IncrementSize), LD->getAAInfo()); } // aggregate the two parts @@ -572,8 +561,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, ChainResult = TF; } -/// PerformInsertVectorEltInMemory - Some target cannot handle a variable -/// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it +/// Some target cannot handle a variable insertion index for the +/// INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform /// the insert there, and then read the result back. SDValue SelectionDAGLegalize:: @@ -659,7 +648,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); SDLoc dl(ST); if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { if (CFP->getValueType(0) == MVT::f32 && @@ -668,7 +657,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { bitcastToAPInt().zextOrTrunc(32), MVT::i32); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); } if (CFP->getValueType(0) == MVT::f64) { @@ -677,7 +666,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). 
zextOrTrunc(64), MVT::i64); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); } if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { @@ -690,13 +679,13 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (TLI.isBigEndian()) std::swap(Lo, Hi); Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile, - isNonTemporal, Alignment, TBAAInfo); + isNonTemporal, Alignment, AAInfo); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(4, Ptr.getValueType())); Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, MinAlign(Alignment, 4U), - TBAAInfo); + AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -714,7 +703,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); if (!ST->isTruncatingStore()) { if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { @@ -731,12 +720,12 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { // If this is an unaligned store and the target doesn't support it, // expand it. unsigned AS = ST->getAddressSpace(); - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) { + unsigned Align = ST->getAlignment(); + if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), - DAG, TLI, this); + if (Align < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); } break; } @@ -754,7 +743,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), isVolatile, - isNonTemporal, Alignment, TBAAInfo); + isNonTemporal, Alignment, AAInfo); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -776,8 +765,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Value = DAG.getZeroExtendInReg(Value, dl, StVT); SDValue Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment, - TBAAInfo); + NVT, isVolatile, isNonTemporal, Alignment, AAInfo); ReplaceNode(SDValue(Node, 0), Result); } else if (StWidth & (StWidth - 1)) { // If not storing a power-of-2 number of bits, expand as two stores. @@ -799,7 +787,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), RoundVT, isVolatile, isNonTemporal, Alignment, - TBAAInfo); + AAInfo); // Store the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; @@ -811,7 +799,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); } else { // Big endian - avoid unaligned stores. 
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X @@ -821,7 +809,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { TLI.getShiftAmountTy(Value.getValueType()))); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT, isVolatile, isNonTemporal, Alignment, - TBAAInfo); + AAInfo); // Store the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; @@ -830,7 +818,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); } // The order of the stores doesn't matter. @@ -842,12 +830,13 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { unsigned AS = ST->getAddressSpace(); + unsigned Align = ST->getAlignment(); // If this is an unaligned store and the target doesn't support it, // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) { + if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) + if (Align < ABIAlignment) ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); } break; @@ -868,7 +857,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -893,13 +882,14 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { unsigned AS = LD->getAddressSpace(); + unsigned Align = LD->getAlignment(); // If this is an unaligned load and the target doesn't support it, // expand it. 
- if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT(), AS)) { + if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) { Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ + if (Align < ABIAlignment){ ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain); } } @@ -928,6 +918,10 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { assert(RVal.getNode() != Node && "Load must be completely replaced"); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain); + if (UpdatedNodes) { + UpdatedNodes->insert(RVal.getNode()); + UpdatedNodes->insert(RChain.getNode()); + } ReplacedNode(Node); } return; @@ -938,7 +932,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { unsigned Alignment = LD->getAlignment(); bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); - const MDNode *TBAAInfo = LD->getTBAAInfo(); + bool isInvariant = LD->isInvariant(); + AAMDNodes AAInfo = LD->getAAInfo(); if (SrcWidth != SrcVT.getStoreSizeInBits() && // Some targets pretend to have an i1 loading operation, and actually @@ -949,7 +944,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // nice to have an effective generic way of getting these benefits... // Until such a way is found, don't insist on promoting i1 here. (SrcVT != MVT::i1 || - TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { + TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) == + TargetLowering::Promote)) { // Promote to a byte-sized load if not loading an integral number of // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. unsigned NewWidth = SrcVT.getStoreSizeInBits(); @@ -965,7 +961,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { SDValue Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment, TBAAInfo); + NVT, isVolatile, isNonTemporal, isInvariant, Alignment, + AAInfo); Ch = Result.getValue(1); // The chain. @@ -1002,7 +999,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment, TBAAInfo); + isNonTemporal, isInvariant, Alignment, AAInfo); // Load the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; @@ -1010,8 +1007,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + ExtraVT, isVolatile, isNonTemporal, isInvariant, + MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of // the other one. @@ -1031,7 +1028,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // Load the top RoundWidth bits. Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment, TBAAInfo); + isNonTemporal, isInvariant, Alignment, AAInfo); // Load the remaining ExtraWidth bits. 
IncrementSize = RoundWidth / 8; @@ -1040,8 +1037,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + ExtraVT, isVolatile, isNonTemporal, isInvariant, + MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of // the other one. @@ -1060,7 +1057,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Ch; } else { bool isCustom = false; - switch (TLI.getLoadExtAction(ExtType, SrcVT.getSimpleVT())) { + switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0), + SrcVT.getSimpleVT())) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Custom: isCustom = true; @@ -1080,37 +1078,37 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // it, expand it. EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); - if (!TLI.allowsUnalignedMemoryAccesses(MemVT, AS)) { - Type *Ty = - LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = - TLI.getDataLayout()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Value, Chain); + unsigned Align = LD->getAlignment(); + if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) { + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty); + if (Align < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain); } } } break; } case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && - TLI.isTypeLegal(SrcVT)) { - SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr, - LD->getMemOperand()); - unsigned ExtendOp; - switch (ExtType) { - case ISD::EXTLOAD: - ExtendOp = (SrcVT.isFloatingPoint() ? - ISD::FP_EXTEND : ISD::ANY_EXTEND); + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) { + // If the source type is not legal, see if there is a legal extload to + // an intermediate type that we can then extend further. + EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); + if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT? + TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) { + // If we are loading a legal type, this is a non-extload followed by a + // full extend. + ISD::LoadExtType MidExtType = + (LoadVT == SrcVT) ? 
ISD::NON_EXTLOAD : ExtType; + + SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr, + SrcVT, LD->getMemOperand()); + unsigned ExtendOp = + ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType); + Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); + Chain = Load.getValue(1); break; - case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; - default: llvm_unreachable("Unexpected extend load type!"); } - Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); - Chain = Load.getValue(1); - break; } assert(!SrcVT.isVector() && @@ -1134,8 +1132,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Result.getValueType(), Result, DAG.getValueType(SrcVT)); else - ValRes = DAG.getZeroExtendInReg(Result, dl, - SrcVT.getScalarType()); + ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); Value = ValRes; Chain = Result.getValue(1); break; @@ -1148,13 +1145,18 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { assert(Value.getNode() != Node && "Load must be completely replaced"); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + if (UpdatedNodes) { + UpdatedNodes->insert(Value.getNode()); + UpdatedNodes->insert(Chain.getNode()); + } ReplacedNode(Node); } } -/// LegalizeOp - Return a legal replacement for the given operation, with -/// all legal operands. +/// Return a legal replacement for the given operation, with all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { + DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); + if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes. return; @@ -1335,10 +1337,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } if (NewNode != Node) { - DAG.ReplaceAllUsesWith(Node, NewNode); - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i)); - ReplacedNode(Node); + ReplaceNode(Node, NewNode); Node = NewNode; } switch (Action) { @@ -1349,19 +1348,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { // a complete mess. SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); if (Res.getNode()) { - SmallVector<SDValue, 8> ResultVals; - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) { - if (e == 1) - ResultVals.push_back(Res); - else - ResultVals.push_back(Res.getValue(i)); - } - if (Res.getNode() != Node || Res.getResNo() != 0) { - DAG.ReplaceAllUsesWith(Node, ResultVals.data()); - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]); - ReplacedNode(Node); + if (!(Res.getNode() != Node || Res.getResNo() != 0)) + return; + + if (Node->getNumValues() == 1) { + // We can just directly replace this node with the lowered value. 
+ ReplaceNode(SDValue(Node, 0), Res); + return; } + + SmallVector<SDValue, 8> ResultVals; + for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) + ResultVals.push_back(Res.getValue(i)); + ReplaceNode(Node, ResultVals.data()); return; } } @@ -1449,7 +1448,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), Vec.getValueType().getVectorElementType(), - false, false, 0); + false, false, false, 0); } SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { @@ -1484,7 +1483,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { StackPtr); // Store the subvector. - Ch = DAG.getStore(DAG.getEntryNode(), dl, Part, SubStackPtr, + Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo(), false, false, 0); // Finally, load the updated vector. @@ -1624,7 +1623,8 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + DAG.getSubtarget().getFrameLowering()->getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, @@ -1639,8 +1639,8 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, Results.push_back(Tmp2); } -/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and -/// condition code CC on the current target. +/// Legalize a SETCC with given LHS and RHS and condition code CC on the current +/// target. /// /// If the SETCC has been legalized using AND / OR, then the legalized node /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert @@ -1754,7 +1754,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, return false; } -/// EmitStackConvert - Emit a store/load combination to the stack. This stores +/// Emit a store/load combination to the stack. This stores /// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does /// a load from the stack slot to DestVT, extending it if needed. /// The resultant code need not be legal. @@ -1798,7 +1798,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, assert(SlotSize < DestSize && "Unknown extension!"); return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, - PtrInfo, SlotVT, false, false, DestAlign); + PtrInfo, SlotVT, false, false, false, DestAlign); } SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) { @@ -1878,7 +1878,8 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG, ShuffleVec.data()); else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT)) return false; - NewIntermedVals.push_back(std::make_pair(Shuffle, FinalIndices)); + NewIntermedVals.push_back( + std::make_pair(Shuffle, std::move(FinalIndices))); } // If we had an odd number of defined values, then append the last @@ -1913,7 +1914,7 @@ ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG, return true; } -/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't +/// Expand a BUILD_VECTOR node on targets that don't /// support the operation, but do support the resultant vector type. 
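// Illustrative aside (editor's sketch, not part of this patch): the fallbacks
// in this routine typically include turning an all-constant BUILD_VECTOR into
// a constant-pool load, turning a splat of one value into SCALAR_TO_VECTOR
// plus a shuffle, and otherwise building the vector through the stack
// (ExpandVectorBuildThroughStack), storing each element and reloading the
// whole vector.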
SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { unsigned NumElems = Node->getNumOperands(); @@ -2025,7 +2026,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { return ExpandVectorBuildThroughStack(Node); } -// ExpandLibCall - Expand a node into a call to a libcall. If the result value +// Expand a node into a call to a libcall. If the result value // does not fit into a register, return the lo part and set the hi part to the // by-reg argument. If it does fit into a single register, return the result // and leave the Hi part unset. @@ -2073,7 +2074,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, return CallInfo.first; } -/// ExpandLibCall - Generate a libcall taking the given operands as arguments +/// Generate a libcall taking the given operands as arguments /// and returning a result of type RetVT. SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops, unsigned NumOps, @@ -2104,7 +2105,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, return CallInfo.first; } -// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to +// Expand a node into a call to a libcall. Similar to // ExpandLibCall except that the first operand is the in-chain. std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, @@ -2174,7 +2175,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, return ExpandLibCall(LC, Node, isSigned); } -/// isDivRemLibcallAvailable - Return true if divmod libcall is available. +/// Return true if divmod libcall is available. static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, const TargetLowering &TLI) { RTLIB::Libcall LC; @@ -2190,8 +2191,7 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, return TLI.getLibcallName(LC) != nullptr; } -/// useDivRem - Only issue divrem libcall if both quotient and remainder are -/// needed. +/// Only issue divrem libcall if both quotient and remainder are needed. static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { // The other use might have been replaced with a divrem already. unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; @@ -2216,8 +2216,7 @@ static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { return false; } -/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem -/// pairs. +/// Issue libcalls to __{u}divmod to compute div / rem pairs. void SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results) { @@ -2279,7 +2278,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Results.push_back(Rem); } -/// isSinCosLibcallAvailable - Return true if sincos libcall is available. +/// Return true if sincos libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { @@ -2293,8 +2292,8 @@ static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { return TLI.getLibcallName(LC) != nullptr; } -/// canCombineSinCosLibcall - Return true if sincos libcall is available and -/// can be used to combine sin and cos. +/// Return true if sincos libcall is available and can be used to combine sin +/// and cos. 
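// Editor's sketch (illustrative, not part of this patch): the source-level
// effect of this combine, assuming a GNU-style libm that provides sincosf.
// When both results are live, two libcalls collapse into one:
extern "C" void sincosf(float, float *, float *);
static void polarToXY(float R, float Theta, float &X, float &Y) {
  float S, C;
  sincosf(Theta, &S, &C); // replaces separate sinf(Theta) / cosf(Theta) calls
  X = R * C;
  Y = R * S;
}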
static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI, const TargetMachine &TM) { if (!isSinCosLibcallAvailable(Node, TLI)) @@ -2307,8 +2306,7 @@ static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI, return true; } -/// useSinCos - Only issue sincos libcall if both sin and cos are -/// needed. +/// Only issue sincos libcall if both sin and cos are needed. static bool useSinCos(SDNode *Node) { unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN ? ISD::FCOS : ISD::FSIN; @@ -2326,8 +2324,7 @@ static bool useSinCos(SDNode *Node) { return false; } -/// ExpandSinCosLibCall - Issue libcalls to sincos to compute sin / cos -/// pairs. +/// Issue libcalls to sincos to compute sin / cos pairs. void SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results) { @@ -2392,7 +2389,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, MachinePointerInfo(), false, false, false, 0)); } -/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a +/// This function is responsible for legalizing a /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. @@ -2581,7 +2578,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, Alignment); + MVT::f32, false, false, false, Alignment); HandleSDNode Handle(Load); LegalizeOp(Load.getNode()); FudgeInReg = Handle.getValue(); @@ -2590,7 +2587,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); } -/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a +/// This function is responsible for legalizing a /// *INT_TO_FP operation of the specified operand when the target requests that /// we promote it. At this point, we know that the result and operand types are /// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP @@ -2632,7 +2629,7 @@ SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, dl, NewInTy, LegalOp)); } -/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a +/// This function is responsible for legalizing a /// FP_TO_*INT operation of the specified operand when the target requests that /// we promote it. At this point, we know that the result and operand types are /// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT @@ -2676,8 +2673,7 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation); } -/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation. -/// +/// Open code the operations for BSWAP of the specified operation. SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) { EVT VT = Op.getValueType(); EVT SHVT = TLI.getShiftAmountTy(VT); @@ -2723,8 +2719,7 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) { } } -/// ExpandBitCount - Expand the specified bitcount instruction into operations. -/// +/// Expand the specified bitcount instruction into operations. 
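// Editor's sketch (illustrative, not part of this patch): the scalar recipes
// mirrored by the expansions below, written for a 32-bit value. CTPOP is
// usually the classic SWAR reduction; CTLZ smears the leading set bit
// rightward and then popcounts the complement, matching the in-line comments
// further down.
static unsigned popcount32(unsigned V) { // assumes 32-bit unsigned
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit partial sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
  return (V * 0x01010101u) >> 24;                   // total lands in top byte
}
static unsigned ctlz32(unsigned X) {
  X |= X >> 1; X |= X >> 2; X |= X >> 4;            // smear highest set bit
  X |= X >> 8; X |= X >> 16;
  return popcount32(~X);                            // count the zero prefix
}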
SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, SDLoc dl) { switch (Opc) { @@ -2783,7 +2778,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // x = x | (x >>32); // for 64-bit input // return popcount(~x); // - // but see also: http://www.hackersdelight.org/HDcode/nlz.cc + // Ref: "Hacker's Delight" by Henry Warren EVT VT = Op.getValueType(); EVT ShVT = TLI.getShiftAmountTy(VT); unsigned len = VT.getSizeInBits(); @@ -2802,7 +2797,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // for now, we use: { return popcount(~x & (x - 1)); } // unless the target has ctlz but not ctpop, in which case we use: // { return 32 - nlz(~x & (x-1)); } - // see also http://www.hackersdelight.org/HDcode/ntz.cc + // Ref: "Hacker's Delight" by Henry Warren EVT VT = Op.getValueType(); SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT), @@ -3396,6 +3391,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; } + case ISD::FMINNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64, + RTLIB::FMIN_F80, RTLIB::FMIN_F128, + RTLIB::FMIN_PPCF128)); + break; + case ISD::FMAXNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, + RTLIB::FMAX_F80, RTLIB::FMAX_F128, + RTLIB::FMAX_PPCF128)); + break; case ISD::FSQRT: Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, @@ -3514,6 +3519,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128)); break; + case ISD::FADD: + Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, + RTLIB::ADD_F80, RTLIB::ADD_F128, + RTLIB::ADD_PPCF128)); + break; + case ISD::FMUL: + Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, + RTLIB::MUL_F80, RTLIB::MUL_F128, + RTLIB::MUL_PPCF128)); + break; case ISD::FP16_TO_FP: { if (Node->getValueType(0) == MVT::f32) { Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); @@ -3546,12 +3561,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { } case ISD::FSUB: { EVT VT = Node->getValueType(0); - assert(TLI.isOperationLegalOrCustom(ISD::FADD, VT) && - TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && - "Don't know how to expand this FP subtraction!"); - Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); - Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1); - Results.push_back(Tmp1); + if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && + TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { + Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); + Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1); + Results.push_back(Tmp1); + } else { + Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, + RTLIB::SUB_F80, RTLIB::SUB_F128, + RTLIB::SUB_PPCF128)); + } break; } case ISD::SUB: { @@ -3806,9 +3825,11 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret, DAG.getIntPtrConstant(1)); // Ret is a node with an illegal type. Because such things are not - // generally permitted during this phase of legalization, delete the - // node. The above EXTRACT_ELEMENT nodes should have been folded. - DAG.DeleteNode(Ret.getNode()); + // generally permitted during this phase of legalization, make sure the + // node has no more uses. The above EXTRACT_ELEMENT nodes should have been + // folded. 
+ assert(Ret->use_empty() && + "Unexpected uses of illegally typed value from expanded lib call."); } if (isSigned) { @@ -3869,7 +3890,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr, MachinePointerInfo::getJumpTable(), MemVT, - false, false, 0); + false, false, false, 0); Addr = LD; if (TM.getRelocationModel() == Reloc::PIC_) { // For PIC, the sequence is: @@ -4179,6 +4200,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + if (UpdatedNodes) { + UpdatedNodes->insert(Tmp2.getNode()); + UpdatedNodes->insert(Chain.getNode()); + } ReplacedNode(Node); break; } @@ -4255,6 +4280,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp1, Tmp2, Node->getOperand(2))); break; } + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: case ISD::FDIV: case ISD::FREM: case ISD::FPOW: { @@ -4282,10 +4310,57 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { ReplaceNode(Node, Results.data()); } -// SelectionDAG::Legalize - This is the entry point for the file. -// +/// This is the entry point for the file. void SelectionDAG::Legalize() { - /// run - This is the main entry point to this class. - /// - SelectionDAGLegalize(*this).LegalizeDAG(); + AssignTopologicalOrder(); + + SmallPtrSet<SDNode *, 16> LegalizedNodes; + SelectionDAGLegalize Legalizer(*this, LegalizedNodes); + + // Visit all the nodes. We start in topological order, so that we see + // nodes with their original operands intact. Legalization can produce + // new nodes which may themselves need to be legalized. Iterate until all + // nodes have been legalized. + for (;;) { + bool AnyLegalized = false; + for (auto NI = allnodes_end(); NI != allnodes_begin();) { + --NI; + + SDNode *N = NI; + if (N->use_empty() && N != getRoot().getNode()) { + ++NI; + DeleteNode(N); + continue; + } + + if (LegalizedNodes.insert(N).second) { + AnyLegalized = true; + Legalizer.LegalizeOp(N); + + if (N->use_empty() && N != getRoot().getNode()) { + ++NI; + DeleteNode(N); + } + } + } + if (!AnyLegalized) + break; + + } + + // Remove dead nodes now. + RemoveDeadNodes(); +} + +bool SelectionDAG::LegalizeOp(SDNode *N, + SmallSetVector<SDNode *, 16> &UpdatedNodes) { + SmallPtrSet<SDNode *, 16> LegalizedNodes; + SelectionDAGLegalize Legalizer(*this, LegalizedNodes, &UpdatedNodes); + + // Directly insert the node in question, and legalize it. This will recurse + // as needed through operands.
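The rewritten Legalize() above replaces the old single-shot LegalizeDAG() call with a fixpoint sweep: legalizing one node can create new, not-yet-legal nodes, so the walk repeats until a full pass changes nothing. A minimal sketch of that control structure, with a hypothetical Node type and a legalizeOne() stand-in rather than the SelectionDAG API:

    #include <set>
    #include <vector>

    struct Node {}; // stand-in for SDNode

    // Placeholder for Legalizer.LegalizeOp(N); the real call may create new
    // nodes, which is why the caller must re-sweep until nothing changes.
    void legalizeOne(Node *) {}

    void legalizeToFixpoint(std::vector<Node *> &Nodes) {
      std::set<Node *> Legalized;
      for (;;) {
        bool AnyLegalized = false;
        for (Node *N : Nodes) {
          if (Legalized.insert(N).second) { // first visit only
            AnyLegalized = true;
            legalizeOne(N);
          }
        }
        if (!AnyLegalized)
          break; // a whole pass made no progress: done
      }
    }

The single-node LegalizeOp() entry point, whose body continues below, uses the same machinery seeded with just the node of interest.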
+ LegalizedNodes.insert(N); + Legalizer.LegalizeOp(N); + + return LegalizedNodes.count(N); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 649dd7a349ff..4591e79316d8 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -68,6 +68,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; + case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; @@ -153,6 +155,32 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), + GetSoftenedFloat(N->getOperand(1)) }; + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::FMIN_F32, + RTLIB::FMIN_F64, + RTLIB::FMIN_F80, + RTLIB::FMIN_F128, + RTLIB::FMIN_PPCF128), + NVT, Ops, 2, false, SDLoc(N)).first; +} + +SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), + GetSoftenedFloat(N->getOperand(1)) }; + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::FMAX_F32, + RTLIB::FMAX_F64, + RTLIB::FMAX_F80, + RTLIB::FMAX_F128, + RTLIB::FMAX_PPCF128), + NVT, Ops, 2, false, SDLoc(N)).first; +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), @@ -377,10 +405,15 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's // entirely possible for both f16 and f32 to be legal, so use the fully // hard-float FP_EXTEND rather than FP16_TO_FP. - if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) + if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); + if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) + SoftenFloatResult(Op.getNode(), 0); + } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); + if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) + Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first; } @@ -543,7 +576,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), NVT, L->isVolatile(), L->isNonTemporal(), false, L->getAlignment(), - L->getTBAAInfo()); + L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); @@ -556,7 +589,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { L->getBasePtr(), L->getOffset(), L->getPointerInfo(), L->getMemoryVT(), L->isVolatile(), L->isNonTemporal(), false, L->getAlignment(), - L->getTBAAInfo()); + L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); @@ -851,6 +884,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break; case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break; + case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; + case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break; case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break; case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break; case ISD::FCOPYSIGN: ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break; @@ -914,6 +949,26 @@ void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo, ISD::SETEQ); } +void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), + RTLIB::FMIN_F32, RTLIB::FMIN_F64, + RTLIB::FMIN_F80, RTLIB::FMIN_F128, + RTLIB::FMIN_PPCF128), + N, false); + GetPairElements(Call, Lo, Hi); +} + +void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), + RTLIB::FMAX_F32, RTLIB::FMAX_F64, + RTLIB::FMAX_F80, RTLIB::FMAX_F128, + RTLIB::FMAX_PPCF128), + N, false); + GetPairElements(Call, Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 44d9e3875b83..82b114b80aa9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -66,6 +66,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break; + case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; @@ -342,9 +343,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { EVT NVT = Op.getValueType(); Op = DAG.getNode(N->getOpcode(), dl, NVT, Op); // Subtract off the extra leading bits in the bigger type. - return DAG.getNode(ISD::SUB, dl, NVT, Op, - DAG.getConstant(NVT.getSizeInBits() - - OVT.getSizeInBits(), NVT)); + return DAG.getNode( + ISD::SUB, dl, NVT, Op, + DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), + NVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { @@ -362,8 +364,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off // the top of the original type. 
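Concretely, for a 16-bit CTTZ promoted to 32 bits, OR-ing in bit 16 (the bit just off the top of the original type) guarantees the wide value is never zero and caps the count at the original width. A scalar model using the GCC/Clang builtin, not the DAG nodes the hunk below builds:

    #include <cstdint>

    // CTTZ of an i16 value computed in 32 bits: an all-zero input now
    // yields 16, the correct 16-bit CTTZ result, instead of 32.
    unsigned cttz16(uint16_t x) {
      uint32_t Wide = static_cast<uint32_t>(x) | (1u << 16);
      return static_cast<unsigned>(__builtin_ctz(Wide));
    }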
- APInt TopBit(NVT.getSizeInBits(), 0); - TopBit.setBit(OVT.getSizeInBits()); + auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(), + OVT.getScalarSizeInBits()); Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, NVT)); } return DAG.getNode(N->getOpcode(), dl, NVT, Op); @@ -453,6 +455,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); + SDValue ExtMask = PromoteTargetBoolean(N->getMask(), NVT); + SDLoc dl(N); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOLoad, NVT.getStoreSize(), + N->getAlignment(), N->getAAInfo(), N->getRanges()); + + SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), + ExtMask, ExtSrc0, MMO); + // Legalized the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. @@ -824,6 +844,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N), OpNo); break; + case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N), + OpNo); break; + case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N), + OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; @@ -861,7 +885,26 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, switch (CCCode) { default: llvm_unreachable("Unknown integer comparison!"); case ISD::SETEQ: - case ISD::SETNE: + case ISD::SETNE: { + SDValue OpL = GetPromotedInteger(NewLHS); + SDValue OpR = GetPromotedInteger(NewRHS); + + // We would prefer to promote the comparison operands with sign extension. + // If we find that an operand is actually the result of truncating an + // AssertSext, we can avoid inserting a real truncate instruction, which + // would eventually be redundant. + if (OpL->getOpcode() == ISD::AssertSext && + cast<VTSDNode>(OpL->getOperand(1))->getVT() == NewLHS.getValueType() && + OpR->getOpcode() == ISD::AssertSext && + cast<VTSDNode>(OpR->getOperand(1))->getVT() == NewRHS.getValueType()) { + NewLHS = OpL; + NewRHS = OpR; + } else { + NewLHS = ZExtPromotedInteger(NewLHS); + NewRHS = ZExtPromotedInteger(NewRHS); + } + break; + } case ISD::SETUGE: case ISD::SETUGT: case ISD::SETULE: @@ -945,7 +988,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { EVT VecVT = N->getValueType(0); unsigned NumElts = VecVT.getVectorNumElements(); assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) && - "Legal vector of one illegal element?"); + "Legal vector of one illegal element?"); // Promote the inserted value. The type does not need to match the // vector element type.
Check that any extra bits introduced will be @@ -1071,6 +1114,63 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } +SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ + + assert(OpNo == 2 && "Only know how to promote the mask!"); + SDValue DataOp = N->getData(); + EVT DataVT = DataOp.getValueType(); + SDValue Mask = N->getMask(); + EVT MaskVT = Mask.getValueType(); + SDLoc dl(N); + + if (!TLI.isTypeLegal(DataVT)) { + if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) { + DataOp = GetPromotedInteger(DataOp); + Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); + } + else { + assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector && + "Unexpected data legalization in MSTORE"); + DataOp = GetWidenedVector(DataOp); + + if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) + Mask = GetWidenedVector(Mask); + else { + EVT BoolVT = getSetCCResultType(DataOp.getValueType()); + + // We can't use ModifyToType() because we should fill the mask with + // zeroes + unsigned WidenNumElts = BoolVT.getVectorNumElements(); + unsigned MaskNumElts = MaskVT.getVectorNumElements(); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, MaskVT); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + } + } + } + else + Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType()); + SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end()); + NewOps[2] = Mask; + NewOps[3] = DataOp; + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ + assert(OpNo == 2 && "Only know how to promote the mask!"); + EVT DataVT = N->getValueType(0); + SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end()); + NewOps[OpNo] = Mask; + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); @@ -1859,7 +1959,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, bool isVolatile = N->isVolatile(); bool isNonTemporal = N->isNonTemporal(); bool isInvariant = N->isInvariant(); - const MDNode *TBAAInfo = N->getTBAAInfo(); + AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); assert(NVT.isByteSized() && "Expanded type not byte sized!"); @@ -1868,7 +1968,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, EVT MemVT = N->getMemoryVT(); Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), - MemVT, isVolatile, isNonTemporal, Alignment, TBAAInfo); + MemVT, isVolatile, isNonTemporal, isInvariant, + Alignment, AAInfo); // Remember the chain. Ch = Lo.getValue(1); @@ -1891,7 +1992,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, // Little-endian - low bits are at low addresses. 
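As a scalar picture of the split performed by the Lo/Hi loads that follow: on a little-endian target the low half comes from the original address and the high half from IncrementSize bytes further on. A host-side sketch, which assumes a little-endian host and is not the DAG code itself:

    #include <cstdint>
    #include <cstring>

    // Reassemble a 64-bit value from two 32-bit halves, the way
    // ExpandIntRes_LOAD splits one wide load into two narrow ones.
    uint64_t load64AsTwoHalves(const unsigned char *Ptr) {
      uint32_t Lo, Hi;
      std::memcpy(&Lo, Ptr, sizeof(Lo));              // low bits, low address
      std::memcpy(&Hi, Ptr + sizeof(Lo), sizeof(Hi)); // Ptr + IncrementSize
      return (static_cast<uint64_t>(Hi) << 32) | Lo;
    }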
Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), isVolatile, isNonTemporal, isInvariant, Alignment, - TBAAInfo); + AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); @@ -1903,8 +2004,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, - isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + isVolatile, isNonTemporal, isInvariant, + MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -1922,7 +2023,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, isInvariant, Alignment, + AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, @@ -1931,8 +2033,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), - isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + isVolatile, isNonTemporal, isInvariant, + MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -2709,7 +2811,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { unsigned Alignment = N->getAlignment(); bool isVolatile = N->isVolatile(); bool isNonTemporal = N->isNonTemporal(); - const MDNode *TBAAInfo = N->getTBAAInfo(); + AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); SDValue Lo, Hi; @@ -2719,7 +2821,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { GetExpandedInteger(N->getValue(), Lo, Hi); return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), N->getMemoryVT(), isVolatile, isNonTemporal, - Alignment, TBAAInfo); + Alignment, AAInfo); } if (TLI.isLittleEndian()) { @@ -2727,7 +2829,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { GetExpandedInteger(N->getValue(), Lo, Hi); Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); @@ -2740,7 +2842,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -2768,7 +2870,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { // Store both the high bits and maybe some of the low bits. Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), - HiVT, isVolatile, isNonTemporal, Alignment, TBAAInfo); + HiVT, isVolatile, isNonTemporal, Alignment, AAInfo); // Increment the pointer to the other half. 
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, @@ -2778,7 +2880,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -2853,7 +2955,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { FudgePtr, MachinePointerInfo::getConstantPool(), MVT::f32, - false, false, Alignment); + false, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index bd7dacf2bc69..ebf6b28259ea 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -921,6 +921,17 @@ bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) { // The target didn't want to custom lower it after all. return false; + // When called from DAGTypeLegalizer::ExpandIntegerResult, we might need to + // provide the same kind of custom splitting behavior. + if (Results.size() == N->getNumValues() + 1 && LegalizeResult) { + // We've legalized a return type by splitting it. If there is a chain, + // replace that too. + SetExpandedInteger(SDValue(N, 0), Results[0], Results[1]); + if (N->getNumValues() > 1) + ReplaceValueWith(SDValue(N, 1), Results[2]); + return true; + } + // Make everything that once used N's values now use those in Results instead. assert(Results.size() == N->getNumValues() && "Custom lowering returned the wrong number of results!"); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 117ff31e2e8b..1cd9f407bca1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SELECTIONDAG_LEGALIZETYPES_H -#define SELECTIONDAG_LEGALIZETYPES_H +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -122,8 +122,8 @@ public: explicit DAGTypeLegalizer(SelectionDAG &dag) : TLI(dag.getTargetLoweringInfo()), DAG(dag), ValueTypeActions(TLI.getValueTypeActions()) { - assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE && - "Too many value types for ValueTypeActions to hold!"); + static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE, + "Too many value types for ValueTypeActions to hold!"); } /// run - This is the main entry point for the type legalizer. 
This does a @@ -240,6 +240,7 @@ private: SDValue PromoteIntRes_FP_TO_FP16(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); + SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_SDIV(SDNode *N); @@ -285,6 +286,8 @@ private: SDValue PromoteIntOp_TRUNCATE(SDNode *N); SDValue PromoteIntOp_UINT_TO_FP(SDNode *N); SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); + SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -387,6 +390,8 @@ private: SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SoftenFloatRes_FABS(SDNode *N); + SDValue SoftenFloatRes_FMINNUM(SDNode *N); + SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); @@ -450,6 +455,8 @@ private: void ExpandFloatResult(SDNode *N, unsigned ResNo); void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -574,6 +581,7 @@ private: void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -590,6 +598,7 @@ private: SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); + SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_TRUNCATE(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); @@ -623,6 +632,7 @@ private: SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); + SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 7e2f7b6ffb55..38829b6ac076 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -256,13 +256,13 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); - const MDNode *TBAAInfo = LD->getTBAAInfo(); + AAMDNodes AAInfo = LD->getAAInfo(); assert(NVT.isByteSized() && "Expanded type not byte sized!"); Lo = DAG.getLoad(NVT, 
dl, Chain, Ptr, LD->getPointerInfo(), isVolatile, isNonTemporal, isInvariant, Alignment, - TBAAInfo); + AAInfo); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits() / 8; @@ -271,7 +271,7 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, Hi = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), isVolatile, isNonTemporal, isInvariant, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -470,7 +470,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { unsigned Alignment = St->getAlignment(); bool isVolatile = St->isVolatile(); bool isNonTemporal = St->isNonTemporal(); - const MDNode *TBAAInfo = St->getTBAAInfo(); + AAMDNodes AAInfo = St->getAAInfo(); assert(NVT.isByteSized() && "Expanded type not byte sized!"); unsigned IncrementSize = NVT.getSizeInBits() / 8; @@ -482,14 +482,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { std::swap(Lo, Hi); Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, Ptr.getValueType())); Hi = DAG.getStore(Chain, dl, Hi, Ptr, St->getPointerInfo().getWithOffset(IncrementSize), isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize), TBAAInfo); + MinAlign(Alignment, IncrementSize), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 507e7ffb1d45..eac404c50365 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -199,12 +199,30 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { if (Op.getOpcode() == ISD::LOAD) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); ISD::LoadExtType ExtType = LD->getExtensionType(); - if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) { - if (TLI.isLoadExtLegal(LD->getExtensionType(), LD->getMemoryVT())) + if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) + switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0), + LD->getMemoryVT())) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: return TranslateLegalizeResults(Op, Result); - Changed = true; - return LegalizeOp(ExpandLoad(Op)); - } + case TargetLowering::Custom: + if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) { + Changed = true; + if (Lowered->getNumValues() != Op->getNumValues()) { + // This expanded to something other than the load. Assume the + // lowering code took care of any chain values, and just handle the + // returned value. 
+ assert(Result.getValue(1).use_empty() && + "There are still live users of the old chain!"); + return LegalizeOp(Lowered); + } else { + return TranslateLegalizeResults(Op, Lowered); + } + } + case TargetLowering::Expand: + Changed = true; + return LegalizeOp(ExpandLoad(Op)); + } } else if (Op.getOpcode() == ISD::STORE) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT StVT = ST->getMemoryVT(); @@ -273,6 +291,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FP_TO_UINT: case ISD::FNEG: case ISD::FABS: + case ISD::FMINNUM: + case ISD::FMAXNUM: case ISD::FCOPYSIGN: case ISD::FSQRT: case ISD::FSIN: @@ -353,9 +373,11 @@ SDValue VectorLegalizer::Promote(SDValue Op) { return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT); } - // The rest of the time, vector "promotion" is basically just bitcasting and - // doing the operation in a different type. For example, x86 promotes - // ISD::AND on v2i32 to v1i64. + // There are currently two cases of vector promotion: + // 1) Bitcasting a vector of integers to a vector of a different integer + // type of the same overall length. For example, x86 promotes ISD::AND on v2i32 to v1i64. + // 2) Extending a vector of floats to a vector of the same number of larger + // floats. For example, AArch64 promotes ISD::FADD on v4f16 to v4f32. MVT VT = Op.getSimpleValueType(); assert(Op.getNode()->getNumValues() == 1 && "Can't promote a vector with multiple results!"); @@ -365,14 +387,23 @@ SDValue VectorLegalizer::Promote(SDValue Op) { for (unsigned j = 0; j != Op.getNumOperands(); ++j) { if (Op.getOperand(j).getValueType().isVector()) - Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j)); + if (Op.getOperand(j) + .getValueType() + .getVectorElementType() + .isFloatingPoint()) + Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j)); + else + Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j)); else Operands[j] = Op.getOperand(j); } Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands); - - return DAG.getNode(ISD::BITCAST, dl, VT, Op); + if (VT.isFloatingPoint() || + (VT.isVector() && VT.getVectorElementType().isFloatingPoint())) + return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0)); + else + return DAG.getNode(ISD::BITCAST, dl, VT, Op); } SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) { @@ -480,7 +511,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { LD->getPointerInfo().getWithOffset(Offset), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment(), - LD->getTBAAInfo()); + LD->getAAInfo()); } else { EVT LoadVT = WideVT; while (RemainingBytes < LoadBytes) { ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR, LD->getPointerInfo().getWithOffset(Offset), LoadVT, LD->isVolatile(), - LD->isNonTemporal(), LD->getAlignment(), - LD->getTBAAInfo()); + LD->isNonTemporal(), LD->isInvariant(), + LD->getAlignment(), LD->getAAInfo()); } RemainingBytes -= LoadBytes; @@ -561,8 +592,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { Op.getNode()->getValueType(0).getScalarType(), Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride), SrcVT.getScalarType(), - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment(), LD->getTBAAInfo()); + LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), + LD->getAlignment(), LD->getAAInfo()); BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR, DAG.getConstant(Stride, BasePTR.getValueType())); @@ -593,7 +624,7
@@ SDValue VectorLegalizer::ExpandStore(SDValue Op) { unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); unsigned NumElem = StVT.getVectorNumElements(); // The type of the data we want to save @@ -621,7 +652,7 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) { // This scalar TruncStore may be illegal, but we legalize it later. SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR, ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT, - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR, DAG.getConstant(Stride, BasePTR.getValueType())); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index f77c592fddb7..96b69eec3354 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -69,8 +69,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::ANY_EXTEND: case ISD::BSWAP: case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: case ISD::CTPOP: case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: case ISD::FABS: case ISD::FCEIL: case ISD::FCOS: @@ -104,6 +106,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FCOPYSIGN: case ISD::FDIV: case ISD::FMUL: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FPOW: case ISD::FREM: case ISD::FSUB: @@ -221,7 +226,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { N->getMemoryVT().getVectorElementType(), N->isVolatile(), N->isNonTemporal(), N->isInvariant(), N->getOriginalAlignment(), - N->getTBAAInfo()); + N->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. @@ -232,7 +237,23 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) { // Get the dest type - it doesn't always match the input type, e.g. int_to_fp. EVT DestVT = N->getValueType(0).getVectorElementType(); - SDValue Op = GetScalarizedVector(N->getOperand(0)); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + SDLoc DL(N); + // The result needs scalarizing, but it's not a given that the source does. + // This is a workaround for targets where it's impossible to scalarize the + // result of a conversion, because the source type is legal. + // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32} + // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is + // legal and was not scalarized. 
+ // See the similar logic in ScalarizeVecRes_VSETCC + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + Op = GetScalarizedVector(Op); + } else { + EVT VT = OpVT.getVectorElementType(); + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, + DAG.getConstant(0, TLI.getVectorIdxTy())); + } return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op); } @@ -406,6 +427,10 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::TRUNCATE: + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; case ISD::CONCAT_VECTORS: @@ -449,11 +474,11 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) { N->getValueType(0), Elt); } -/// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs -/// to be scalarized, it must be <1 x ty>. Extend the element instead. +/// ScalarizeVecOp_UnaryOp - If the input is a vector that needs to be +/// scalarized, it must be <1 x ty>. Do the operation on the element instead. SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { assert(N->getValueType(0).getVectorNumElements() == 1 && - "Unexected vector type!"); + "Unexpected vector type!"); SDValue Elt = GetScalarizedVector(N->getOperand(0)); SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0).getScalarType(), Elt); @@ -507,12 +532,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getBasePtr(), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), N->isVolatile(), N->isNonTemporal(), - N->getAlignment(), N->getTBAAInfo()); + N->getAlignment(), N->getAAInfo()); return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), N->isVolatile(), N->isNonTemporal(), - N->getOriginalAlignment(), N->getTBAAInfo()); + N->getOriginalAlignment(), N->getAAInfo()); } /// ScalarizeVecOp_FP_ROUND - If the value to round is a vector that needs @@ -572,6 +597,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::LOAD: SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break; + case ISD::MLOAD: + SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi); + break; case ISD::SETCC: SplitVecRes_SETCC(N, Lo, Hi); break; @@ -625,6 +653,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FCOPYSIGN: case ISD::FSUB: case ISD::FMUL: + case ISD::FMINNUM: + case ISD::FMAXNUM: case ISD::SDIV: case ISD::UDIV: case ISD::FDIV: @@ -866,6 +896,10 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, return; } + // See if the target wants to custom expand this node. + if (CustomLowerNode(N, N->getValueType(0), true)) + return; + // Spill the vector to the stack. 
EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); @@ -921,14 +955,14 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); - const MDNode *TBAAInfo = LD->getTBAAInfo(); + AAMDNodes AAInfo = LD->getAAInfo(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal, - isInvariant, Alignment, TBAAInfo); + isInvariant, Alignment, AAInfo); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, @@ -936,7 +970,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment, - TBAAInfo); + AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -948,6 +982,64 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, + SDValue &Lo, SDValue &Hi) { + EVT LoVT, HiVT; + SDLoc dl(MLD); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); + + SDValue Ch = MLD->getChain(); + SDValue Ptr = MLD->getBasePtr(); + SDValue Mask = MLD->getMask(); + unsigned Alignment = MLD->getOriginalAlignment(); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + SDValue MaskLo, MaskHi; + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + + EVT MemoryVT = MLD->getMemoryVT(); + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue Src0 = MLD->getSrc0(); + SDValue Src0Lo, Src0Hi; + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), + Alignment, MLD->getAAInfo(), MLD->getRanges()); + + Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(MLD->getPointerInfo(), + MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), + SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); + + Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO); + + + // Build a factor node to remember that this load is independent of the + // other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Legalized the chain result - switch anything that used the old chain to + // use the new one. 
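Element-wise, the masked load being split here behaves as sketched below, and the split is just two half-length masked loads whose mask, pass-through, and pointer are divided the same way; a plain C++ reference model, not the DAG API (the ReplaceValueWith call that follows then rewires the old chain):

    #include <cstddef>

    // Reference semantics of a masked load: active lanes read memory,
    // inactive lanes take the Src0 pass-through value.
    template <typename T>
    void maskedLoad(T *Out, const T *Mem, const bool *Mask, const T *Src0,
                    size_t N) {
      for (size_t I = 0; I != N; ++I)
        Out[I] = Mask[I] ? Mem[I] : Src0[I];
    }

    // The Lo/Hi split: the Hi half starts half a vector further along,
    // mirroring the IncrementSize pointer bump above.
    template <typename T>
    void maskedLoadSplit(T *Out, const T *Mem, const bool *Mask,
                         const T *Src0, size_t N) {
      size_t Half = N / 2;
      maskedLoad(Out, Mem, Mask, Src0, Half);
      maskedLoad(Out + Half, Mem + Half, Mask + Half, Src0 + Half, N - Half);
    }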
+ ReplaceValueWith(SDValue(MLD, 1), Ch); + +} + void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && @@ -1203,6 +1295,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; + case ISD::MSTORE: + Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo); + break; case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; @@ -1347,6 +1442,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { Idx.getValueType())), 0); } + // See if the target wants to custom expand this node. + if (CustomLowerNode(N, N->getValueType(0), true)) + return SDValue(); + // Store the vector to the stack. EVT EltVT = VecVT.getVectorElementType(); SDLoc dl(N); @@ -1357,7 +1456,57 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { // Load back the required element. StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, - MachinePointerInfo(), EltVT, false, false, 0); + MachinePointerInfo(), EltVT, false, false, false, 0); +} + +SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, + unsigned OpNo) { + SDValue Ch = N->getChain(); + SDValue Ptr = N->getBasePtr(); + SDValue Mask = N->getMask(); + SDValue Data = N->getData(); + EVT MemoryVT = N->getMemoryVT(); + unsigned Alignment = N->getOriginalAlignment(); + SDLoc DL(N); + + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + + SDValue DataLo, DataHi; + GetSplitVector(Data, DataLo, DataHi); + SDValue MaskLo, MaskHi; + GetSplitVector(Mask, MaskLo, MaskHi); + + // if Alignment is equal to the vector size, + // take the half of it for the second part + unsigned SecondHalfAlignment = + (Alignment == Data->getValueType(0).getSizeInBits()/8) ? + Alignment/2 : Alignment; + + SDValue Lo, Hi; + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOStore, LoMemVT.getStoreSize(), + Alignment, N->getAAInfo(), N->getRanges()); + + Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO); + + unsigned IncrementSize = LoMemVT.getSizeInBits()/8; + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(IncrementSize, Ptr.getValueType())); + + MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOStore, HiMemVT.getStoreSize(), + SecondHalfAlignment, N->getAAInfo(), N->getRanges()); + + Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO); + + + // Build a factor node to remember that this store is independent of the + // other one. 
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -1372,7 +1521,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { unsigned Alignment = N->getOriginalAlignment(); bool isVol = N->isVolatile(); bool isNT = N->isNonTemporal(); - const MDNode *TBAAInfo = N->getTBAAInfo(); + AAMDNodes AAInfo = N->getAAInfo(); SDValue Lo, Hi; GetSplitVector(N->getOperand(1), Lo, Hi); @@ -1383,10 +1532,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { if (isTruncating) Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), - LoMemVT, isVol, isNT, Alignment, TBAAInfo); + LoMemVT, isVol, isNT, Alignment, AAInfo); else Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), - isVol, isNT, Alignment, TBAAInfo); + isVol, isNT, Alignment, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -1395,11 +1544,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { if (isTruncating) Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), - HiMemVT, isVol, isNT, Alignment, TBAAInfo); + HiMemVT, isVol, isNT, Alignment, AAInfo); else Hi = DAG.getStore(Ch, DL, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), - isVol, isNT, Alignment, TBAAInfo); + isVol, isNT, Alignment, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } @@ -1564,6 +1713,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::VECTOR_SHUFFLE: Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N)); break; + case ISD::MLOAD: + Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N)); + break; case ISD::ADD: case ISD::AND: @@ -1573,6 +1725,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::OR: case ISD::SUB: case ISD::XOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: Res = WidenVecRes_Binary(N); break; @@ -2252,6 +2406,48 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { return Result; } +SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { + + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); + SDValue Mask = N->getMask(); + EVT MaskVT = Mask.getValueType(); + SDValue Src0 = GetWidenedVector(N->getSrc0()); + SDLoc dl(N); + + if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) + Mask = GetWidenedVector(Mask); + else { + EVT BoolVT = getSetCCResultType(WidenVT); + + // We can't use ModifyToType() because we should fill the mask with + // zeroes + unsigned WidenNumElts = BoolVT.getVectorNumElements(); + unsigned MaskNumElts = MaskVT.getVectorNumElements(); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, MaskVT); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + } + + // Rebuild memory operand because MemoryVT was changed + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(N->getPointerInfo(), + MachineMemOperand::MOLoad, WidenVT.getStoreSize(), + N->getAlignment(), N->getAAInfo(), N->getRanges()); + + SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), + Mask, Src0, MMO); + // Legalized the chain result - switch anything that used the old chain to + // use the new one. 
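Zero-filling the widened mask in WidenVecRes_MLOAD above, rather than leaving the new lanes undefined, matters for correctness: a stray set bit in a padding lane would let the widened load touch memory the original vector never accessed. Schematically, in plain C++ rather than the CONCAT_VECTORS node the code builds:

    #include <cstddef>
    #include <vector>

    // Widen an N-lane mask by appending false lanes, so the extra lanes of
    // the widened masked load stay inactive.
    std::vector<bool> widenMask(const std::vector<bool> &Mask, size_t WideN) {
      std::vector<bool> Wide(WideN, false);
      for (size_t I = 0; I != Mask.size(); ++I)
        Wide[I] = Mask[I];
      return Wide;
    }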
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), @@ -2735,7 +2931,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); bool isInvariant = LD->isInvariant(); - const MDNode *TBAAInfo = LD->getTBAAInfo(); + AAMDNodes AAInfo = LD->getAAInfo(); int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; // Difference @@ -2746,7 +2942,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, int NewVTWidth = NewVT.getSizeInBits(); SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(), isVolatile, isNonTemporal, isInvariant, Align, - TBAAInfo); + AAInfo); LdChain.push_back(LdOp.getValue(1)); // Check if we can load the element with one instruction @@ -2791,7 +2987,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, isInvariant, MinAlign(Align, Increment), - TBAAInfo); + AAInfo); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector()) { SmallVector<SDValue, 16> Loads; @@ -2807,7 +3003,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, isInvariant, MinAlign(Align, Increment), - TBAAInfo); + AAInfo); LdChain.push_back(L.getValue(1)); } @@ -2887,7 +3083,8 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, unsigned Align = LD->getAlignment(); bool isVolatile = LD->isVolatile(); bool isNonTemporal = LD->isNonTemporal(); - const MDNode *TBAAInfo = LD->getTBAAInfo(); + bool isInvariant = LD->isInvariant(); + AAMDNodes AAInfo = LD->getAAInfo(); EVT EltVT = WidenVT.getVectorElementType(); EVT LdEltVT = LdVT.getVectorElementType(); @@ -2899,7 +3096,8 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, unsigned Increment = LdEltVT.getSizeInBits() / 8; Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(), - LdEltVT, isVolatile, isNonTemporal, Align, TBAAInfo); + LdEltVT, isVolatile, isNonTemporal, isInvariant, + Align, AAInfo); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { @@ -2909,7 +3107,8 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain, BasePtr.getValueType())); Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, LD->getPointerInfo().getWithOffset(Offset), LdEltVT, - isVolatile, isNonTemporal, Align, TBAAInfo); + isVolatile, isNonTemporal, isInvariant, Align, + AAInfo); LdChain.push_back(Ops[i].getValue(1)); } @@ -2932,7 +3131,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, unsigned Align = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); @@ -2959,7 +3158,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StChain.push_back(DAG.getStore(Chain, dl, EOp, 
BasePtr, ST->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, - MinAlign(Align, Offset), TBAAInfo)); + MinAlign(Align, Offset), AAInfo)); StWidth -= NewVTWidth; Offset += Increment; Idx += NumVTElts; @@ -2979,7 +3178,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset), isVolatile, isNonTemporal, - MinAlign(Align, Offset), TBAAInfo)); + MinAlign(Align, Offset), AAInfo)); StWidth -= NewVTWidth; Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, @@ -3001,7 +3200,7 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain, unsigned Align = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - const MDNode *TBAAInfo = ST->getTBAAInfo(); + AAMDNodes AAInfo = ST->getAAInfo(); SDValue ValOp = GetWidenedVector(ST->getValue()); SDLoc dl(ST); @@ -3025,7 +3224,7 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain, StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT, isVolatile, isNonTemporal, Align, - TBAAInfo)); + AAInfo)); unsigned Offset = Increment; for (unsigned i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), @@ -3036,7 +3235,7 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain, StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset), StEltVT, isVolatile, isNonTemporal, - MinAlign(Align, Offset), TBAAInfo)); + MinAlign(Align, Offset), AAInfo)); } } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 624003f5070e..db38b76cf93a 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -40,32 +41,29 @@ static cl::opt<signed> RegPressureThreshold( "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5), cl::desc("Track reg pressure and switch priority to in-depth")); +ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) + : Picker(this), InstrItins(IS->MF->getSubtarget().getInstrItineraryData()) { + const TargetSubtargetInfo &STI = IS->MF->getSubtarget(); + TRI = STI.getRegisterInfo(); + TLI = IS->TLI; + TII = STI.getInstrInfo(); + ResourcesModel = TII->CreateTargetScheduleState(STI); + // This hard requirement could be relaxed, but for now + // do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); + + unsigned NumRC = TRI->getNumRegClasses(); + RegLimit.resize(NumRC); + RegPressure.resize(NumRC); + std::fill(RegLimit.begin(), RegLimit.end(), 0); + std::fill(RegPressure.begin(), RegPressure.end(), 0); + for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), + E = TRI->regclass_end(); + I != E; ++I) + RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); -ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) : - Picker(this), - InstrItins(IS->getTargetLowering()->getTargetMachine().getInstrItineraryData()) -{ - TII = IS->getTargetLowering()->getTargetMachine().getInstrInfo(); - TRI = IS->getTargetLowering()->getTargetMachine().getRegisterInfo(); - TLI = IS->getTargetLowering(); - - const TargetMachine &tm = (*IS->MF).getTarget(); - ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,nullptr); - // This hard requirement could be relaxed, but for now - // do not let it procede. - assert (ResourcesModel && "Unimplemented CreateTargetScheduleState."); - - unsigned NumRC = TRI->getNumRegClasses(); - RegLimit.resize(NumRC); - RegPressure.resize(NumRC); - std::fill(RegLimit.begin(), RegLimit.end(), 0); - std::fill(RegPressure.begin(), RegPressure.end(), 0); - for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(), - E = TRI->regclass_end(); I != E; ++I) - RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF); - - ParallelLiveRanges = 0; - HorizontalVerticalBalance = 0; + ParallelLiveRanges = 0; + HorizontalVerticalBalance = 0; } unsigned @@ -319,7 +317,7 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) { // If packet is now full, reset the state so in the next cycle // we start fresh. - if (Packet.size() >= InstrItins->SchedModel->IssueWidth) { + if (Packet.size() >= InstrItins->SchedModel.IssueWidth) { ResourcesModel->clearResources(); Packet.clear(); } diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index ee5429283112..bce69d79ab12 100644 --- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SDNODEDBGVALUE_H -#define LLVM_CODEGEN_SDNODEDBGVALUE_H +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H #include "llvm/ADT/SmallVector.h" #include "llvm/IR/DebugLoc.h" @@ -44,7 +44,8 @@ private: const Value *Const; // valid for constants unsigned FrameIx; // valid for stack objects } u; - MDNode *mdPtr; + MDNode *Var; + MDNode *Expr; bool IsIndirect; uint64_t Offset; DebugLoc DL; @@ -52,69 +53,72 @@ private: bool Invalid; public: // Constructor for non-constants. - SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, - bool indir, uint64_t off, DebugLoc dl, - unsigned O) : mdPtr(mdP), IsIndirect(indir), - Offset(off), DL(dl), Order(O), - Invalid(false) { + SDDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool indir, + uint64_t off, DebugLoc dl, unsigned O) + : Var(Var), Expr(Expr), IsIndirect(indir), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = SDNODE; u.s.Node = N; u.s.ResNo = R; } // Constructor for constants. 
- SDDbgValue(MDNode *mdP, const Value *C, uint64_t off, DebugLoc dl, - unsigned O) : - mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O), - Invalid(false) { + SDDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t off, + DebugLoc dl, unsigned O) + : Var(Var), Expr(Expr), IsIndirect(false), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = CONST; u.Const = C; } // Constructor for frame indices. - SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : - mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O), - Invalid(false) { + SDDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t off, DebugLoc dl, + unsigned O) + : Var(Var), Expr(Expr), IsIndirect(false), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = FRAMEIX; u.FrameIx = FI; } // Returns the kind. - DbgValueKind getKind() { return kind; } + DbgValueKind getKind() const { return kind; } - // Returns the MDNode pointer. - MDNode *getMDPtr() { return mdPtr; } + // Returns the MDNode pointer for the variable. + MDNode *getVariable() const { return Var; } + + // Returns the MDNode pointer for the expression. + MDNode *getExpression() const { return Expr; } // Returns the SDNode* for a register ref - SDNode *getSDNode() { assert (kind==SDNODE); return u.s.Node; } + SDNode *getSDNode() const { assert (kind==SDNODE); return u.s.Node; } // Returns the ResNo for a register ref - unsigned getResNo() { assert (kind==SDNODE); return u.s.ResNo; } + unsigned getResNo() const { assert (kind==SDNODE); return u.s.ResNo; } // Returns the Value* for a constant - const Value *getConst() { assert (kind==CONST); return u.Const; } + const Value *getConst() const { assert (kind==CONST); return u.Const; } // Returns the FrameIx for a stack object - unsigned getFrameIx() { assert (kind==FRAMEIX); return u.FrameIx; } + unsigned getFrameIx() const { assert (kind==FRAMEIX); return u.FrameIx; } // Returns whether this is an indirect value. - bool isIndirect() { return IsIndirect; } + bool isIndirect() const { return IsIndirect; } // Returns the offset. - uint64_t getOffset() { return Offset; } + uint64_t getOffset() const { return Offset; } // Returns the DebugLoc. - DebugLoc getDebugLoc() { return DL; } + DebugLoc getDebugLoc() const { return DL; } // Returns the SDNodeOrder. This is the order of the preceding node in the // input. - unsigned getOrder() { return Order; } + unsigned getOrder() const { return Order; } // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated" // property. A SDDbgValue is invalid if the SDNode that produces the value is // deleted. 
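Splitting the old mdPtr into Var and Expr mirrors the contemporaneous IR change that moved complex address expressions out of the variable descriptor and into a separate metadata argument of the debug intrinsics. A minimal sketch of the resulting shape, simplified from the SDDbgValue constructors above:

struct MDNode;   // opaque IR metadata node

// A debug value conceptually carries two descriptors:
//   - which source variable it describes      -> getVariable()
//   - how to derive it from the value (DW_OP) -> getExpression()
class DbgValueSketch {
  MDNode *Var;    // variable metadata
  MDNode *Expr;   // complex expression metadata
public:
  DbgValueSketch(MDNode *V, MDNode *E) : Var(V), Expr(E) {}
  MDNode *getVariable() const { return Var; }
  MDNode *getExpression() const { return Expr; }
};

Keeping the two nodes apart lets clients query either one without re-parsing, and the const-qualified getters in the patch let const clients do so as well.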
void setIsInvalidated() { Invalid = true; } - bool isInvalidated() { return Invalid; } + bool isInvalidated() const { return Invalid; } }; } // end llvm namespace diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 4d8c2c78bce6..61a3fd728711 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -221,7 +221,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { SUnit *NewSU; bool TryUnfold = false; for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (VT == MVT::Glue) return nullptr; else if (VT == MVT::Other) @@ -229,7 +229,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { } for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { const SDValue &Op = N->getOperand(i); - EVT VT = Op.getNode()->getValueType(Op.getResNo()); + MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); if (VT == MVT::Glue) return nullptr; } @@ -431,17 +431,23 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, /// getPhysicalRegisterVT - Returns the ValueType of the physical register /// definition of the specified node. /// FIXME: Move to SelectionDAG? -static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, +static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const TargetInstrInfo *TII) { - const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); - assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); - unsigned NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { - if (Reg == *ImpDef) - break; - ++NumRes; + unsigned NumRes; + if (N->getOpcode() == ISD::CopyFromReg) { + // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type. 
+ NumRes = 1; + } else { + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); + NumRes = MCID.getNumDefs(); + for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + if (Reg == *ImpDef) + break; + ++NumRes; + } } - return N->getValueType(NumRes); + return N->getSimpleValueType(NumRes); } /// CheckForLiveRegDef - Return true and update live register vector if the @@ -454,7 +460,7 @@ static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg, bool Added = false; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) { - if (RegAdded.insert(*AI)) { + if (RegAdded.insert(*AI).second) { LRegs.push_back(*AI); Added = true; } @@ -572,7 +578,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { assert(LRegs.size() == 1 && "Can't handle this yet!"); unsigned Reg = LRegs[0]; SUnit *LRDef = LiveRegDefs[Reg]; - EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); + MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index dedca41c3aab..8b54e6568b9e 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -30,8 +30,8 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <climits> using namespace llvm; @@ -166,12 +166,11 @@ public: NeedLatency(needlatency), AvailableQueue(availqueue), CurCycle(0), Topo(SUnits, nullptr) { - const TargetMachine &tm = mf.getTarget(); + const TargetSubtargetInfo &STI = mf.getSubtarget(); if (DisableSchedCycles || !NeedLatency) HazardRec = new ScheduleHazardRecognizer(); else - HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer( - tm.getSubtargetImpl(), this); + HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this); } ~ScheduleDAGRRList() { @@ -946,7 +945,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { SUnit *NewSU; bool TryUnfold = false; for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (VT == MVT::Glue) return nullptr; else if (VT == MVT::Other) @@ -954,7 +953,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { } for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { const SDValue &Op = N->getOperand(i); - EVT VT = Op.getNode()->getValueType(Op.getResNo()); + MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); if (VT == MVT::Glue) return nullptr; } @@ -1189,17 +1188,23 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, /// getPhysicalRegisterVT - Returns the ValueType of the physical register /// definition of the specified node. /// FIXME: Move to SelectionDAG? 
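Several hunks in these schedulers also replace EVT with MVT via getSimpleValueType. The distinction: MVT is a plain enum wrapper that every legal machine type fits into, while EVT can additionally wrap arbitrary extended IR types; after type legalization the schedulers may assume simple types, and getSimpleValueType asserts exactly that. A loose standalone analogy, not LLVM's actual classes:

#include <cassert>
#include <cstdio>

// Loose analogy to the MVT/EVT split; illustrative only.
enum class SimpleTy { i32, f32, Other, Invalid };

struct MVTLike {                      // "simple" type: just an enum value
  SimpleTy Ty = SimpleTy::Invalid;
};

struct EVTLike {                      // may wrap a type with no enum entry
  SimpleTy Ty = SimpleTy::Invalid;
  const void *ExtendedType = nullptr; // non-null only for non-simple types
  bool isSimple() const { return ExtendedType == nullptr; }
  MVTLike getSimpleVT() const {       // analogous to EVT::getSimpleVT()
    assert(isSimple() && "Expected a simple value type!");
    return {Ty};
  }
};

int main() {
  EVTLike E{SimpleTy::i32, nullptr};
  MVTLike M = E.getSimpleVT();        // safe once legalization has run
  std::printf("%d\n", static_cast<int>(M.Ty));
}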
-static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, +static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const TargetInstrInfo *TII) { - const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); - assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); - unsigned NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { - if (Reg == *ImpDef) - break; - ++NumRes; + unsigned NumRes; + if (N->getOpcode() == ISD::CopyFromReg) { + // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type. + NumRes = 1; + } else { + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); + NumRes = MCID.getNumDefs(); + for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + if (Reg == *ImpDef) + break; + ++NumRes; + } } - return N->getValueType(NumRes); + return N->getSimpleValueType(NumRes); } /// CheckForLiveRegDef - Return true and update live register vector if the @@ -1218,7 +1223,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, if (LiveRegDefs[*AliasI] == SU) continue; // Add Reg to the set of interfering live regs. - if (RegAdded.insert(*AliasI)) { + if (RegAdded.insert(*AliasI).second) { LRegs.push_back(*AliasI); } } @@ -1235,7 +1240,7 @@ static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask, if (!LiveRegDefs[i]) continue; if (LiveRegDefs[i] == SU) continue; if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue; - if (RegAdded.insert(i)) + if (RegAdded.insert(i).second) LRegs.push_back(i); } } @@ -1310,7 +1315,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { SDNode *Gen = LiveRegGens[CallResource]->getNode(); while (SDNode *Glued = Gen->getGluedNode()) Gen = Glued; - if (!IsChainDependent(Gen, Node, 0, TII) && RegAdded.insert(CallResource)) + if (!IsChainDependent(Gen, Node, 0, TII) && + RegAdded.insert(CallResource).second) LRegs.push_back(CallResource); } } @@ -1439,7 +1445,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { assert(LRegs.size() == 1 && "Can't handle this yet!"); unsigned Reg = LRegs[0]; SUnit *LRDef = LiveRegDefs[Reg]; - EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); + MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); @@ -1930,8 +1936,8 @@ void RegReductionPQBase::dumpRegPressure() const { unsigned Id = RC->getID(); unsigned RP = RegPressure[Id]; if (!RP) continue; - DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id] - << '\n'); + DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / " + << RegLimit[Id] << '\n'); } #endif } @@ -2754,7 +2760,7 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, if (!SUImpDefs && !SURegMask) continue; for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (VT == MVT::Glue || VT == MVT::Other) continue; if (!N->hasAnyUseOfValue(i)) @@ -2977,9 +2983,9 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() { llvm::ScheduleDAGSDNodes * llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { - const TargetMachine &TM = IS->TM; - const TargetInstrInfo *TII = TM.getInstrInfo(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetSubtargetInfo &STI = 
IS->MF->getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); BURegReductionPriorityQueue *PQ = new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr); @@ -2991,9 +2997,9 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, llvm::ScheduleDAGSDNodes * llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { - const TargetMachine &TM = IS->TM; - const TargetInstrInfo *TII = TM.getInstrInfo(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetSubtargetInfo &STI = IS->MF->getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); SrcRegReductionPriorityQueue *PQ = new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr); @@ -3005,10 +3011,10 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, llvm::ScheduleDAGSDNodes * llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { - const TargetMachine &TM = IS->TM; - const TargetInstrInfo *TII = TM.getInstrInfo(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - const TargetLowering *TLI = IS->getTargetLowering(); + const TargetSubtargetInfo &STI = IS->MF->getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetLowering *TLI = IS->TLI; HybridBURRPriorityQueue *PQ = new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI); @@ -3021,10 +3027,10 @@ llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, llvm::ScheduleDAGSDNodes * llvm::createILPListDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { - const TargetMachine &TM = IS->TM; - const TargetInstrInfo *TII = TM.getInstrInfo(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - const TargetLowering *TLI = IS->getTargetLowering(); + const TargetSubtargetInfo &STI = IS->MF->getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetLowering *TLI = IS->TLI; ILPBURRPriorityQueue *PQ = new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index de910b7c861b..f2b18fc84bf5 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -29,7 +29,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -38,17 +37,17 @@ using namespace llvm; STATISTIC(LoadsClustered, "Number of loads clustered together"); -// This allows latency based scheduler to notice high latency instructions -// without a target itinerary. The choise if number here has more to do with -// balancing scheduler heursitics than with the actual machine latency. +// This allows the latency-based scheduler to notice high latency instructions +// without a target itinerary. The choice of number here has more to do with +// balancing scheduler heuristics than with the actual machine latency. 
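The factory-function rewrites above follow one pattern applied throughout this commit: anything previously fetched from the TargetMachine is now fetched from the MachineFunction's subtarget, the prerequisite for per-function CPU/feature attributes. Restating the pattern in isolation, lightly condensed from the hunks above:

// Before: module-wide accessors on the TargetMachine.
//   const TargetInstrInfo *TII = IS->TM.getInstrInfo();
//   const TargetRegisterInfo *TRI = IS->TM.getRegisterInfo();
// After: per-function lookup through the subtarget.
const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
const TargetInstrInfo *TII = STI.getInstrInfo();
const TargetRegisterInfo *TRI = STI.getRegisterInfo();
const TargetLowering *TLI = IS->TLI;   // now cached on SelectionDAGISel

Scheduler behaviour is unchanged here; only the lookup path moves, so two functions in one module can eventually observe different subtargets.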
static cl::opt<int> HighLatencyCycles( "sched-high-latency-cycles", cl::Hidden, cl::init(10), cl::desc("Roughly estimate the number of cycles that 'long latency'" "instructions take for targets with no itinerary")); ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf) - : ScheduleDAG(mf), BB(nullptr), DAG(nullptr), - InstrItins(mf.getTarget().getInstrItineraryData()) {} + : ScheduleDAG(mf), BB(nullptr), DAG(nullptr), + InstrItins(mf.getSubtarget().getInstrItineraryData()) {} /// Run - perform scheduling. /// @@ -120,15 +119,20 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, return; unsigned ResNo = User->getOperand(2).getResNo(); - if (Def->isMachineOpcode()) { + if (Def->getOpcode() == ISD::CopyFromReg && + cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) { + PhysReg = Reg; + } else if (Def->isMachineOpcode()) { const MCInstrDesc &II = TII->get(Def->getMachineOpcode()); if (ResNo >= II.getNumDefs() && - II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) { + II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) PhysReg = Reg; - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo)); - Cost = RC->getCopyCost(); - } + } + + if (PhysReg != 0) { + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo)); + Cost = RC->getCopyCost(); } } @@ -136,7 +140,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, SmallVectorImpl<EVT> &VTs, SDValue ExtraOper = SDValue()) { - SmallVector<SDValue, 4> Ops; + SmallVector<SDValue, 8> Ops; for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) Ops.push_back(N->getOperand(I)); @@ -226,7 +230,7 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); I != E && UseCount < 100; ++I, ++UseCount) { SDNode *User = *I; - if (User == Node || !Visited.insert(User)) + if (User == Node || !Visited.insert(User).second) continue; int64_t Offset1, Offset2; if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) || @@ -339,7 +343,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { // Add all operands to the worklist unless they've already been added. for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i) - if (Visited.insert(NI->getOperand(i).getNode())) + if (Visited.insert(NI->getOperand(i).getNode()).second) Worklist.push_back(NI->getOperand(i).getNode()); if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate. @@ -425,7 +429,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { } void ScheduleDAGSDNodes::AddSchedEdges() { - const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); + const TargetSubtargetInfo &ST = MF.getSubtarget(); // Check to see if the scheduler cares about latencies. bool UnitLatencies = forceUnitLatencies(); @@ -547,6 +551,14 @@ void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() { NodeNumDefs = 0; return; } + if (POpc == TargetOpcode::PATCHPOINT && + Node->getValueType(0) == MVT::Other) { + // PATCHPOINT is defined to have one result, but it might really have none + // if we're not using CallingConv::AnyReg. Don't mistake the chain for a + // real definition. + NodeNumDefs = 0; + return; + } unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs(); // Some instructions define regs that are not represented in the selection DAG // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues. 
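The Visited.insert(X) to Visited.insert(X).second edits in this file (and in ProcessSourceNode just below) track an API change: SmallPtrSet/SmallSet::insert now returns a std::set-style pair<iterator, bool> rather than a bare bool. The idiom, shown standalone with std::set, which is assumed equivalent for this purpose:

#include <cstdio>
#include <initializer_list>
#include <set>

int main() {
  std::set<unsigned> Visited;
  for (unsigned Node : {3u, 5u, 3u}) {
    // .second is true only when the element was newly inserted,
    // i.e. this is the first time we have seen it.
    if (Visited.insert(Node).second)
      std::printf("visiting %u\n", Node);        // prints for 3 and 5
    else
      std::printf("skipping duplicate %u\n", Node);
  }
}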
@@ -733,7 +745,7 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders, SmallSet<unsigned, 8> &Seen) { unsigned Order = N->getIROrder(); - if (!Order || !Seen.insert(Order)) { + if (!Order || !Seen.insert(Order).second) { // Process any valid SDDbgValues even if node does not have any order // assigned. ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 39ebadf3011c..2cd1f4b9bd47 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SCHEDULEDAGSDNODES_H -#define SCHEDULEDAGSDNODES_H +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ScheduleDAG.h" diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 4589b0c35dc3..418b58eda394 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <climits> using namespace llvm; @@ -71,10 +72,8 @@ public: AliasAnalysis *aa, SchedulingPriorityQueue *availqueue) : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) { - - const TargetMachine &tm = mf.getTarget(); - HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer( - tm.getSubtargetImpl(), this); + const TargetSubtargetInfo &STI = mf.getSubtarget(); + HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this); } ~ScheduleDAGVLIW() { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index a5555364c82a..c819516eca04 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -46,6 +46,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> #include <cmath> @@ -95,7 +96,7 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT, /// BUILD_VECTOR where all of the elements are ~0 or undef. bool ISD::isBuildVectorAllOnes(const SDNode *N) { // Look through a bit convert. - if (N->getOpcode() == ISD::BITCAST) + while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; @@ -143,7 +144,7 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) { /// BUILD_VECTOR where all of the elements are 0 or undef. bool ISD::isBuildVectorAllZeros(const SDNode *N) { // Look through a bit convert. - if (N->getOpcode() == ISD::BITCAST) + while (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0).getNode(); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; @@ -233,10 +234,10 @@ bool ISD::allOperandsUndef(const SDNode *N) { return true; } -ISD::NodeType ISD::getExtForLoadExtType(ISD::LoadExtType ExtType) { +ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) { switch (ExtType) { case ISD::EXTLOAD: - return ISD::ANY_EXTEND; + return IsFP ? 
ISD::FP_EXTEND : ISD::ANY_EXTEND; case ISD::SEXTLOAD: return ISD::SIGN_EXTEND; case ISD::ZEXTLOAD: @@ -686,6 +687,15 @@ void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) { DeallocateNode(N); } +void SDDbgInfo::erase(const SDNode *Node) { + DbgValMapType::iterator I = DbgValMap.find(Node); + if (I == DbgValMap.end()) + return; + for (auto &Val: I->second) + Val->setIsInvalidated(); + DbgValMap.erase(I); +} + void SelectionDAG::DeallocateNode(SDNode *N) { if (N->OperandsNeedDelete) delete[] N->OperandList; @@ -696,10 +706,60 @@ void SelectionDAG::DeallocateNode(SDNode *N) { NodeAllocator.Deallocate(AllNodes.remove(N)); - // If any of the SDDbgValue nodes refer to this SDNode, invalidate them. - ArrayRef<SDDbgValue*> DbgVals = DbgInfo->getSDDbgValues(N); - for (unsigned i = 0, e = DbgVals.size(); i != e; ++i) - DbgVals[i]->setIsInvalidated(); + // If any of the SDDbgValue nodes refer to this SDNode, invalidate + // them and forget about that node. + DbgInfo->erase(N); +} + +#ifndef NDEBUG +/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. +static void VerifySDNode(SDNode *N) { + switch (N->getOpcode()) { + default: + break; + case ISD::BUILD_PAIR: { + EVT VT = N->getValueType(0); + assert(N->getNumValues() == 1 && "Too many results!"); + assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) && + "Wrong return type!"); + assert(N->getNumOperands() == 2 && "Wrong number of operands!"); + assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() && + "Mismatched operand types!"); + assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() && + "Wrong operand type!"); + assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() && + "Wrong return type size"); + break; + } + case ISD::BUILD_VECTOR: { + assert(N->getNumValues() == 1 && "Too many results!"); + assert(N->getValueType(0).isVector() && "Wrong return type!"); + assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() && + "Wrong number of operands!"); + EVT EltVT = N->getValueType(0).getVectorElementType(); + for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) { + assert((I->getValueType() == EltVT || + (EltVT.isInteger() && I->getValueType().isInteger() && + EltVT.bitsLE(I->getValueType()))) && + "Wrong operand type!"); + assert(I->getValueType() == N->getOperand(0).getValueType() && + "Operands must all have the same type"); + } + break; + } + } +} +#endif // NDEBUG + +/// \brief Insert a newly allocated node into the DAG. +/// +/// Handles insertion into the all nodes list and CSE map, as well as +/// verification and other common operations when a new node is allocated. +void SelectionDAG::InsertNode(SDNode *N) { + AllNodes.push_back(N); +#ifndef NDEBUG + VerifySDNode(N); +#endif } /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that @@ -839,83 +899,6 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops, return Node; } -#ifndef NDEBUG -/// VerifyNodeCommon - Sanity check the given node. Aborts if it is invalid. 
-static void VerifyNodeCommon(SDNode *N) { - switch (N->getOpcode()) { - default: - break; - case ISD::BUILD_PAIR: { - EVT VT = N->getValueType(0); - assert(N->getNumValues() == 1 && "Too many results!"); - assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) && - "Wrong return type!"); - assert(N->getNumOperands() == 2 && "Wrong number of operands!"); - assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() && - "Mismatched operand types!"); - assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() && - "Wrong operand type!"); - assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() && - "Wrong return type size"); - break; - } - case ISD::BUILD_VECTOR: { - assert(N->getNumValues() == 1 && "Too many results!"); - assert(N->getValueType(0).isVector() && "Wrong return type!"); - assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() && - "Wrong number of operands!"); - EVT EltVT = N->getValueType(0).getVectorElementType(); - for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) { - assert((I->getValueType() == EltVT || - (EltVT.isInteger() && I->getValueType().isInteger() && - EltVT.bitsLE(I->getValueType()))) && - "Wrong operand type!"); - assert(I->getValueType() == N->getOperand(0).getValueType() && - "Operands must all have the same type"); - } - break; - } - } -} - -/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid. -static void VerifySDNode(SDNode *N) { - // The SDNode allocators cannot be used to allocate nodes with fields that are - // not present in an SDNode! - assert(!isa<MemSDNode>(N) && "Bad MemSDNode!"); - assert(!isa<ShuffleVectorSDNode>(N) && "Bad ShuffleVectorSDNode!"); - assert(!isa<ConstantSDNode>(N) && "Bad ConstantSDNode!"); - assert(!isa<ConstantFPSDNode>(N) && "Bad ConstantFPSDNode!"); - assert(!isa<GlobalAddressSDNode>(N) && "Bad GlobalAddressSDNode!"); - assert(!isa<FrameIndexSDNode>(N) && "Bad FrameIndexSDNode!"); - assert(!isa<JumpTableSDNode>(N) && "Bad JumpTableSDNode!"); - assert(!isa<ConstantPoolSDNode>(N) && "Bad ConstantPoolSDNode!"); - assert(!isa<BasicBlockSDNode>(N) && "Bad BasicBlockSDNode!"); - assert(!isa<SrcValueSDNode>(N) && "Bad SrcValueSDNode!"); - assert(!isa<MDNodeSDNode>(N) && "Bad MDNodeSDNode!"); - assert(!isa<RegisterSDNode>(N) && "Bad RegisterSDNode!"); - assert(!isa<BlockAddressSDNode>(N) && "Bad BlockAddressSDNode!"); - assert(!isa<EHLabelSDNode>(N) && "Bad EHLabelSDNode!"); - assert(!isa<ExternalSymbolSDNode>(N) && "Bad ExternalSymbolSDNode!"); - assert(!isa<CondCodeSDNode>(N) && "Bad CondCodeSDNode!"); - assert(!isa<CvtRndSatSDNode>(N) && "Bad CvtRndSatSDNode!"); - assert(!isa<VTSDNode>(N) && "Bad VTSDNode!"); - assert(!isa<MachineSDNode>(N) && "Bad MachineSDNode!"); - - VerifyNodeCommon(N); -} - -/// VerifyMachineNode - Sanity check the given MachineNode. Aborts if it is -/// invalid. -static void VerifyMachineNode(SDNode *N) { - // The MachineNode allocators cannot be used to allocate nodes with fields - // that are not present in a MachineNode! - // Currently there are no such nodes. - - VerifyNodeCommon(N); -} -#endif // NDEBUG - /// getEVTAlignment - Compute the default alignment value for the /// given type. 
/// @@ -924,22 +907,23 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { PointerType::get(Type::getInt8Ty(*getContext()), 0) : VT.getTypeForEVT(*getContext()); - return TM.getTargetLowering()->getDataLayout()->getABITypeAlignment(Ty); + return TLI->getDataLayout()->getABITypeAlignment(Ty); } // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) - : TM(tm), TSI(*tm.getSelectionDAGInfo()), TLI(nullptr), OptLevel(OL), - EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), - Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), - UpdateListeners(nullptr) { + : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL), + EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), + Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), + UpdateListeners(nullptr) { AllNodes.push_back(&EntryNode); DbgInfo = new SDDbgInfo(); } -void SelectionDAG::init(MachineFunction &mf, const TargetLowering *tli) { +void SelectionDAG::init(MachineFunction &mf) { MF = &mf; - TLI = tli; + TLI = getSubtarget().getTargetLowering(); + TSI = getSubtarget().getSelectionDAGInfo(); Context = &mf.getFunction()->getContext(); } @@ -1108,8 +1092,6 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, EVT EltVT = VT.getScalarType(); const ConstantInt *Elt = &Val; - const TargetLowering *TLI = TM.getTargetLowering(); - // In some cases the vector type is legal but the element type is illegal and // needs to be promoted, for example v8i8 on ARM. In this case, promote the // inserted value (the type does not need to match the vector element type). @@ -1185,7 +1167,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, if (!N) { N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, EltVT); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); } SDValue Result(N, 0); @@ -1198,7 +1180,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, } SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) { - return getConstant(Val, TM.getTargetLowering()->getPointerTy(), isTarget); + return getConstant(Val, TLI->getPointerTy(), isTarget); } @@ -1227,7 +1209,7 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){ if (!N) { N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); } SDValue Result(N, 0); @@ -1263,7 +1245,6 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, unsigned char TargetFlags) { assert((TargetFlags == 0 || isTargetGA) && "Cannot set target flags on target-independent globals"); - const TargetLowering *TLI = TM.getTargetLowering(); // Truncate (with sign-extension) the offset value to the pointer size. 
unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType()); @@ -1290,7 +1271,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, DL.getDebugLoc(), GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1305,7 +1286,7 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1325,7 +1306,7 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1336,8 +1317,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) - Alignment = - TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType()); + Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); @@ -1352,7 +1332,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1364,8 +1344,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, assert((TargetFlags == 0 || isTarget) && "Cannot set target flags on target-independent globals"); if (Alignment == 0) - Alignment = - TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType()); + Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? 
ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; AddNodeIDNode(ID, Opc, getVTList(VT), None); @@ -1380,7 +1359,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1398,7 +1377,7 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1412,7 +1391,7 @@ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1426,7 +1405,7 @@ SDValue SelectionDAG::getValueType(EVT VT) { if (N) return SDValue(N, 0); N = new (NodeAllocator) VTSDNode(VT); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1434,7 +1413,7 @@ SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) { SDNode *&N = ExternalSymbols[Sym]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1445,7 +1424,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT, TargetFlags)]; if (N) return SDValue(N, 0); N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1456,7 +1435,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { if (!CondCodeNodes[Cond]) { CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond); CondCodeNodes[Cond] = N; - AllNodes.push_back(N); + InsertNode(N); } return SDValue(CondCodeNodes[Cond], 0); @@ -1594,7 +1573,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, dl.getDebugLoc(), N1, N2, MaskAlloc); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1640,7 +1619,7 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl, dl.getDebugLoc(), Ops, Code); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1654,7 +1633,7 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1668,7 +1647,7 @@ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) { SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1684,7 +1663,7 @@ SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) { SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(), dl.getDebugLoc(), Root, Label); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1707,7 +1686,7 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT, SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset, TargetFlags); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1725,7 +1704,7 @@ SDValue SelectionDAG::getSrcValue(const Value *V) { SDNode *N = new (NodeAllocator) SrcValueSDNode(V); CSEMap.InsertNode(N, IP); - 
AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1741,7 +1720,7 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) { SDNode *N = new (NodeAllocator) MDNodeSDNode(MD); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1762,7 +1741,7 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, dl.getDebugLoc(), VT, Ptr, SrcAS, DestAS); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -1770,7 +1749,7 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, /// the target's desired shift amount type. SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT OpTy = Op.getValueType(); - EVT ShTy = TM.getTargetLowering()->getShiftAmountTy(LHSTy); + EVT ShTy = TLI->getShiftAmountTy(LHSTy); if (OpTy == ShTy || OpTy.isVector()) return Op; ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? ISD::TRUNCATE : ISD::ZERO_EXTEND; @@ -1783,7 +1762,6 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo(); unsigned ByteSize = VT.getStoreSize(); Type *Ty = VT.getTypeForEVT(*getContext()); - const TargetLowering *TLI = TM.getTargetLowering(); unsigned StackAlign = std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), minAlign); @@ -1798,7 +1776,6 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { VT2.getStoreSizeInBits())/8; Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); - const TargetLowering *TLI = TM.getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1), TD->getPrefTypeAlignment(Ty2)); @@ -1817,7 +1794,6 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, case ISD::SETFALSE2: return getConstant(0, VT); case ISD::SETTRUE: case ISD::SETTRUE2: { - const TargetLowering *TLI = TM.getTargetLowering(); TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(N1->getValueType(0)); return getConstant( @@ -1906,7 +1882,7 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, // Ensure that the constant occurs on the RHS. ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond); MVT CompVT = N1.getValueType().getSimpleVT(); - if (!TM.getTargetLowering()->isCondCodeLegal(SwappedCond, CompVT)) + if (!TLI->isCondCodeLegal(SwappedCond, CompVT)) return SDValue(); return getSetCC(dl, VT, N2, N1, SwappedCond); @@ -1942,7 +1918,6 @@ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, /// them in the KnownZero/KnownOne bitsets. void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, unsigned Depth) const { - const TargetLowering *TLI = TM.getTargetLowering(); unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. @@ -2378,7 +2353,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, /// information. For example, immediately after an "SRA X, 2", we know that /// the top 3 bits are all equal to each other, so we return 3. 
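The long run of AllNodes.push_back(N) to InsertNode(N) substitutions in these hunks all funnel into the helper introduced earlier in this file's diff. Its body, repeated from above for reference, is small, and centralizing it means no new node-creation path can forget the debug-build verification:

void SelectionDAG::InsertNode(SDNode *N) {
  AllNodes.push_back(N);
#ifndef NDEBUG
  VerifySDNode(N);   // structural sanity checks, debug builds only
#endif
}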
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ - const TargetLowering *TLI = TM.getTargetLowering(); EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarType().getSizeInBits(); @@ -2676,10 +2650,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { DL.getDebugLoc(), getVTList(VT)); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -2712,6 +2683,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstantFP(apf, VT); } case ISD::BITCAST: + if (VT == MVT::f16 && C->getValueType(0) == MVT::i16) + return getConstantFP(APFloat(APFloat::IEEEhalf, Val), VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT); else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) @@ -2774,7 +2747,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, case ISD::FP_TO_UINT: { integerPart x[2]; bool ignored; - assert(integerPartWidth >= 64); + static_assert(integerPartWidth >= 64, "APFloat parts too small!"); // FIXME need to be more flexible about rounding mode. APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(), Opcode==ISD::FP_TO_SINT, @@ -2785,7 +2758,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstant(api, VT); } case ISD::BITCAST: - if (VT == MVT::i32 && C->getValueType(0) == MVT::f32) + if (VT == MVT::i16 && C->getValueType(0) == MVT::f16) + return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), VT); + else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32) return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT); else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64) return getConstant(V.bitcastToAPInt().getZExtValue(), VT); @@ -2977,10 +2952,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, DL.getDebugLoc(), VTs, Operand); } - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -3411,8 +3383,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } // Perform trivial constant folding. - SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode()); - if (SV.getNode()) return SV; + if (SDValue SV = + FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode())) + return SV; // Canonicalize constant to RHS if commutative. if (N1C && !N2C && isCommutativeBinOp(Opcode)) { @@ -3421,6 +3394,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } // Constant fold FP operations. 
+ bool HasFPExceptions = TLI->hasFloatingPointExceptions(); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode()); ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode()); if (N1CFP) { @@ -3434,28 +3408,32 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, switch (Opcode) { case ISD::FADD: s = V1.add(V2, APFloat::rmNearestTiesToEven); - if (s != APFloat::opInvalidOp) + if (!HasFPExceptions || s != APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FSUB: s = V1.subtract(V2, APFloat::rmNearestTiesToEven); - if (s!=APFloat::opInvalidOp) + if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FMUL: s = V1.multiply(V2, APFloat::rmNearestTiesToEven); - if (s!=APFloat::opInvalidOp) + if (!HasFPExceptions || s!=APFloat::opInvalidOp) return getConstantFP(V1, VT); break; case ISD::FDIV: s = V1.divide(V2, APFloat::rmNearestTiesToEven); - if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero) + if (!HasFPExceptions || (s!=APFloat::opInvalidOp && + s!=APFloat::opDivByZero)) { return getConstantFP(V1, VT); + } break; case ISD::FREM : s = V1.mod(V2, APFloat::rmNearestTiesToEven); - if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero) + if (!HasFPExceptions || (s!=APFloat::opInvalidOp && + s!=APFloat::opDivByZero)) { return getConstantFP(V1, VT); + } break; case ISD::FCOPYSIGN: V1.copySign(V2); @@ -3572,10 +3550,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact); } - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -3679,10 +3654,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, DL.getDebugLoc(), VTs, N1, N2, N3); } - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -3848,7 +3820,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, if (VT == MVT::Other) { unsigned AS = 0; if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment(AS) || - TLI.allowsUnalignedMemoryAccesses(VT, AS)) { + TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) { VT = TLI.getPointerTy(); } else { switch (DstAlign & 7) { @@ -3908,7 +3880,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned AS = 0; if (NumMemOps && AllowOverlap && VTSize >= 8 && NewVTSize < Size && - TLI.allowsUnalignedMemoryAccesses(VT, AS, &Fast) && Fast) + TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast) VTSize = Size; else { VT = NewVT; @@ -3972,7 +3944,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, // Don't promote to an alignment that would require dynamic stack // realignment. 
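The HasFPExceptions guard above relaxes constant folding: when the target reports no floating-point exception support, an APFloat status of opInvalidOp (or opDivByZero for FDIV/FREM) no longer blocks the fold, since there is no runtime exception state worth preserving. A standalone illustration of what folding would otherwise erase, in host C++ and assuming the platform implements <cfenv>:

#include <cfenv>
#include <cstdio>

// 0.0 / 0.0 produces a NaN *and* raises FE_INVALID at run time. Folding the
// division to a NaN constant keeps the value but loses the raised flag, which
// only matters on targets that model FP exceptions.
int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile double z = 0.0;   // volatile: keep the division at run time
  double r = z / z;
  std::printf("result=%f FE_INVALID=%d\n", r,
              std::fetestexcept(FE_INVALID) != 0);
}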
- const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->needsStackRealignment(MF)) while (NewAlign > Align && TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign)) @@ -4028,7 +4000,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, getMemBasePlusOffset(Src, SrcOff, dl, DAG), SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false, - MinAlign(SrcAlign, SrcOff)); + false, MinAlign(SrcAlign, SrcOff)); Store = DAG.getTruncStore(Chain, dl, Value, getMemBasePlusOffset(Dst, DstOff, dl, DAG), DstPtrInfo.getWithOffset(DstOff), VT, isVol, @@ -4248,9 +4220,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, // Then check to see if we should lower the memcpy with target-specific // code. If the target chooses to do this, this is the next best. SDValue Result = - TSI.EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align, - isVol, AlwaysInline, - DstPtrInfo, SrcPtrInfo); + TSI->EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align, + isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; @@ -4269,8 +4240,6 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, // beyond the given memory regions. But fixing this isn't easy, and most // people don't care. - const TargetLowering *TLI = TM.getTargetLowering(); - // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -4316,17 +4285,14 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, // Then check to see if we should lower the memmove with target-specific // code. If the target chooses to do this, this is the next best. - SDValue Result = - TSI.EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size, Align, isVol, - DstPtrInfo, SrcPtrInfo); + SDValue Result = TSI->EmitTargetCodeForMemmove( + *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo); if (Result.getNode()) return Result; // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. - const TargetLowering *TLI = TM.getTargetLowering(); - // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -4371,31 +4337,22 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, // Then check to see if we should lower the memset with target-specific // code. If the target chooses to do this, this is the next best. - SDValue Result = - TSI.EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align, isVol, - DstPtrInfo); + SDValue Result = TSI->EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, + Size, Align, isVol, DstPtrInfo); if (Result.getNode()) return Result; // Emit a library call. - const TargetLowering *TLI = TM.getTargetLowering(); Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; Args.push_back(Entry); - // Extend or truncate the argument to be an i32 value for the call. 
- if (Src.getValueType().bitsGT(MVT::i32)) - Src = getNode(ISD::TRUNCATE, dl, MVT::i32, Src); - else - Src = getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); Entry.Node = Src; - Entry.Ty = Type::getInt32Ty(*getContext()); - Entry.isSExt = true; + Entry.Ty = Src.getValueType().getTypeForEVT(*getContext()); Args.push_back(Entry); Entry.Node = Size; Entry.Ty = IntPtrTy; - Entry.isSExt = false; Args.push_back(Entry); // FIXME: pass in SDLoc @@ -4442,7 +4399,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SuccessOrdering, FailureOrdering, SynchScope); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -4587,7 +4544,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, ArrayRef<SDValue> Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol, - bool ReadMem, bool WriteMem) { + bool ReadMem, bool WriteMem, unsigned Size) { if (Align == 0) // Ensure that codegen never sees alignment 0 Align = getEVTAlignment(MemVT); @@ -4599,8 +4556,10 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, Flags |= MachineMemOperand::MOLoad; if (Vol) Flags |= MachineMemOperand::MOVolatile; + if (!Size) + Size = MemVT.getStoreSize(); MachineMemOperand *MMO = - MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Align); + MF.getMachineMemOperand(PtrInfo, Flags, Size, Align); return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } @@ -4639,7 +4598,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, dl.getDebugLoc(), VTList, Ops, MemVT, MMO); } - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -4683,7 +4642,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, - unsigned Alignment, const MDNode *TBAAInfo, + unsigned Alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); @@ -4706,7 +4665,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, - TBAAInfo, Ranges); + AAInfo, Ranges); return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } @@ -4755,7 +4714,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, dl.getDebugLoc(), VTs, AM, ExtType, MemVT, MMO); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -4764,12 +4723,12 @@ SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, - const MDNode *TBAAInfo, + const AAMDNodes &AAInfo, const MDNode *Ranges) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment, - TBAAInfo, Ranges); + AAInfo, Ranges); } SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl, @@ -4784,11 +4743,12 @@ SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, - unsigned Alignment, const MDNode *TBAAInfo) { + bool isInvariant, unsigned Alignment, + const AAMDNodes &AAInfo) { SDValue Undef = 
getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, - PtrInfo, MemVT, isVolatile, isNonTemporal, false, Alignment, - TBAAInfo); + PtrInfo, MemVT, isVolatile, isNonTemporal, isInvariant, + Alignment, AAInfo); } @@ -4815,7 +4775,7 @@ SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base, SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, - unsigned Alignment, const MDNode *TBAAInfo) { + unsigned Alignment, const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 @@ -4834,7 +4794,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, Val.getValueType().getStoreSize(), Alignment, - TBAAInfo); + AAInfo); return getStore(Chain, dl, Val, Ptr, MMO); } @@ -4862,7 +4822,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, dl.getDebugLoc(), VTs, ISD::UNINDEXED, false, VT, MMO); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -4870,7 +4830,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT,bool isVolatile, bool isNonTemporal, unsigned Alignment, - const MDNode *TBAAInfo) { + const AAMDNodes &AAInfo) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 @@ -4888,7 +4848,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment, - TBAAInfo); + AAInfo); return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO); } @@ -4931,7 +4891,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, dl.getDebugLoc(), VTs, ISD::UNINDEXED, true, SVT, MMO); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); return SDValue(N, 0); } @@ -4958,7 +4918,61 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, ST->getMemoryVT(), ST->getMemOperand()); CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); + InsertNode(N); + return SDValue(N, 0); +} + +SDValue +SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, + SDValue Ptr, SDValue Mask, SDValue Src0, + MachineMemOperand *MMO) { + + SDVTList VTs = getVTList(VT, MVT::Other); + SDValue Ops[] = { Chain, Ptr, Mask, Src0 }; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, + MMO->isVolatile(), + MMO->isNonTemporal(), + MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast<MaskedLoadSDNode>(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(), + dl.getDebugLoc(), Ops, 4, VTs, + VT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + +SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, + SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) { + assert(Chain.getValueType() == MVT::Other && + "Invalid chain type"); + EVT VT = Val.getValueType(); + SDVTList VTs = 
getVTList(MVT::Other); + SDValue Ops[] = { Chain, Ptr, Mask, Val }; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), + MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast<MaskedStoreSDNode>(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(), + dl.getDebugLoc(), Ops, 4, + VTs, VT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); return SDValue(N, 0); } @@ -5037,10 +5051,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, VTs, Ops); } - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } @@ -5120,15 +5131,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, VTList, Ops); } } - AllNodes.push_back(N); -#ifndef NDEBUG - VerifySDNode(N); -#endif + InsertNode(N); return SDValue(N, 0); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) { - return getNode(Opcode, DL, VTList, ArrayRef<SDValue>()); + return getNode(Opcode, DL, VTList, None); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, @@ -5510,6 +5518,10 @@ SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) { /// node, and because it doesn't require CSE recalculation for any of /// the node's users. /// +/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG. +/// As a consequence it isn't appropriate to use from within the DAG combiner or +/// the legalizer which maintain worklists that would need to be updated when +/// deleting things. SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef<SDValue> Ops) { unsigned NumOps = Ops.size(); @@ -5576,10 +5588,9 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, // new operands. if (!DeadNodeSet.empty()) { SmallVector<SDNode *, 16> DeadNodes; - for (SmallPtrSet<SDNode *, 16>::iterator I = DeadNodeSet.begin(), - E = DeadNodeSet.end(); I != E; ++I) - if ((*I)->use_empty()) - DeadNodes.push_back(*I); + for (SDNode *N : DeadNodeSet) + if (N->use_empty()) + DeadNodes.push_back(N); RemoveDeadNodes(DeadNodes); } @@ -5748,10 +5759,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs, if (DoCSE) CSEMap.InsertNode(N, IP); - AllNodes.push_back(N); -#ifndef NDEBUG - VerifyMachineNode(N); -#endif + InsertNode(N); return N; } @@ -5797,26 +5805,24 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, /// getDbgValue - Creates a SDDbgValue node. 
/// /// SDNode -SDDbgValue * -SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, - bool IsIndirect, uint64_t Off, - DebugLoc DL, unsigned O) { - return new (Allocator) SDDbgValue(MDPtr, N, R, IsIndirect, Off, DL, O); +SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, + unsigned R, bool IsIndirect, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O); } /// Constant -SDDbgValue * -SelectionDAG::getConstantDbgValue(MDNode *MDPtr, const Value *C, - uint64_t Off, - DebugLoc DL, unsigned O) { - return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O); +SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr, + const Value *C, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(Var, Expr, C, Off, DL, O); } /// FrameIndex -SDDbgValue * -SelectionDAG::getFrameIndexDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, - DebugLoc DL, unsigned O) { - return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O); +SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, + unsigned FI, uint64_t Off, + DebugLoc DL, unsigned O) { + return new (Allocator) SDDbgValue(Var, Expr, FI, Off, DL, O); } namespace { @@ -6205,9 +6211,11 @@ unsigned SelectionDAG::AssignTopologicalOrder() { /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the /// value is produced by SD. void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) { - DbgInfo->add(DB, SD, isParameter); - if (SD) + if (SD) { + assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue()); SD->setHasDebugValue(true); + } + DbgInfo->add(DB, SD, isParameter); } /// TransferDbgValues - Transfer SDDbgValues. @@ -6222,10 +6230,10 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { I != E; ++I) { SDDbgValue *Dbg = *I; if (Dbg->getKind() == SDDbgValue::SDNODE) { - SDDbgValue *Clone = getDbgValue(Dbg->getMDPtr(), ToNode, To.getResNo(), - Dbg->isIndirect(), - Dbg->getOffset(), Dbg->getDebugLoc(), - Dbg->getOrder()); + SDDbgValue *Clone = + getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode, + To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(), + Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); } } @@ -6263,7 +6271,10 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); assert(isNonTemporal() == MMO->isNonTemporal() && "Non-temporal encoding error!"); - assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!"); + // We check here that the size of the memory operand fits within the size of + // the MMO. This is because the MMO might indicate only a possible address + // range instead of specifying the affected memory addresses precisely. + assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, @@ -6273,7 +6284,7 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!"); - assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!"); + assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!"); } /// Profile - Gather unique data for the node. 
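Nearly every builder touched in this file, including the new getMaskedLoad/getMaskedStore above, follows the same CSE discipline: fold the opcode, operands, and memory flags into a FoldingSetNodeID, probe the CSE map with FindNodeOrInsertPos, and only allocate and InsertNode on a miss. A minimal standalone model of that discipline, using a std::map in place of LLVM's FoldingSet (Node, Key, and getOrCreate are illustrative names, not LLVM API):

    #include <map>
    #include <memory>
    #include <utility>
    #include <vector>
    #include <cstdio>

    // Toy stand-in for an SDNode: an opcode plus operand node pointers.
    struct Node {
      unsigned Opcode;
      std::vector<Node *> Ops;
    };

    class DAG {
      // Key mirrors what AddNodeIDNode folds in: opcode + operand identity.
      using Key = std::pair<unsigned, std::vector<Node *>>;
      std::map<Key, Node *> CSEMap;                // stand-in for the FoldingSet
      std::vector<std::unique_ptr<Node>> AllNodes; // stand-in for InsertNode()

    public:
      // Return an existing structurally identical node, or create one.
      Node *getOrCreate(unsigned Opcode, std::vector<Node *> Ops) {
        Key K{Opcode, Ops};
        auto It = CSEMap.find(K);
        if (It != CSEMap.end())
          return It->second;                       // CSE hit: reuse the node
        AllNodes.push_back(std::make_unique<Node>(Node{Opcode, std::move(Ops)}));
        Node *N = AllNodes.back().get();
        CSEMap.emplace(std::move(K), N);
        return N;
      }
    };

    int main() {
      DAG G;
      Node *A = G.getOrCreate(1, {});
      Node *Add1 = G.getOrCreate(2, {A, A});
      Node *Add2 = G.getOrCreate(2, {A, A});
      std::printf("uniqued: %s\n", Add1 == Add2 ? "yes" : "no"); // prints "yes"
    }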
@@ -6417,7 +6428,7 @@ bool SDNode::hasPredecessor(const SDNode *N) const { bool SDNode::hasPredecessorHelper(const SDNode *N, - SmallPtrSet<const SDNode *, 32> &Visited, + SmallPtrSetImpl<const SDNode *> &Visited, SmallVectorImpl<const SDNode *> &Worklist) const { if (Visited.empty()) { Worklist.push_back(this); @@ -6433,7 +6444,7 @@ SDNode::hasPredecessorHelper(const SDNode *N, const SDNode *M = Worklist.pop_back_val(); for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { SDNode *Op = M->getOperand(i).getNode(); - if (Visited.insert(Op)) + if (Visited.insert(Op).second) Worklist.push_back(Op); if (Op == N) return true; @@ -6473,7 +6484,6 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { // A vector operand; extract a single element. - const TargetLowering *TLI = TM.getTargetLowering(); EVT OperandEltVT = OperandVT.getVectorElementType(); Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, @@ -6544,16 +6554,29 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } - // Handle X+C - if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc && - cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) - return true; - + // Handle X + C. + if (isBaseWithConstantOffset(Loc)) { + int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); + if (Loc.getOperand(0) == BaseLoc) { + // If the base location is a simple address with no offset itself, then + // the second load's first add operand should be the base address. + if (LocOffset == Dist * (int)Bytes) + return true; + } else if (isBaseWithConstantOffset(BaseLoc)) { + // The base location itself has an offset, so subtract that value from the + // second load's offset before comparing to distance * size. + int64_t BOffset = + cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue(); + if (Loc.getOperand(0) == BaseLoc.getOperand(0)) { + if ((LocOffset - BOffset) == Dist * (int)Bytes) + return true; + } + } + } const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; int64_t Offset1 = 0; int64_t Offset2 = 0; - const TargetLowering *TLI = TM.getTargetLowering(); bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) @@ -6568,7 +6591,6 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { // If this is a GlobalAddress + cst, return the alignment. const GlobalValue *GV; int64_t GVOffset = 0; - const TargetLowering *TLI = TM.getTargetLowering(); if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType()); APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); @@ -6795,8 +6817,8 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { #ifndef NDEBUG static void checkForCyclesHelper(const SDNode *N, - SmallPtrSet<const SDNode*, 32> &Visited, - SmallPtrSet<const SDNode*, 32> &Checked, + SmallPtrSetImpl<const SDNode*> &Visited, + SmallPtrSetImpl<const SDNode*> &Checked, const llvm::SelectionDAG *DAG) { // If this node has already been checked, don't check it again. if (Checked.count(N)) @@ -6804,7 +6826,7 @@ static void checkForCyclesHelper(const SDNode *N, // If a node has already been visited on this depth-first walk, reject it as // a cycle. 
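The rewritten X + C case in isConsecutiveLoad above now also covers the situation where the base location is itself base + constant. Stripped of the SDNode plumbing, the arithmetic reduces to the check below (struct Addr and isConsecutive are illustrative, not LLVM types):

    #include <cstdint>
    #include <cassert>

    // Illustrative model of an address: a base identifier plus byte offset.
    struct Addr {
      int Base;       // stands in for the non-constant operand of the add
      int64_t Offset; // 0 when the address carries no constant offset
    };

    // True if Loc sits exactly Dist elements of size Bytes after BaseLoc.
    bool isConsecutive(Addr Loc, Addr BaseLoc, int64_t Dist, int64_t Bytes) {
      if (Loc.Base != BaseLoc.Base)
        return false;
      // Subtract the base location's own offset before comparing, exactly
      // as the patch does with BOffset.
      return (Loc.Offset - BaseLoc.Offset) == Dist * Bytes;
    }

    int main() {
      // Two i32 loads at base+4 and base+8: the second is 1 element after.
      assert(isConsecutive({0, 8}, {0, 4}, /*Dist=*/1, /*Bytes=*/4));
      assert(!isConsecutive({0, 12}, {0, 4}, 1, 4));
    }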
- if (!Visited.insert(N)) { + if (!Visited.insert(N).second) { errs() << "Detected cycle in SelectionDAG\n"; dbgs() << "Offending node:\n"; N->dumprFull(DAG); dbgs() << "\n"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c07b5e6a7362..86a63eea7c2a 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" @@ -46,6 +47,8 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Statepoint.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -58,6 +61,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -563,6 +567,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, } else if (NumParts > 0) { // If the intermediate type was expanded, split each the value into // legal parts. + assert(NumIntermediates != 0 && "division by zero"); assert(NumParts % NumIntermediates == 0 && "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; @@ -645,8 +650,10 @@ namespace { /// specified value into the registers specified by this object. This uses /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. - void getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, - SDValue &Chain, SDValue *Flag, const Value *V) const; + void + getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, + SDValue *Flag, const Value *V, + ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; /// AddInlineAsmOperands - Add this value to the specified inlineasm node /// operand list. This adds the code marker, matching input operand index @@ -761,9 +768,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, /// Chain/Flag as the input and updates them for the output Chain/Flag. /// If the Flag pointer is NULL, no flag is used. void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, - SDValue &Chain, SDValue *Flag, - const Value *V) const { + SDValue &Chain, SDValue *Flag, const Value *V, + ISD::NodeType PreferredExtendType) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + ISD::NodeType ExtendKind = PreferredExtendType; // Get the list of the values's legal parts. unsigned NumRegs = Regs.size(); @@ -772,8 +780,9 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, EVT ValueVT = ValueVTs[Value]; unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT); MVT RegisterVT = RegVTs[Value]; - ISD::NodeType ExtendKind = - TLI.isZExtFree(Val, RegisterVT)? 
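Several hunks in this diff (Visited.insert(N).second here, Done.insert(BB).second and RegSet.insert(...).second elsewhere) are mechanical fallout of SmallPtrSet::insert returning a std::pair<iterator, bool> rather than a bool. The idiom, shown with std::set, which uses the same return convention:

    #include <set>
    #include <vector>
    #include <cstdio>

    // Visit each node once; insert().second is true only on first
    // insertion, so it doubles as the "have we seen this yet?" test.
    int main() {
      std::vector<int> Worklist = {1, 2, 1, 3, 2};
      std::set<int> Visited;
      for (int N : Worklist)
        if (Visited.insert(N).second)      // false => already visited, skip
          std::printf("visiting %d\n", N); // prints 1, 2, 3 once each
    }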
ISD::ZERO_EXTEND: ISD::ANY_EXTEND; + + if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT)) + ExtendKind = ISD::ZERO_EXTEND; getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part], NumParts, RegisterVT, V, ExtendKind); @@ -860,7 +869,7 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, AA = &aa; GFI = gfi; LibInfo = li; - DL = DAG.getTarget().getDataLayout(); + DL = DAG.getSubtarget().getDataLayout(); Context = DAG.getContext(); LPadToCallSiteMap.clear(); } @@ -879,6 +888,7 @@ void SelectionDAGBuilder::clear() { CurInst = nullptr; HasTailCall = false; SDNodeOrder = LowestSDNodeOrder; + StatepointLowering.clear(); } /// clearDanglingDebugInfo - Clear the dangling debug information @@ -988,15 +998,16 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, DebugLoc dl = DDI.getdl(); unsigned DbgSDNodeOrder = DDI.getSDNodeOrder(); MDNode *Variable = DI->getVariable(); + MDNode *Expr = DI->getExpression(); uint64_t Offset = DI->getOffset(); // A dbg.value for an alloca is always indirect. bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; SDDbgValue *SDV; if (Val.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, Val)) { - SDV = DAG.getDbgValue(Variable, Val.getNode(), - Val.getResNo(), IsIndirect, - Offset, dl, DbgSDNodeOrder); + if (!EmitFuncArgumentDbgValue(V, Variable, Expr, Offset, IsIndirect, + Val)) { + SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(), + IsIndirect, Offset, dl, DbgSDNodeOrder); DAG.AddDbgValue(SDV, Val.getNode(), false); } } else @@ -1018,8 +1029,8 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) { DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V); if (It != FuncInfo.ValueMap.end()) { unsigned InReg = It->second; - RegsForValue RFV(*DAG.getContext(), *TM.getTargetLowering(), - InReg, V->getType()); + RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), InReg, + V->getType()); SDValue Chain = DAG.getEntryNode(); N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); resolveDanglingDebugInfo(V, N); @@ -1050,10 +1061,10 @@ SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { /// getValueImpl - Helper function for getValue and getNonRegisterValue. /// Create an SDValue for the given value. 
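The new PreferredExtendType parameter on getCopyToRegs lets a caller pin the extension kind, while the old "use zext when it is free" heuristic survives as the ANY_EXTEND default, as the hunk above shows. The selection logic, modeled standalone (ExtKind and chooseExtend are illustrative names):

    #include <cstdio>

    enum class ExtKind { Any, Zero, Sign };

    // Mirrors the patched logic: only upgrade ANY_EXTEND to ZERO_EXTEND
    // when the target reports zero-extension as free; an explicit caller
    // preference always wins.
    ExtKind chooseExtend(ExtKind Preferred, bool ZExtIsFree) {
      if (Preferred == ExtKind::Any && ZExtIsFree)
        return ExtKind::Zero;
      return Preferred;
    }

    int main() {
      // Caller forced sign-extension: the heuristic must not override it.
      std::printf("%d\n", (int)chooseExtend(ExtKind::Sign, /*ZExtIsFree=*/true));
      // No preference and zext is free: pick zero-extension.
      std::printf("%d\n", (int)chooseExtend(ExtKind::Any, true));
    }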
SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (const Constant *C = dyn_cast<Constant>(V)) { - EVT VT = TLI->getValueType(V->getType(), true); + EVT VT = TLI.getValueType(V->getType(), true); if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) return DAG.getConstant(*CI, VT); @@ -1063,7 +1074,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa<ConstantPointerNull>(C)) { unsigned AS = V->getType()->getPointerAddressSpace(); - return DAG.getConstant(0, TLI->getPointerTy(AS)); + return DAG.getConstant(0, TLI.getPointerTy(AS)); } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) @@ -1117,7 +1128,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { "Unknown struct or array constant!"); SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, C->getType(), ValueVTs); + ComputeValueVTs(TLI, C->getType(), ValueVTs); unsigned NumElts = ValueVTs.size(); if (NumElts == 0) return SDValue(); // empty struct @@ -1149,7 +1160,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Ops.push_back(getValue(CV->getOperand(i))); } else { assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!"); - EVT EltVT = TLI->getValueType(VecTy->getElementType()); + EVT EltVT = TLI.getValueType(VecTy->getElementType()); SDValue Op; if (EltVT.isFloatingPoint()) @@ -1169,13 +1180,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { DenseMap<const AllocaInst*, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) - return DAG.getFrameIndex(SI->second, TLI->getPointerTy()); + return DAG.getFrameIndex(SI->second, TLI.getPointerTy()); } // If this is an instruction which fast-isel has deferred, select it now. if (const Instruction *Inst = dyn_cast<Instruction>(V)) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); - RegsForValue RFV(*DAG.getContext(), *TLI, InReg, Inst->getType()); + RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType()); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } @@ -1184,7 +1195,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } void SelectionDAGBuilder::visitRet(const ReturnInst &I) { - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getControlRoot(); SmallVector<ISD::OutputArg, 8> Outs; SmallVector<SDValue, 8> OutVals; @@ -1197,7 +1208,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { // Leave Outs empty so that LowerReturn won't try to load return // registers the usual way. 
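The demoted-return path that follows implements sret demotion: when a return value is too large for registers, the caller passes a hidden pointer (the DemoteReg/RetPtr in the code) and the callee stores through it instead of populating Outs. A rough source-level picture of the transformation (big_t and the SRet parameter are illustrative):

    #include <cassert>

    // A return type too large to come back in registers on most ABIs.
    struct big_t { long v[8]; };

    // Source form: return by value.
    big_t makeBig() {
      big_t B{};
      B.v[0] = 42;
      return B;
    }

    // Demoted form the lowering produces: the caller supplies a hidden
    // pointer and the callee's return becomes stores through it.
    void makeBigDemoted(big_t *SRet) {
      big_t B{};
      B.v[0] = 42;
      *SRet = B;
    }

    int main() {
      big_t A = makeBig();
      big_t B;
      makeBigDemoted(&B);
      assert(A.v[0] == B.v[0]); // both forms produce the same value
    }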
SmallVector<EVT, 1> PtrValueVTs; - ComputeValueVTs(*TLI, PointerType::getUnqual(F->getReturnType()), + ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()), PtrValueVTs); SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]); @@ -1205,7 +1216,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets); + ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); SmallVector<SDValue, 4> Chains(NumValues); @@ -1224,28 +1235,33 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { MVT::Other, Chains); } else if (I.getNumOperands() != 0) { SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs); + ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues) { SDValue RetOp = getValue(I.getOperand(0)); - for (unsigned j = 0, f = NumValues; j != f; ++j) { - EVT VT = ValueVTs[j]; - ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + const Function *F = I.getParent()->getParent(); + + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + Attribute::SExt)) + ExtendKind = ISD::SIGN_EXTEND; + else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + Attribute::ZExt)) + ExtendKind = ISD::ZERO_EXTEND; + + LLVMContext &Context = F->getContext(); + bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + Attribute::InReg); - const Function *F = I.getParent()->getParent(); - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::SExt)) - ExtendKind = ISD::SIGN_EXTEND; - else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::ZExt)) - ExtendKind = ISD::ZERO_EXTEND; + for (unsigned j = 0; j != NumValues; ++j) { + EVT VT = ValueVTs[j]; if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) - VT = TLI->getTypeForExtArgOrReturn(VT.getSimpleVT(), ExtendKind); + VT = TLI.getTypeForExtArgOrReturn(Context, VT, ExtendKind); - unsigned NumParts = TLI->getNumRegisters(*DAG.getContext(), VT); - MVT PartVT = TLI->getRegisterType(*DAG.getContext(), VT); + unsigned NumParts = TLI.getNumRegisters(Context, VT); + MVT PartVT = TLI.getRegisterType(Context, VT); SmallVector<SDValue, 4> Parts(NumParts); getCopyToParts(DAG, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + j), @@ -1253,8 +1269,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, - Attribute::InReg)) + if (RetInReg) Flags.setInReg(); // Propagate extension type if any @@ -1275,9 +1290,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); CallingConv::ID CallConv = DAG.getMachineFunction().getFunction()->getCallingConv(); - Chain = TM.getTargetLowering()->LowerReturn(Chain, CallConv, isVarArg, - Outs, OutVals, getCurSDLoc(), - DAG); + Chain = DAG.getTargetLoweringInfo().LowerReturn( + Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG); // Verify that the target's LowerReturn behaved as expected. 
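Hoisting the attribute queries out of the per-value loop above is behavior-preserving: the extension kind depends only on the function's return attributes, not on j. What signext versus zeroext means for a small integer return, as a standalone illustration:

    #include <cstdint>
    #include <cstdio>

    // An i8 result widened to a 32-bit register, the two ways the return
    // attributes allow: signext => SIGN_EXTEND, zeroext => ZERO_EXTEND.
    int32_t widenSigned(int8_t V) { return (int32_t)V; }            // sext
    int32_t widenUnsigned(int8_t V) { return (int32_t)(uint8_t)V; } // zext

    int main() {
      std::printf("%d\n", widenSigned(-1));   // -1  (0xFFFFFFFF)
      std::printf("%d\n", widenUnsigned(-1)); // 255 (0x000000FF)
    }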
assert(Chain.getNode() && Chain.getValueType() == MVT::Other && @@ -1400,7 +1414,7 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); } else { - Condition = ISD::SETEQ; // silence warning. + (void)Condition; // silence warning. llvm_unreachable("Unknown compare instruction"); } @@ -1601,10 +1615,9 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { // jle foo // if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) { - if (!TM.getTargetLowering()->isJumpExpensive() && - BOp->hasOneUse() && - (BOp->getOpcode() == Instruction::And || - BOp->getOpcode() == Instruction::Or)) { + if (!DAG.getTargetLoweringInfo().isJumpExpensive() && + BOp->hasOneUse() && (BOp->getOpcode() == Instruction::And || + BOp->getOpcode() == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, BOp->getOpcode(), getEdgeWeight(BrMBB, Succ0MBB), getEdgeWeight(BrMBB, Succ1MBB)); @@ -1724,7 +1737,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) { // Emit the code for the jump table assert(JT.Reg != -1U && "Should lower JT Header first!"); - EVT PTy = TM.getTargetLowering()->getPointerTy(); + EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(); SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), JT.Reg, PTy); SDValue Table = DAG.getJumpTable(JT.JTI, PTy); @@ -1752,10 +1765,10 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, // can be used as an index into the jump table in a subsequent basic block. // This value may be smaller or larger than the target's pointer type, and // therefore require extension or truncating. - const TargetLowering *TLI = TM.getTargetLowering(); - SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI->getPointerTy()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI.getPointerTy()); - unsigned JumpTableReg = FuncInfo.CreateReg(TLI->getPointerTy()); + unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(), JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; @@ -1763,12 +1776,10 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, // Emit the range check for the jump table, and branch to the default block // for the switch statement if the value being switched on exceeds the largest // case in the switch. - SDValue CMP = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), - Sub.getValueType()), - Sub, - DAG.getConstant(JTH.Last - JTH.First,VT), - ISD::SETUGT); + SDValue CMP = + DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), + Sub.getValueType()), + Sub, DAG.getConstant(JTH.Last - JTH.First, VT), ISD::SETUGT); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. @@ -1799,8 +1810,8 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineBasicBlock *ParentBB) { // First create the loads to the guard/stack slot for the comparison. 
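The jump-table header emitted in the hunks above leans on a classic trick: after subtracting the smallest case value, a single unsigned compare (the SETUGT against Last - First) tests both ends of the range at once, because values below First wrap around to huge unsigned numbers. A standalone model, assuming First <= Last:

    #include <cstdint>
    #include <cstdio>

    // One unsigned comparison covers "x < First || x > Last": after the
    // subtract, any x < First wraps to a value larger than Last - First.
    bool inJumpTableRange(uint32_t X, uint32_t First, uint32_t Last) {
      return (X - First) <= (Last - First); // SETUGT branches to default
    }

    int main() {
      std::printf("%d\n", inJumpTableRange(5, 3, 9));  // 1: inside
      std::printf("%d\n", inJumpTableRange(2, 3, 9));  // 0: wraps, outside
      std::printf("%d\n", inJumpTableRange(10, 3, 9)); // 0: above Last
    }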
- const TargetLowering *TLI = TM.getTargetLowering(); - EVT PtrTy = TLI->getPointerTy(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT PtrTy = TLI.getPointerTy(); MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo(); int FI = MFI->getStackProtectorIndex(); @@ -1810,10 +1821,22 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy); unsigned Align = - TLI->getDataLayout()->getPrefTypeAlignment(IRGuard->getType()); - SDValue Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(), - GuardPtr, MachinePointerInfo(IRGuard, 0), - true, false, false, Align); + TLI.getDataLayout()->getPrefTypeAlignment(IRGuard->getType()); + + SDValue Guard; + + // If GuardReg is set and useLoadStackGuardNode returns true, retrieve the + // guard value from the virtual register holding the value. Otherwise, emit a + // volatile load to retrieve the stack guard value. + unsigned GuardReg = SPD.getGuardReg(); + + if (GuardReg && TLI.useLoadStackGuardNode()) + Guard = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), GuardReg, + PtrTy); + else + Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(), + GuardPtr, MachinePointerInfo(IRGuard, 0), + true, false, false, Align); SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(), StackSlotPtr, @@ -1824,11 +1847,10 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, EVT VT = Guard.getValueType(); SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot); - SDValue Cmp = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), - Sub.getValueType()), - Sub, DAG.getConstant(0, VT), - ISD::SETNE); + SDValue Cmp = + DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), + Sub.getValueType()), + Sub, DAG.getConstant(0, VT), ISD::SETNE); // If the sub is not 0, then we know the guard/stackslot do not equal, so // branch to failure MBB. @@ -1853,10 +1875,10 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, /// StackProtectorDescriptor. void SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { - const TargetLowering *TLI = TM.getTargetLowering(); - SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, - MVT::isVoid, nullptr, 0, false, - getCurSDLoc(), false, false).second; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Chain = + TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, + nullptr, 0, false, getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } @@ -1871,16 +1893,15 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, DAG.getConstant(B.First, VT)); // Check range - const TargetLowering *TLI = TM.getTargetLowering(); - SDValue RangeCmp = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue RangeCmp = + DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), Sub.getValueType()), - Sub, DAG.getConstant(B.Range, VT), - ISD::SETUGT); + Sub, DAG.getConstant(B.Range, VT), ISD::SETUGT); // Determine the type of the test operands. 
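The stack-protector parent block above computes guard minus slot and branches to the failure path on a non-zero result; the new GuardReg path only changes where the guard value comes from (a virtual register filled by the target's stack-guard node versus a volatile load). The comparison itself, modeled standalone:

    #include <cstdint>
    #include <cstdio>

    // Returns true when the stack canary was clobbered. The lowering
    // encodes this as (Guard - Slot) != 0, i.e. a SUB feeding a SETNE.
    bool stackCheckFails(uintptr_t Guard, uintptr_t SlotValue) {
      return (Guard - SlotValue) != 0;
    }

    int main() {
      uintptr_t Canary = 0xdeadbeef;
      std::printf("%d\n", stackCheckFails(Canary, Canary));     // 0: intact
      std::printf("%d\n", stackCheckFails(Canary, Canary ^ 1)); // 1: smashed
    }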
bool UsePtrType = false; - if (!TLI->isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT)) UsePtrType = true; else { for (unsigned i = 0, e = B.Cases.size(); i != e; ++i) @@ -1892,7 +1913,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, } } if (UsePtrType) { - VT = TLI->getPointerTy(); + VT = TLI.getPointerTy(); Sub = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), VT); } @@ -1936,22 +1957,18 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, Reg, VT); SDValue Cmp; unsigned PopCount = CountPopulation_64(B.Mask); - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (PopCount == 1) { // Testing for a single bit; just compare the shift count with what it // would need to be to shift a 1 bit in that position. - Cmp = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), VT), - ShiftOp, - DAG.getConstant(countTrailingZeros(B.Mask), VT), - ISD::SETEQ); + Cmp = DAG.getSetCC( + getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp, + DAG.getConstant(countTrailingZeros(B.Mask), VT), ISD::SETEQ); } else if (PopCount == BB.Range) { // There is only one zero bit in the range, test for it directly. - Cmp = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), VT), - ShiftOp, - DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), - ISD::SETNE); + Cmp = DAG.getSetCC( + getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp, + DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), ISD::SETNE); } else { // Make desired shift SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT, @@ -1961,9 +1978,8 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, SDValue AndOp = DAG.getNode(ISD::AND, getCurSDLoc(), VT, SwitchVal, DAG.getConstant(B.Mask, VT)); Cmp = DAG.getSetCC(getCurSDLoc(), - TLI->getSetCCResultType(*DAG.getContext(), VT), - AndOp, DAG.getConstant(0, VT), - ISD::SETNE); + TLI.getSetCCResultType(*DAG.getContext(), VT), AndOp, + DAG.getConstant(0, VT), ISD::SETNE); } // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight. @@ -2001,8 +2017,17 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { if (isa<InlineAsm>(Callee)) visitInlineAsm(&I); else if (Fn && Fn->isIntrinsic()) { - assert(Fn->getIntrinsicID() == Intrinsic::donothing); - // Ignore invokes to @llvm.donothing: jump directly to the next BB. + switch (Fn->getIntrinsicID()) { + default: + llvm_unreachable("Cannot invoke this intrinsic"); + case Intrinsic::donothing: + // Ignore invokes to @llvm.donothing: jump directly to the next BB. + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + visitPatchpoint(&I, LandingPad); + break; + } } else LowerCallTo(&I, getValue(Callee), false, LandingPad); @@ -2034,26 +2059,30 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother to create these DAG nodes. 
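visitBitTestCase, reformatted in the hunks above, picks among three equivalent tests depending on the mask's population count: a plain equality compare when one bit is set, an inequality against the lone zero bit when all but one are, and otherwise the general shift-and-mask form sketched below (bitTest is an illustrative helper, not LLVM API):

    #include <cstdint>
    #include <cassert>

    // General bit-test lowering: does the (already rebased) switch value
    // X select a bit that is set in Mask?  Cmp = ((1 << X) & Mask) != 0.
    bool bitTest(uint64_t X, uint64_t Mask) {
      return ((UINT64_C(1) << X) & Mask) != 0;
    }

    int main() {
      // Cases {1, 3, 5} sharing one destination become Mask = 0b101010.
      uint64_t Mask = (1u << 1) | (1u << 3) | (1u << 5);
      assert(bitTest(3, Mask));  // case 3 hits the shared destination
      assert(!bitTest(4, Mask)); // 4 falls through to the next test/default
    }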
- const TargetLowering *TLI = TM.getTargetLowering(); - if (TLI->getExceptionPointerRegister() == 0 && - TLI->getExceptionSelectorRegister() == 0) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.getExceptionPointerRegister() == 0 && + TLI.getExceptionSelectorRegister() == 0) return; SmallVector<EVT, 2> ValueVTs; - ComputeValueVTs(*TLI, LP.getType(), ValueVTs); + ComputeValueVTs(TLI, LP.getType(), ValueVTs); assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported"); // Get the two live-in registers as SDValues. The physregs have already been // copied into virtual registers. SDValue Ops[2]; - Ops[0] = DAG.getZExtOrTrunc( - DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), - FuncInfo.ExceptionPointerVirtReg, TLI->getPointerTy()), - getCurSDLoc(), ValueVTs[0]); + if (FuncInfo.ExceptionPointerVirtReg) { + Ops[0] = DAG.getZExtOrTrunc( + DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), + FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()), + getCurSDLoc(), ValueVTs[0]); + } else { + Ops[0] = DAG.getConstant(0, TLI.getPointerTy()); + } Ops[1] = DAG.getZExtOrTrunc( - DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), - FuncInfo.ExceptionSelectorVirtReg, TLI->getPointerTy()), - getCurSDLoc(), ValueVTs[1]); + DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), + FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()), + getCurSDLoc(), ValueVTs[1]); // Merge into one. SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), @@ -2061,6 +2090,27 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { setValue(&LP, Res); } +unsigned +SelectionDAGBuilder::visitLandingPadClauseBB(GlobalValue *ClauseGV, + MachineBasicBlock *LPadBB) { + SDValue Chain = getControlRoot(); + + // Get the typeid that we will dispatch on later. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC); + unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(ClauseGV); + SDValue Sel = DAG.getConstant(TypeID, TLI.getPointerTy()); + Chain = DAG.getCopyToReg(Chain, getCurSDLoc(), VReg, Sel); + + // Branch to the main landing pad block. + MachineBasicBlock *ClauseMBB = FuncInfo.MBB; + ClauseMBB->addSuccessor(LPadBB); + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, Chain, + DAG.getBasicBlock(LPadBB))); + return VReg; +} + /// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for /// small case ranges). 
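A landing pad, as lowered above, materializes exactly two values, the exception pointer and the selector, copied out of fixed registers (or replaced by a null constant when no exception-pointer register is assigned). A purely structural sketch of how the selector drives clause dispatch (LandingPadValues and the type-id numbers are illustrative, not the real EH ABI):

    #include <cstdint>
    #include <cstdio>

    // The two-valued result the MERGE_VALUES node above packages up.
    struct LandingPadValues {
      void *ExceptionPtr; // from the exception-pointer register, or null
      int32_t Selector;   // type id the dispatch code compares against
    };

    // Models the clause path: each catch clause owns a type id, and the
    // selector decides whether its handler runs or unwinding resumes.
    const char *dispatch(const LandingPadValues &LP, int32_t CatchTypeID) {
      return LP.Selector == CatchTypeID ? "run handler" : "resume unwinding";
    }

    int main() {
      LandingPadValues LP{nullptr, 7};
      std::printf("%s\n", dispatch(LP, 7)); // run handler
      std::printf("%s\n", dispatch(LP, 3)); // resume unwinding
    }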
bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, @@ -2218,9 +2268,8 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, } static inline bool areJTsAllowed(const TargetLowering &TLI) { - return TLI.supportJumpTables() && - (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || - TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other)); + return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || + TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other); } static APInt ComputeRange(const APInt &First, const APInt &Last) { @@ -2245,8 +2294,8 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) TSize += I->size(); - const TargetLowering *TLI = TM.getTargetLowering(); - if (!areJTsAllowed(*TLI) || TSize.ult(TLI->getMinimumJumpTableEntries())) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries())) return false; APInt Range = ComputeRange(First, Last); @@ -2327,7 +2376,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, } // Create a jump table index for this jump table. - unsigned JTEncoding = TLI->getJumpTableEncoding(); + unsigned JTEncoding = TLI.getJumpTableEncoding(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding) ->createJumpTableIndex(DestBBs); @@ -2347,7 +2396,6 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR, bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, CaseRecVector& WorkList, const Value* SV, - MachineBasicBlock* Default, MachineBasicBlock* SwitchBB) { // Get the MachineFunction which holds the current MBB. This is used when // inserting any additional MBBs necessary to represent the switch. @@ -2413,8 +2461,8 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, RSize -= J->size(); } - const TargetLowering *TLI = TM.getTargetLowering(); - if (areJTsAllowed(*TLI)) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (areJTsAllowed(TLI)) { // If our case is dense we *really* should handle it earlier! assert((FMetric > 0) && "Should handle dense range earlier!"); } else { @@ -2484,8 +2532,8 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR, const Value* SV, MachineBasicBlock* Default, MachineBasicBlock* SwitchBB) { - const TargetLowering *TLI = TM.getTargetLowering(); - EVT PTy = TLI->getPointerTy(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT PTy = TLI.getPointerTy(); unsigned IntPtrBits = PTy.getSizeInBits(); Case& FrontCase = *CR.Range.first; @@ -2496,19 +2544,18 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR, MachineFunction *CurMF = FuncInfo.MF; // If target does not have legal shift left, do not emit bit tests at all. - if (!TLI->isOperationLegal(ISD::SHL, PTy)) + if (!TLI.isOperationLegal(ISD::SHL, PTy)) return false; size_t numCmps = 0; - for (CaseItr I = CR.Range.first, E = CR.Range.second; - I!=E; ++I) { + for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { // Single case counts one, case range - two. numCmps += (I->Low == I->High ? 
1 : 2); } // Count unique destinations SmallSet<MachineBasicBlock*, 4> Dests; - for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) { + for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) { Dests.insert(I->BB); if (Dests.size() > 3) // Don't bother the code below, if there are too much unique destinations @@ -2601,25 +2648,22 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR, BitTestBlock BTB(lowBound, cmpRange, SV, -1U, MVT::Other, (CR.CaseBB == SwitchBB), - CR.CaseBB, Default, BTC); + CR.CaseBB, Default, std::move(BTC)); if (CR.CaseBB == SwitchBB) visitBitTestHeader(BTB, SwitchBB); - BitTestCases.push_back(BTB); + BitTestCases.push_back(std::move(BTB)); return true; } /// Clusterify - Transform simple list of Cases into list of CaseRange's -size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, - const SwitchInst& SI) { - size_t numCmps = 0; - +void SelectionDAGBuilder::Clusterify(CaseVector& Cases, + const SwitchInst& SI) { BranchProbabilityInfo *BPI = FuncInfo.BPI; - // Start with "simple" cases - for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); - i != e; ++i) { + // Start with "simple" cases. + for (SwitchInst::ConstCaseIt i : SI.cases()) { const BasicBlock *SuccBB = i.getCaseSuccessor(); MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB]; @@ -2653,13 +2697,15 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, } } - for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) { - if (I->Low != I->High) - // A range counts double, since it requires two compares. - ++numCmps; - } + DEBUG({ + size_t numCmps = 0; + for (auto &I : Cases) + // A range counts double, since it requires two compares. + numCmps += I.Low != I.High ? 2 : 1; - return numCmps; + dbgs() << "Clusterify finished. Total clusters: " << Cases.size() + << ". Total compares: " << numCmps << '\n'; + }); } void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First, @@ -2680,35 +2726,58 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = nullptr; + if (SwitchMBB + 1 != FuncInfo.MF->end()) + NextBlock = SwitchMBB + 1; + + + // Create a vector of Cases, sorted so that we can efficiently create a binary + // search tree from them. + CaseVector Cases; + Clusterify(Cases, SI); + + // Get the default destination MBB. MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()]; - // If there is only the default destination, branch to it if it is not the - // next basic block. Otherwise, just fall through. - if (!SI.getNumCases()) { - // Update machine-CFG edges. + if (isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg()) && + !Cases.empty()) { + // Replace an unreachable default destination with the most popular case + // destination. + DenseMap<const BasicBlock *, unsigned> Popularity; + unsigned MaxPop = 0; + const BasicBlock *MaxBB = nullptr; + for (auto I : SI.cases()) { + const BasicBlock *BB = I.getCaseSuccessor(); + if (++Popularity[BB] > MaxPop) { + MaxPop = Popularity[BB]; + MaxBB = BB; + } + } - // If this is not a fall-through branch, emit the branch. + // Set new default. + assert(MaxPop > 0); + assert(MaxBB); + Default = FuncInfo.MBBMap[MaxBB]; + + // Remove cases that were pointing to the destination that is now the default. 
+ Cases.erase(std::remove_if(Cases.begin(), Cases.end(), + [&](const Case &C) { return C.BB == Default; }), + Cases.end()); + } + + // If there is only the default destination, go there directly. + if (Cases.empty()) { + // Update machine-CFG edges. SwitchMBB->addSuccessor(Default); - if (Default != NextBlock) - DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), - MVT::Other, getControlRoot(), - DAG.getBasicBlock(Default))); + // If this is not a fall-through branch, emit the branch. + if (Default != NextBlock) { + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(Default))); + } return; } - // If there are any non-default case statements, create a vector of Cases - // representing each one, and sort the vector so that we can efficiently - // create a binary search tree from them. - CaseVector Cases; - size_t numCmps = Clusterify(Cases, SI); - DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() - << ". Total compares: " << numCmps << '\n'); - (void)numCmps; - - // Get the Value to be switched on and default basic blocks, which will be - // inserted into CaseBlock records, representing basic blocks in the binary - // search tree. + // Get the Value to be switched on. const Value *SV = SI.getCondition(); // Push the initial CaseRec onto the worklist @@ -2738,7 +2807,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { // Emit binary tree. We need to pick a pivot, and push left and right ranges // onto the worklist. Leafs are handled via handleSmallSwitchRange() call. - handleBTSplitSwitchCase(CR, WorkList, SV, Default, SwitchMBB); + handleBTSplitSwitchCase(CR, WorkList, SV, SwitchMBB); } } @@ -2749,7 +2818,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { SmallSet<BasicBlock*, 32> Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); - bool Inserted = Done.insert(BB); + bool Inserted = Done.insert(BB).second; if (!Inserted) continue; @@ -2806,7 +2875,8 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); - EVT ShiftTy = TM.getTargetLowering()->getShiftAmountTy(Op2.getValueType()); + EVT ShiftTy = + DAG.getTargetLoweringInfo().getShiftAmountTy(Op2.getValueType()); // Coerce the shift amount to the right type if we can. 
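The unreachable-default rewrite in the hunks above picks the most frequent case destination in a single pass, then erases the cases that now match the default. A standalone model of that selection (pickPopularDefault is an illustrative name):

    #include <map>
    #include <string>
    #include <vector>
    #include <cstdio>

    // One pass, as in the patch: bump each destination's count and track
    // the running maximum, so no second scan over the map is needed.
    std::string pickPopularDefault(const std::vector<std::string> &CaseDests) {
      std::map<std::string, unsigned> Popularity;
      unsigned MaxPop = 0;
      std::string MaxBB;
      for (const auto &BB : CaseDests) {
        if (++Popularity[BB] > MaxPop) {
          MaxPop = Popularity[BB];
          MaxBB = BB;
        }
      }
      return MaxBB; // becomes the new default; its cases are then erased
    }

    int main() {
      std::printf("%s\n",
                  pickPopularDefault({"bb1", "bb2", "bb2", "bb3"}).c_str());
      // prints: bb2
    }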
if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) { @@ -2861,8 +2931,8 @@ void SelectionDAGBuilder::visitSDiv(const User &I) { if (isa<BinaryOperator>(&I) && cast<BinaryOperator>(&I)->isExact() && !isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) && !cast<ConstantSDNode>(Op2)->isNullValue()) - setValue(&I, TM.getTargetLowering()->BuildExactSDIV(Op1, Op2, - getCurSDLoc(), DAG)); + setValue(&I, DAG.getTargetLoweringInfo() + .BuildExactSDIV(Op1, Op2, getCurSDLoc(), DAG)); else setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1, Op2)); @@ -2878,7 +2948,7 @@ void SelectionDAGBuilder::visitICmp(const User &I) { SDValue Op2 = getValue(I.getOperand(1)); ISD::CondCode Opcode = getICmpCondCode(predicate); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode)); } @@ -2893,13 +2963,13 @@ void SelectionDAGBuilder::visitFCmp(const User &I) { ISD::CondCode Condition = getFCmpCondCode(predicate); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition)); } void SelectionDAGBuilder::visitSelect(const User &I) { SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TM.getTargetLowering(), I.getType(), ValueVTs); + ComputeValueVTs(DAG.getTargetLoweringInfo(), I.getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -2926,7 +2996,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) { void SelectionDAGBuilder::visitTrunc(const User &I) { // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest). SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N)); } @@ -2934,7 +3004,7 @@ void SelectionDAGBuilder::visitZExt(const User &I) { // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // ZExt also can't be a cast to bool for same reason. So, nothing much to do SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N)); } @@ -2942,52 +3012,51 @@ void SelectionDAGBuilder::visitSExt(const User &I) { // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest). // SExt also can't be a cast to bool for same reason. 
So, nothing much to do SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - const TargetLowering *TLI = TM.getTargetLowering(); - EVT DestVT = TLI->getValueType(I.getType()); - setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(), - DestVT, N, - DAG.getTargetConstant(0, TLI->getPointerTy()))); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT DestVT = TLI.getValueType(I.getType()); + setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(), DestVT, N, + DAG.getTargetConstant(0, TLI.getPointerTy()))); } void SelectionDAGBuilder::visitFPExt(const User &I) { // FPExt is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToUI(const User &I) { // FPToUI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitFPToSI(const User &I) { // FPToSI is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitUIToFP(const User &I) { // UIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N)); } void SelectionDAGBuilder::visitSIToFP(const User &I) { // SIToFP is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N)); } @@ -2995,7 +3064,7 @@ void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } @@ -3003,13 +3072,13 @@ void SelectionDAGBuilder::visitIntToPtr(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. 
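The ptrtoint/inttoptr visitors that follow reduce to width adaptation, which is exactly what DAG.getZExtOrTrunc encodes: truncate when the source is wider than the destination, zero-extend when narrower, no-op when equal. A scalar model of that helper (zextOrTrunc here is an illustrative stand-in, assuming widths up to 64 bits):

    #include <cstdint>
    #include <cstdio>

    // Adapt Value from FromBits wide to ToBits wide, zero-extending or
    // truncating as needed; both directions are masks on a uint64_t.
    uint64_t zextOrTrunc(uint64_t Value, unsigned FromBits, unsigned ToBits) {
      if (FromBits < 64)
        Value &= (UINT64_C(1) << FromBits) - 1; // keep only the source bits
      if (ToBits < 64)
        Value &= (UINT64_C(1) << ToBits) - 1;   // truncation; zext is a no-op
      return Value;
    }

    int main() {
      std::printf("%llu\n", (unsigned long long)zextOrTrunc(0x1FF, 16, 8)); // 255
      std::printf("%llu\n", (unsigned long long)zextOrTrunc(0xFF, 8, 32));  // 255
    }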
SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT)); } void SelectionDAGBuilder::visitBitCast(const User &I) { SDValue N = getValue(I.getOperand(0)); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType()); // BitCast assures us that source and destination are the same size so this is // either a BITCAST or a no-op. @@ -3031,7 +3100,7 @@ void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const Value *SV = I.getOperand(0); SDValue N = getValue(SV); - EVT DestVT = TM.getTargetLowering()->getValueType(I.getType()); + EVT DestVT = TLI.getValueType(I.getType()); unsigned SrcAS = SV->getType()->getPointerAddressSpace(); unsigned DestAS = I.getType()->getPointerAddressSpace(); @@ -3049,8 +3118,7 @@ void SelectionDAGBuilder::visitInsertElement(const User &I) { SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(), TLI.getVectorIdxTy()); setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(), - TM.getTargetLowering()->getValueType(I.getType()), - InVec, InVal, InIdx)); + TLI.getValueType(I.getType()), InVec, InVal, InIdx)); } void SelectionDAGBuilder::visitExtractElement(const User &I) { @@ -3059,8 +3127,7 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) { SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(), TLI.getVectorIdxTy()); setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(), - TM.getTargetLowering()->getValueType(I.getType()), - InVec, InIdx)); + TLI.getValueType(I.getType()), InVec, InIdx)); } // Utility for visitShuffleVector - Return true if every element in Mask, @@ -3082,8 +3149,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask); unsigned MaskNumElts = Mask.size(); - const TargetLowering *TLI = TM.getTargetLowering(); - EVT VT = TLI->getValueType(I.getType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); EVT SrcVT = Src1.getValueType(); unsigned SrcNumElts = SrcVT.getVectorNumElements(); @@ -3202,9 +3269,9 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { if (RangeUse[Input] == 0) Src = DAG.getUNDEF(VT); else - Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT, - Src, DAG.getConstant(StartIdx[Input], - TLI->getVectorIdxTy())); + Src = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT, Src, + DAG.getConstant(StartIdx[Input], TLI.getVectorIdxTy())); } // Calculate new mask. @@ -3230,7 +3297,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { // replacing the shuffle with extract and build vector. // to insert and build vector. 
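When visitShuffleVector cannot match the mask to subvector extraction, the fallback that follows scalarizes it: each mask entry selects one element from either source (or undef for -1), and the results are rebuilt into a vector. In plain C++ terms (illustrative, fixed to int elements):

    #include <vector>
    #include <cstdio>

    // Scalarized shuffle: Mask[i] in [0, N) picks from Src1, [N, 2N)
    // picks from Src2 (minus N), and -1 yields an undef element.
    std::vector<int> shuffle(const std::vector<int> &Src1,
                             const std::vector<int> &Src2,
                             const std::vector<int> &Mask) {
      const int N = (int)Src1.size();
      std::vector<int> Out;
      for (int Idx : Mask) {
        if (Idx < 0)
          Out.push_back(0);             // undef: any value is allowed
        else if (Idx < N)
          Out.push_back(Src1[Idx]);     // element from the first source
        else
          Out.push_back(Src2[Idx - N]); // element from the second source
      }
      return Out;
    }

    int main() {
      auto R = shuffle({1, 2, 3, 4}, {5, 6, 7, 8}, {0, 4, 1, 5});
      for (int V : R)
        std::printf("%d ", V);          // prints: 1 5 2 6
      std::printf("\n");
    }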
EVT EltVT = VT.getVectorElementType(); - EVT IdxVT = TLI->getVectorIdxTy(); + EVT IdxVT = TLI.getVectorIdxTy(); SmallVector<SDValue,8> Ops; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; @@ -3262,16 +3329,22 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 4> AggValueVTs; - ComputeValueVTs(*TLI, AggTy, AggValueVTs); + ComputeValueVTs(TLI, AggTy, AggValueVTs); SmallVector<EVT, 4> ValValueVTs; - ComputeValueVTs(*TLI, ValTy, ValValueVTs); + ComputeValueVTs(TLI, ValTy, ValValueVTs); unsigned NumAggValues = AggValueVTs.size(); unsigned NumValValues = ValValueVTs.size(); SmallVector<SDValue, 4> Values(NumAggValues); + // Ignore an insertvalue that produces an empty object + if (!NumAggValues) { + setValue(&I, DAG.getUNDEF(MVT(MVT::Other))); + return; + } + SDValue Agg = getValue(Op0); unsigned i = 0; // Copy the beginning value(s) from the original aggregate. @@ -3302,9 +3375,9 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices()); - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 4> ValValueVTs; - ComputeValueVTs(*TLI, ValTy, ValValueVTs); + ComputeValueVTs(TLI, ValTy, ValValueVTs); unsigned NumValValues = ValValueVTs.size(); @@ -3353,13 +3426,13 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { Ty = cast<SequentialType>(Ty)->getElementType(); // If this is a constant subscript, handle it quickly. - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; uint64_t Offs = DL->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); SDValue OffsVal; - EVT PTy = TLI->getPointerTy(AS); + EVT PTy = TLI.getPointerTy(AS); unsigned PtrBits = PTy.getSizeInBits(); if (PtrBits < 64) OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy, @@ -3373,8 +3446,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { } // N = N + Idx * ElementSize; - APInt ElementSize = APInt(TLI->getPointerSizeInBits(AS), - DL->getTypeAllocSize(Ty)); + APInt ElementSize = + APInt(TLI.getPointerSizeInBits(AS), DL->getTypeAllocSize(Ty)); SDValue IdxN = getValue(Idx); // If the index is smaller or larger than intptr_t, truncate or extend @@ -3411,15 +3484,15 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { return; // getValue will auto-populate this. Type *Ty = I.getAllocatedType(); - const TargetLowering *TLI = TM.getTargetLowering(); - uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty); unsigned Align = - std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), - I.getAlignment()); + std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty), + I.getAlignment()); SDValue AllocSize = getValue(I.getArraySize()); - EVT IntPtr = TLI->getPointerTy(); + EVT IntPtr = TLI.getPointerTy(); if (AllocSize.getValueType() != IntPtr) AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurSDLoc(), IntPtr); @@ -3430,7 +3503,8 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // Handle alignment. 
If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to // the stack alignment, we note this in the DYNAMIC_STACKALLOC node. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + DAG.getSubtarget().getFrameLowering()->getStackAlignment(); if (Align <= StackAlign) Align = 0; @@ -3464,15 +3538,18 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); bool isVolatile = I.isVolatile(); - bool isNonTemporal = I.getMetadata("nontemporal") != nullptr; - bool isInvariant = I.getMetadata("invariant.load") != nullptr; + bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; + bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr; unsigned Alignment = I.getAlignment(); - const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TM.getTargetLowering(), Ty, ValueVTs, &Offsets); + ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -3483,7 +3560,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { // Serialize volatile loads with other side effects. Root = getRoot(); else if (AA->pointsToConstantMemory( - AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), TBAAInfo))) { + AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; @@ -3492,9 +3569,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Root = DAG.getRoot(); } - const TargetLowering *TLI = TM.getTargetLowering(); if (isVolatile) - Root = TLI->prepareVolatileOrAtomicLoad(Root, getCurSDLoc(), DAG); + Root = TLI.prepareVolatileOrAtomicLoad(Root, getCurSDLoc(), DAG); SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(std::min(unsigned(MaxParallelChains), @@ -3520,7 +3596,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { DAG.getConstant(Offsets[i], PtrVT)); SDValue L = DAG.getLoad(ValueVTs[i], getCurSDLoc(), Root, A, MachinePointerInfo(SV, Offsets[i]), isVolatile, - isNonTemporal, isInvariant, Alignment, TBAAInfo, + isNonTemporal, isInvariant, Alignment, AAInfo, Ranges); Values[i] = L; @@ -3549,7 +3625,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(*TM.getTargetLowering(), SrcV->getType(), ValueVTs, &Offsets); + ComputeValueVTs(DAG.getTargetLoweringInfo(), SrcV->getType(), + ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -3565,9 +3642,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { NumValues)); EVT PtrVT = Ptr.getValueType(); bool isVolatile = I.isVolatile(); - bool isNonTemporal = I.getMetadata("nontemporal") != nullptr; + bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr; unsigned Alignment = I.getAlignment(); - const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { @@ -3583,7 +3662,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { SDValue St = 
DAG.getStore(Root, getCurSDLoc(), SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), - isVolatile, isNonTemporal, Alignment, TBAAInfo); + isVolatile, isNonTemporal, Alignment, AAInfo); Chains[ChainI] = St; } @@ -3592,28 +3671,70 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { DAG.setRoot(StoreNode); } -static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order, - SynchronizationScope Scope, - bool Before, SDLoc dl, - SelectionDAG &DAG, - const TargetLowering &TLI) { - // Fence, if necessary - if (Before) { - if (Order == AcquireRelease || Order == SequentiallyConsistent) - Order = Release; - else if (Order == Acquire || Order == Monotonic || Order == Unordered) - return Chain; - } else { - if (Order == AcquireRelease) - Order = Acquire; - else if (Order == Release || Order == Monotonic || Order == Unordered) - return Chain; +void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + // llvm.masked.store.*(Src0, Ptr, alignment, Mask) + Value *PtrOperand = I.getArgOperand(1); + SDValue Ptr = getValue(PtrOperand); + SDValue Src0 = getValue(I.getArgOperand(0)); + SDValue Mask = getValue(I.getArgOperand(3)); + EVT VT = Src0.getValueType(); + unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + + MachineMemOperand *MMO = + DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(PtrOperand), + MachineMemOperand::MOStore, VT.getStoreSize(), + Alignment, AAInfo); + SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO); + DAG.setRoot(StoreNode); + setValue(&I, StoreNode); +} + +void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + // @llvm.masked.load.*(Ptr, alignment, Mask, Src0) + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + SDValue Src0 = getValue(I.getArgOperand(3)); + SDValue Mask = getValue(I.getArgOperand(2)); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); + unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + + SDValue InChain = DAG.getRoot(); + if (AA->pointsToConstantMemory( + AliasAnalysis::Location(PtrOperand, + AA->getTypeStoreSize(I.getType()), + AAInfo))) { + // Do not serialize (non-volatile) loads of constant memory with anything. + InChain = DAG.getEntryNode(); } - SDValue Ops[3]; - Ops[0] = Chain; - Ops[1] = DAG.getConstant(Order, TLI.getPointerTy()); - Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy()); - return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops) + + MachineMemOperand *MMO = + DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(PtrOperand), + MachineMemOperand::MOLoad, VT.getStoreSize(), + Alignment, AAInfo, Ranges); + + SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO); + SDValue OutChain = Load.getValue(1); + DAG.setRoot(OutChain); + setValue(&I, Load); } void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { @@ -3624,27 +3745,16 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDValue InChain = getRoot(); - const TargetLowering *TLI = TM.getTargetLowering(); - if (TLI->getInsertFencesForAtomic()) - InChain = InsertFenceForAtomic(InChain, SuccessOrder, Scope, true, dl, - DAG, *TLI); - MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType(); SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other); SDValue L = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain, getValue(I.getPointerOperand()), getValue(I.getCompareOperand()), getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()), - 0 /* Alignment */, - TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder, - TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder, Scope); + /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope); SDValue OutChain = L.getValue(2); - if (TLI->getInsertFencesForAtomic()) - OutChain = InsertFenceForAtomic(OutChain, SuccessOrder, Scope, false, dl, - DAG, *TLI); - setValue(&I, L); DAG.setRoot(OutChain); } @@ -3671,38 +3781,28 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) { SDValue InChain = getRoot(); - const TargetLowering *TLI = TM.getTargetLowering(); - if (TLI->getInsertFencesForAtomic()) - InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl, - DAG, *TLI); - SDValue L = DAG.getAtomic(NT, dl, getValue(I.getValOperand()).getSimpleValueType(), InChain, getValue(I.getPointerOperand()), getValue(I.getValOperand()), - I.getPointerOperand(), 0 /* Alignment */, - TLI->getInsertFencesForAtomic() ? Monotonic : Order, - Scope); + I.getPointerOperand(), + /* Alignment=*/ 0, Order, Scope); SDValue OutChain = L.getValue(1); - if (TLI->getInsertFencesForAtomic()) - OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl, - DAG, *TLI); - setValue(&I, L); DAG.setRoot(OutChain); } void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDLoc dl = getCurSDLoc(); - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Ops[3]; Ops[0] = getRoot(); - Ops[1] = DAG.getConstant(I.getOrdering(), TLI->getPointerTy()); - Ops[2] = DAG.getConstant(I.getSynchScope(), TLI->getPointerTy()); + Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy()); + Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy()); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } @@ -3713,8 +3813,8 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { SDValue InChain = getRoot(); - const TargetLowering *TLI = TM.getTargetLowering(); - EVT VT = TLI->getValueType(I.getType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic load"); @@ -3728,19 +3828,14 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { I.getAlignment() ? 
I.getAlignment() : DAG.getEVTAlignment(VT)); - InChain = TLI->prepareVolatileOrAtomicLoad(InChain, dl, DAG); + InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG); SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain, getValue(I.getPointerOperand()), MMO, - TLI->getInsertFencesForAtomic() ? Monotonic : Order, - Scope); + Order, Scope); SDValue OutChain = L.getValue(1); - if (TLI->getInsertFencesForAtomic()) - OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl, - DAG, *TLI); - setValue(&I, L); DAG.setRoot(OutChain); } @@ -3753,28 +3848,19 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDValue InChain = getRoot(); - const TargetLowering *TLI = TM.getTargetLowering(); - EVT VT = TLI->getValueType(I.getValueOperand()->getType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getValueOperand()->getType()); if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); - if (TLI->getInsertFencesForAtomic()) - InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl, - DAG, *TLI); - SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT, InChain, getValue(I.getPointerOperand()), getValue(I.getValueOperand()), I.getPointerOperand(), I.getAlignment(), - TLI->getInsertFencesForAtomic() ? Monotonic : Order, - Scope); - - if (TLI->getInsertFencesForAtomic()) - OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl, - DAG, *TLI); + Order, Scope); DAG.setRoot(OutChain); } @@ -3799,13 +3885,13 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Info is set by getTgtMemIntrinsic TargetLowering::IntrinsicInfo Info; - const TargetLowering *TLI = TM.getTargetLowering(); - bool IsTgtIntrinsic = TLI->getTgtMemIntrinsic(Info, I, Intrinsic); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic); // Add the intrinsic ID as an integer operand if it's not a target intrinsic. if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID || Info.opc == ISD::INTRINSIC_W_CHAIN) - Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI->getPointerTy())); + Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI.getPointerTy())); // Add all operands of the call to the operand list.
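The visitMaskedStore/visitMaskedLoad visitors added above lower @llvm.masked.store and @llvm.masked.load straight to dedicated DAG nodes backed by a MachineMemOperand. As a rough scalar model of the per-lane semantics those nodes carry (plain C++, not the DAG API; maskedStore and maskedLoad are illustrative names):

#include <array>
#include <cstddef>

// Toy model of llvm.masked.store: lanes with a false mask bit must not
// touch memory at all (this matters for trap safety, not just values).
template <typename T, std::size_t N>
void maskedStore(const std::array<T, N> &Src0, T *Ptr,
                 const std::array<bool, N> &Mask) {
  for (std::size_t I = 0; I != N; ++I)
    if (Mask[I])
      Ptr[I] = Src0[I];
}

// Toy model of llvm.masked.load: disabled lanes yield the pass-through
// value Src0 instead of reading from memory.
template <typename T, std::size_t N>
std::array<T, N> maskedLoad(const T *Ptr, const std::array<bool, N> &Mask,
                            const std::array<T, N> &Src0) {
  std::array<T, N> Result = Src0;
  for (std::size_t I = 0; I != N; ++I)
    if (Mask[I])
      Result[I] = Ptr[I];
  return Result;
}

Note the operand orders the visitors decode: the store intrinsic takes (Src0, Ptr, alignment, Mask) and the load takes (Ptr, alignment, Mask, Src0).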
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { @@ -3814,7 +3900,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(*TLI, I.getType(), ValueVTs); + ComputeValueVTs(TLI, I.getType(), ValueVTs); if (HasChain) ValueVTs.push_back(MVT::Other); @@ -3829,7 +3915,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, VTs, Ops, Info.memVT, MachinePointerInfo(Info.ptrVal, Info.offset), Info.align, Info.vol, - Info.readMem, Info.writeMem); + Info.readMem, Info.writeMem, Info.size); } else if (!HasChain) { Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { @@ -3848,7 +3934,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, if (!I.getType()->isVoidTy()) { if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) { - EVT VT = TLI->getValueType(PTy); + EVT VT = TLI.getValueType(PTy); Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result); } @@ -4555,16 +4641,17 @@ static unsigned getTruncatedArgReg(const SDValue &N) { /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function /// argument, create the corresponding DBG_VALUE machine instruction for it now. /// At the end of instruction selection, they will be inserted to the entry BB. -bool -SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, - int64_t Offset, bool IsIndirect, - const SDValue &N) { +bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, + MDNode *Variable, + MDNode *Expr, int64_t Offset, + bool IsIndirect, + const SDValue &N) { const Argument *Arg = dyn_cast<Argument>(V); if (!Arg) return false; MachineFunction &MF = DAG.getMachineFunction(); - const TargetInstrInfo *TII = DAG.getTarget().getInstrInfo(); + const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); // Ignore inlined function arguments here. DIVariable DV(Variable); @@ -4610,14 +4697,16 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, return false; if (Op->isReg()) - FuncInfo.ArgDbgValues.push_back(BuildMI(MF, getCurDebugLoc(), - TII->get(TargetOpcode::DBG_VALUE), - IsIndirect, - Op->getReg(), Offset, Variable)); + FuncInfo.ArgDbgValues.push_back( + BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE), + IsIndirect, Op->getReg(), Offset, Variable, Expr)); else FuncInfo.ArgDbgValues.push_back( - BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE)) - .addOperand(*Op).addImm(Offset).addMetadata(Variable)); + BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE)) + .addOperand(*Op) + .addImm(Offset) + .addMetadata(Variable) + .addMetadata(Expr)); return true; } @@ -4635,7 +4724,7 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, /// otherwise lower it and return null. 
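The atomic visitors in the hunks above no longer special-case targets that set getInsertFencesForAtomic(): the cmpxchg/rmw/load/store nodes now receive their real orderings, the accesses are no longer downgraded to Monotonic, and the InsertFenceForAtomic helper is deleted (fence placement is now the target's own job, e.g. during atomic expansion). For reference, a sketch of the ordering policy the deleted helper encoded, reconstructed from the removed lines:

#include <optional>

enum AtomicOrdering { Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

// Ordering of the ATOMIC_FENCE the old helper emitted before
// (Before == true) or after an atomic access; nullopt means no fence.
std::optional<AtomicOrdering> fenceOrdering(AtomicOrdering Order, bool Before) {
  if (Before) {
    if (Order == AcquireRelease || Order == SequentiallyConsistent)
      return Release;
    if (Order == Acquire || Order == Monotonic || Order == Unordered)
      return std::nullopt;
  } else {
    if (Order == AcquireRelease)
      return Acquire;
    if (Order == Release || Order == Monotonic || Order == Unordered)
      return std::nullopt;
  }
  return Order; // e.g. a seq_cst fence after a seq_cst access
}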
const char * SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc sdl = getCurSDLoc(); DebugLoc dl = getCurDebugLoc(); SDValue Res; @@ -4649,17 +4738,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::vaend: visitVAEnd(I); return nullptr; case Intrinsic::vacopy: visitVACopy(I); return nullptr; case Intrinsic::returnaddress: - setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI->getPointerTy(), + setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::frameaddress: - setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI->getPointerTy(), + setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; case Intrinsic::read_register: { Value *Reg = I.getArgOperand(0); - SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg)); - EVT VT = TM.getTargetLowering()->getValueType(I.getType()); + SDValue RegName = + DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata())); + EVT VT = TLI.getValueType(I.getType()); setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName)); return nullptr; } @@ -4667,15 +4757,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Value *Reg = I.getArgOperand(0); Value *RegValue = I.getArgOperand(1); SDValue Chain = getValue(RegValue).getOperand(0); - SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg)); + SDValue RegName = + DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata())); DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain, RegName, getValue(RegValue))); return nullptr; } case Intrinsic::setjmp: - return &"_setjmp"[!TLI->usesUnderscoreSetJmp()]; + return &"_setjmp"[!TLI.usesUnderscoreSetJmp()]; case Intrinsic::longjmp: - return &"_longjmp"[!TLI->usesUnderscoreLongJmp()]; + return &"_longjmp"[!TLI.usesUnderscoreLongJmp()]; case Intrinsic::memcpy: { // Assert for address < 256 since we support only user defined address // spaces. @@ -4736,6 +4827,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast<DbgDeclareInst>(I); MDNode *Variable = DI.getVariable(); + MDNode *Expression = DI.getExpression(); const Value *Address = DI.getAddress(); DIVariable DIVar(Variable); assert((!DIVar || DIVar.isVariable()) && @@ -4771,16 +4863,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode()); if (FINode) // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue(Variable, FINode->getIndex(), - 0, dl, SDNodeOrder); + SDV = DAG.getFrameIndexDbgValue( + Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); else { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, 0, false, N); + EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false, N); return nullptr; } } else if (AI) - SDV = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(), + SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); else { // Can't do anything with other non-AI cases yet. 
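One line above is worth a gloss: `return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];` picks between the prefixed and unprefixed libcall name by indexing into a single string literal. A standalone demonstration of the idiom:

#include <cassert>
#include <cstring>

// &"_setjmp"[0] points at "_setjmp"; &"_setjmp"[1] points at "setjmp".
// The bool-to-int conversion selects the offset, skipping the underscore.
const char *setjmpName(bool UsesUnderscore) {
  return &"_setjmp"[!UsesUnderscore];
}

int main() {
  assert(std::strcmp(setjmpName(true), "_setjmp") == 0);
  assert(std::strcmp(setjmpName(false), "setjmp") == 0);
}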
@@ -4793,7 +4885,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - if (!EmitFuncArgumentDbgValue(Address, Variable, 0, false, N)) { + if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false, + N)) { // If variable is pinned by an alloca in dominating bb then // use StaticAllocaMap. if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) { @@ -4801,7 +4894,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DenseMap<const AllocaInst*, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getFrameIndexDbgValue(Variable, SI->second, + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, 0, dl, SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); return nullptr; @@ -4822,6 +4915,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; MDNode *Variable = DI.getVariable(); + MDNode *Expression = DI.getExpression(); uint64_t Offset = DI.getOffset(); const Value *V = DI.getValue(); if (!V) @@ -4829,7 +4923,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDDbgValue *SDV; if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) { - SDV = DAG.getConstantDbgValue(Variable, V, Offset, dl, SDNodeOrder); + SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl, + SDNodeOrder); DAG.AddDbgValue(SDV, nullptr, false); } else { // Do not use getValue() in here; we don't want to generate code at @@ -4841,10 +4936,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { if (N.getNode()) { // A dbg.value for an alloca is always indirect. bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; - if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, N)) { - SDV = DAG.getDbgValue(Variable, N.getNode(), - N.getResNo(), IsIndirect, - Offset, dl, SDNodeOrder); + if (!EmitFuncArgumentDbgValue(V, Variable, Expression, Offset, + IsIndirect, N)) { + SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), + IsIndirect, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } } else if (!V->use_empty()) { @@ -4878,7 +4973,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::eh_typeid_for: { // Find the type id for the given typeinfo.
- GlobalVariable *GV = ExtractTypeInfo(I.getArgOperand(0)); + GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0)); unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, MVT::i32); setValue(&I, Res); @@ -4899,15 +4994,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; case Intrinsic::eh_dwarf_cfa: { SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl, - TLI->getPointerTy()); + TLI.getPointerTy()); SDValue Offset = DAG.getNode(ISD::ADD, sdl, CfaArg.getValueType(), DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl, CfaArg.getValueType()), CfaArg); - SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl, - TLI->getPointerTy(), - DAG.getConstant(0, TLI->getPointerTy())); + SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(), + DAG.getConstant(0, TLI.getPointerTy())); setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(), FA, Offset)); return nullptr; @@ -4946,6 +5040,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + case Intrinsic::masked_load: + visitMaskedLoad(I); + return nullptr; + case Intrinsic::masked_store: + visitMaskedStore(I); + return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: @@ -4997,7 +5097,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); - EVT DestVT = TLI->getValueType(I.getType()); + EVT DestVT = TLI.getValueType(I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, DAG.getConstant(NewIntrinsic, MVT::i32), @@ -5009,14 +5109,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::x86_avx_vinsertf128_ps_256: case Intrinsic::x86_avx_vinsertf128_si_256: case Intrinsic::x86_avx2_vinserti128: { - EVT DestVT = TLI->getValueType(I.getType()); - EVT ElVT = TLI->getValueType(I.getArgOperand(1)->getType()); + EVT DestVT = TLI.getValueType(I.getType()); + EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType()); uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue() & 1) * ElVT.getVectorNumElements(); - Res = DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT, - getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), - DAG.getConstant(Idx, TLI->getVectorIdxTy())); + Res = + DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT, + getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), + DAG.getConstant(Idx, TLI.getVectorIdxTy())); setValue(&I, Res); return nullptr; } @@ -5024,12 +5124,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::x86_avx_vextractf128_ps_256: case Intrinsic::x86_avx_vextractf128_si_256: case Intrinsic::x86_avx2_vextracti128: { - EVT DestVT = TLI->getValueType(I.getType()); + EVT DestVT = TLI.getValueType(I.getType()); uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) * DestVT.getVectorNumElements(); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, DestVT, getValue(I.getArgOperand(0)), - DAG.getConstant(Idx, TLI->getVectorIdxTy())); + DAG.getConstant(Idx, TLI.getVectorIdxTy())); setValue(&I, Res); return nullptr; } @@ -5055,7 +5155,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::convertus: Code = ISD::CVT_US; break; case 
Intrinsic::convertuu: Code = ISD::CVT_UU; break; } - EVT DestVT = TLI->getValueType(I.getType()); + EVT DestVT = TLI.getValueType(I.getType()); const Value *Op1 = I.getArgOperand(0); Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1), DAG.getValueType(DestVT), @@ -5071,23 +5171,23 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), DAG)); return nullptr; case Intrinsic::log: - setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); + setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log2: - setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); + setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::log10: - setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); + setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp: - setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); + setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::exp2: - setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); + setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI)); return nullptr; case Intrinsic::pow: setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), DAG, *TLI)); + getValue(I.getArgOperand(1)), DAG, TLI)); return nullptr; case Intrinsic::sqrt: case Intrinsic::fabs: @@ -5119,6 +5219,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(0)))); return nullptr; } + case Intrinsic::minnum: + setValue(&I, DAG.getNode(ISD::FMINNUM, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::maxnum: + setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -5133,9 +5245,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(2)))); return nullptr; case Intrinsic::fmuladd: { - EVT VT = TLI->getValueType(I.getType()); + EVT VT = TLI.getValueType(I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && - TLI->isFMAFasterThanFMulAndFAdd(VT)) { + TLI.isFMAFasterThanFMulAndFAdd(VT)) { setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), @@ -5162,7 +5274,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; case Intrinsic::convert_from_fp16: setValue(&I, - DAG.getNode(ISD::FP_EXTEND, sdl, TLI->getValueType(I.getType()), + DAG.getNode(ISD::FP_EXTEND, sdl, TLI.getValueType(I.getType()), DAG.getNode(ISD::BITCAST, sdl, MVT::f16, getValue(I.getArgOperand(0))))); return nullptr; @@ -5209,7 +5321,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::STACKSAVE, sdl, - DAG.getVTList(TLI->getPointerTy(), MVT::Other), Op); + DAG.getVTList(TLI.getPointerTy(), MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); return nullptr; @@ -5223,9 +5335,44 @@ 
SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - EVT PtrTy = TLI->getPointerTy(); + EVT PtrTy = TLI.getPointerTy(); + SDValue Src, Chain = getRoot(); + const Value *Ptr = cast<LoadInst>(I.getArgOperand(0))->getPointerOperand(); + const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr); + + // See if Ptr is a bitcast. If it is, look through it and see if we can get + // global variable __stack_chk_guard. + if (!GV) + if (const Operator *BC = dyn_cast<Operator>(Ptr)) + if (BC->getOpcode() == Instruction::BitCast) + GV = dyn_cast<GlobalVariable>(BC->getOperand(0)); + + if (GV && TLI.useLoadStackGuardNode()) { + // Emit a LOAD_STACK_GUARD node. + MachineSDNode *Node = DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, + sdl, PtrTy, Chain); + MachinePointerInfo MPInfo(GV); + MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); + unsigned Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOInvariant; + *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, + PtrTy.getSizeInBits() / 8, + DAG.getEVTAlignment(PtrTy)); + Node->setMemRefs(MemRefs, MemRefs + 1); + + // Copy the guard value to a virtual register so that it can be + // retrieved in the epilogue. + Src = SDValue(Node, 0); + const TargetRegisterClass *RC = + TLI.getRegClassFor(Src.getSimpleValueType()); + unsigned Reg = MF.getRegInfo().createVirtualRegister(RC); + + SPDescriptor.setGuardReg(Reg); + Chain = DAG.getCopyToReg(Chain, sdl, Reg, Src); + } else { + Src = getValue(I.getArgOperand(0)); // The guard's value. + } - SDValue Src = getValue(I.getArgOperand(0)); // The guard's value. AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); int FI = FuncInfo.StaticAllocaMap[Slot]; @@ -5234,7 +5381,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue FIN = DAG.getFrameIndex(FI, PtrTy); // Store the stack protector onto the stack. 
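The new stack-protector path above peels a bitcast off the guard pointer to recognize the __stack_chk_guard global, and on targets where TLI.useLoadStackGuardNode() holds it loads the canary through a LOAD_STACK_GUARD pseudo whose value is also copied to a virtual register (recorded via SPDescriptor.setGuardReg) for the epilogue check. A rough model of the protocol being set up (plain C++, not the SelectionDAG code):

#include <cstdint>
#include <cstdlib>

// Stand-in for the __stack_chk_guard global; the real value is chosen
// by the runtime at program start.
volatile std::uintptr_t StackChkGuard = 0xdeadbeefu;

void protectedFrame() {
  // Prologue (this hunk): load the guard and spill it into a stack slot.
  std::uintptr_t GuardSlot = StackChkGuard;

  char Buffer[32];
  (void)Buffer; // ... body; an overflow of Buffer would clobber GuardSlot ...

  // Epilogue (the StackProtector check block): recheck the slot.
  if (GuardSlot != StackChkGuard)
    std::abort(); // stands in for the __stack_chk_fail call
}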
- Res = DAG.getStore(getRoot(), sdl, Src, FIN, + Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(FI), true, false, 0); setValue(&I, Res); @@ -5263,8 +5410,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Drop the intrinsic, but forward the value setValue(&I, getValue(I.getOperand(0))); return nullptr; + case Intrinsic::assume: case Intrinsic::var_annotation: - // Discard annotate attributes + // Discard annotate attributes and assumptions return nullptr; case Intrinsic::init_trampoline: { @@ -5285,7 +5433,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::adjust_trampoline: { setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl, - TLI->getPointerTy(), + TLI.getPointerTy(), getValue(I.getArgOperand(0)))); return nullptr; } @@ -5325,10 +5473,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(sdl).setChain(getRoot()) .setCallee(CallingConv::C, I.getType(), - DAG.getExternalSymbol(TrapFuncName.data(), TLI->getPointerTy()), + DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()), std::move(Args), 0); - std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); DAG.setRoot(Result.second); return nullptr; } @@ -5392,11 +5540,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { if (!LifetimeObject) continue; - int FI = FuncInfo.StaticAllocaMap[LifetimeObject]; + // First check that the Alloca is static, otherwise it won't have a + // valid frame index. + auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject); + if (SI == FuncInfo.StaticAllocaMap.end()) + return nullptr; + + int FI = SI->second; SDValue Ops[2]; Ops[0] = getRoot(); - Ops[1] = DAG.getFrameIndex(FI, TLI->getPointerTy(), true); + Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true); unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END); Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); @@ -5406,7 +5560,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::invariant_start: // Discard region information. - setValue(&I, DAG.getUNDEF(TLI->getPointerTy())); + setValue(&I, DAG.getUNDEF(TLI.getPointerTy())); return nullptr; case Intrinsic::invariant_end: // Discard region information. 
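The lifetime-marker fix above guards against a classic map pitfall: `FuncInfo.StaticAllocaMap[LifetimeObject]` on an alloca that never received a static slot would default-construct an entry and hand back frame index 0. The safe-lookup pattern, reduced to a minimal sketch with std::map standing in for DenseMap:

#include <map>
#include <optional>

// operator[] would silently insert 0 for an alloca with no static slot;
// find() lets the caller bail out instead.
std::optional<int>
frameIndexFor(const std::map<const void *, int> &StaticAllocaMap,
              const void *Alloca) {
  auto It = StaticAllocaMap.find(Alloca);
  if (It == StaticAllocaMap.end())
    return std::nullopt; // dynamic alloca: no valid frame index
  return It->second;
}

The patch's early `return nullptr` for the dynamic-alloca case corresponds to the nullopt branch here.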
@@ -5424,7 +5578,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } case Intrinsic::clear_cache: - return TLI->getClearCacheBuiltinName(); + return TLI.getClearCacheBuiltinName(); case Intrinsic::donothing: // ignore return nullptr; @@ -5434,41 +5588,85 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: { - visitPatchpoint(I); + visitPatchpoint(&I); return nullptr; } + case Intrinsic::experimental_gc_statepoint: { + visitStatepoint(I); + return nullptr; } -} - -void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, - bool isTailCall, - MachineBasicBlock *LandingPad) { - const TargetLowering *TLI = TM.getTargetLowering(); - PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); - FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - Type *RetTy = FTy->getReturnType(); - MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - MCSymbol *BeginLabel = nullptr; - - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Args.reserve(CS.arg_size()); + case Intrinsic::experimental_gc_result_int: + case Intrinsic::experimental_gc_result_float: + case Intrinsic::experimental_gc_result_ptr: { + visitGCResult(I); + return nullptr; + } + case Intrinsic::experimental_gc_relocate: { + visitGCRelocate(I); + return nullptr; + } + case Intrinsic::instrprof_increment: + llvm_unreachable("instrprof failed to lower an increment"); - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - const Value *V = *i; + case Intrinsic::frameallocate: { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); + + // Do the allocation and map it as a normal value. + // FIXME: Maybe we should add this to the alloca map so that we don't have + // to register allocate it? + uint64_t Size = cast<ConstantInt>(I.getArgOperand(0))->getZExtValue(); + int Alloc = MF.getFrameInfo()->CreateFrameAllocation(Size); + MVT PtrVT = TLI.getPointerTy(0); + SDValue FIVal = DAG.getFrameIndex(Alloc, PtrVT); + setValue(&I, FIVal); + + // Directly emit a FRAME_ALLOC machine instr. Label assignment emission is + // the same on all targets. + MCSymbol *FrameAllocSym = + MF.getMMI().getContext().getOrCreateFrameAllocSymbol(MF.getName()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, + TII->get(TargetOpcode::FRAME_ALLOC)) + .addSym(FrameAllocSym) + .addFrameIndex(Alloc); - // Skip empty types - if (V->getType()->isEmptyTy()) - continue; + return nullptr; + } - SDValue ArgNode = getValue(V); - Entry.Node = ArgNode; Entry.Ty = V->getType(); + case Intrinsic::framerecover: { + // i8* @llvm.framerecover(i8* %fn, i8* %fp) + MachineFunction &MF = DAG.getMachineFunction(); + MVT PtrVT = TLI.getPointerTy(0); + + // Get the symbol that defines the frame offset. + Function *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts()); + MCSymbol *FrameAllocSym = + MF.getMMI().getContext().getOrCreateFrameAllocSymbol(Fn->getName()); + + // Create a TargetExternalSymbol for the label to avoid any target lowering + // that would make this PC relative. 
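The frameallocate/framerecover pair above works by publishing the allocation's offset from the parent's frame pointer under a per-function frame-alloc symbol, which the recovering function adds back to the parent frame pointer it was handed. A toy model of that arithmetic (the symbol becomes a plain global here; purely illustrative):

#include <cassert>
#include <cstddef>

// The "frame allocation symbol": offset of the reserved slot from the
// parent's frame base, resolved at compile time in the real scheme.
static std::ptrdiff_t FrameAllocOffset = 0;

struct Frame { char Bytes[64]; };

// Parent side, modeling llvm.frameallocate: reserve a slot in our own
// frame and publish its offset.
char *frameAllocate(Frame &Parent) {
  FrameAllocOffset = 16; // wherever the slot lands in this frame layout
  return Parent.Bytes + FrameAllocOffset;
}

// Child side, modeling llvm.framerecover: parent FP plus the offset.
char *frameRecover(char *ParentFP) {
  return ParentFP + FrameAllocOffset;
}

int main() {
  Frame F{};
  *frameAllocate(F) = 42;
  assert(*frameRecover(F.Bytes) == 42); // both sides see the same slot
}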
+ StringRef Name = FrameAllocSym->getName(); + assert(Name.size() == strlen(Name.data()) && "not null terminated"); + SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT); + SDValue OffsetVal = + DAG.getNode(ISD::FRAME_ALLOC_RECOVER, sdl, PtrVT, OffsetSym); + + // Add the offset to the FP. + Value *FP = I.getArgOperand(1); + SDValue FPVal = getValue(FP); + SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal); + setValue(&I, Add); - // Skip the first return-type Attribute to get to params. - Entry.setAttributes(&CS, i - CS.arg_begin() + 1); - Args.push_back(Entry); + return nullptr; } + } +} + +std::pair<SDValue, SDValue> +SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, + MachineBasicBlock *LandingPad) { + MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); + MCSymbol *BeginLabel = nullptr; if (LandingPad) { // Insert a label before the invoke call to mark the try range. This can be @@ -5490,24 +5688,17 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // this call might not return. (void)getRoot(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel)); - } - // Check if target-independent constraints permit a tail call here. - // Target-dependent constraints are checked within TLI->LowerCallTo. - if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget())) - isTailCall = false; + CLI.setChain(getRoot()); + } - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) - .setCallee(RetTy, FTy, Callee, std::move(Args), CS).setTailCall(isTailCall); + const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering(); + std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI); - std::pair<SDValue,SDValue> Result = TLI->LowerCallTo(CLI); - assert((isTailCall || Result.second.getNode()) && + assert((CLI.IsTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); assert((Result.second.getNode() || !Result.first.getNode()) && "Null value expected with tail call!"); - if (Result.first.getNode()) - setValue(CS.getInstruction(), Result.first); if (!Result.second.getNode()) { // As a special case, a null chain means that a tail call has been emitted @@ -5530,6 +5721,50 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // Inform MachineModuleInfo of range. MMI.addInvoke(LandingPad, BeginLabel, EndLabel); } + + return Result; +} + +void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, + bool isTailCall, + MachineBasicBlock *LandingPad) { + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + Type *RetTy = FTy->getReturnType(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Args.reserve(CS.arg_size()); + + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + const Value *V = *i; + + // Skip empty types + if (V->getType()->isEmptyTy()) + continue; + + SDValue ArgNode = getValue(V); + Entry.Node = ArgNode; Entry.Ty = V->getType(); + + // Skip the first return-type Attribute to get to params. + Entry.setAttributes(&CS, i - CS.arg_begin() + 1); + Args.push_back(Entry); + } + + // Check if target-independent constraints permit a tail call here. + // Target-dependent constraints are checked within TLI->LowerCallTo. 
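LowerCallTo is split above so that the EH-label bracketing and the TLI::LowerCallTo invocation live in a reusable lowerInvokable, which plain calls and (further down) patchpoints both funnel into. The shape of the refactor as a sketch, with placeholder types standing in for CallLoweringInfo and MachineBasicBlock:

#include <iostream>
#include <string>

struct CallLoweringState { std::string Callee; }; // stand-in for CallLoweringInfo
struct LandingPadBlock { std::string Label; };    // stand-in for MachineBasicBlock

// Shared tail of call and invoke lowering: bracket a potentially
// throwing call with EH labels so the try-range can be recorded, then
// perform the target call lowering itself.
void lowerInvokable(const CallLoweringState &CLI, const LandingPadBlock *LP) {
  if (LP)
    std::cout << "EH_LABEL begin\n";          // BeginLabel in the patch
  std::cout << "call " << CLI.Callee << '\n'; // TLI->LowerCallTo(CLI)
  if (LP)
    std::cout << "EH_LABEL end\n";            // EndLabel + MMI.addInvoke(...)
}

// Both entry points now funnel into the shared helper.
void lowerCall(const CallLoweringState &CLI) { lowerInvokable(CLI, nullptr); }
void lowerInvoke(const CallLoweringState &CLI, const LandingPadBlock &LP) {
  lowerInvokable(CLI, &LP);
}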
+ if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget())) + isTailCall = false; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) + .setCallee(RetTy, FTy, Callee, std::move(Args), CS) + .setTailCall(isTailCall); + std::pair<SDValue,SDValue> Result = lowerInvokable(CLI, LandingPad); + + if (Result.first.getNode()) + setValue(CS.getInstruction(), Result.first); } /// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the @@ -5595,7 +5830,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, SDValue Value, bool IsSigned) { - EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true); + EVT VT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true); if (IsSigned) Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT); else @@ -5620,7 +5855,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { const Value *Size = I.getArgOperand(2); const ConstantInt *CSize = dyn_cast<ConstantInt>(Size); if (CSize && CSize->getZExtValue() == 0) { - EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true); + EVT CallVT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true); setValue(&I, DAG.getConstant(0, CallVT)); return true; } @@ -5677,15 +5912,16 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // Require that we can find a legal MVT, and only do this if the target // supports unaligned loads of that type. Expanding into byte loads would // bloat the code. - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ActuallyDoIt && CSize->getZExtValue() > 4) { unsigned DstAS = LHS->getType()->getPointerAddressSpace(); unsigned SrcAS = RHS->getType()->getPointerAddressSpace(); // TODO: Handle 5 byte compare as 4-byte + 1 byte. // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. - if (!TLI->isTypeLegal(LoadVT) || - !TLI->allowsUnalignedMemoryAccesses(LoadVT, SrcAS) || - !TLI->allowsUnalignedMemoryAccesses(LoadVT, DstAS)) + // TODO: Check alignment of src and dest ptrs. + if (!TLI.isTypeLegal(LoadVT) || + !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) || + !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS)) ActuallyDoIt = false; } @@ -5863,6 +6099,26 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, return true; } +/// visitBinaryFloatCall - If a call instruction is a binary floating-point +/// operation (as expected), translate it to an SDNode with the specified opcode +/// and return true. +bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I, + unsigned Opcode) { + // Sanity check that it really is a binary floating-point call. + if (I.getNumArgOperands() != 2 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + I.getType() != I.getArgOperand(1)->getType() || + !I.onlyReadsMemory()) + return false; + + SDValue Tmp0 = getValue(I.getArgOperand(0)); + SDValue Tmp1 = getValue(I.getArgOperand(1)); + EVT VT = Tmp0.getValueType(); + setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1)); + return true; +} + void SelectionDAGBuilder::visitCall(const CallInst &I) { // Handle inline assembly differently. 
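The new visitBinaryFloatCall above, together with the LibFunc::fmin/fmax cases added just below, routes fmin/fmax libcalls to the FMINNUM/FMAXNUM nodes that llvm.minnum/llvm.maxnum also map to earlier in this chunk. The point of dedicated nodes is IEEE-754 minNum semantics, which a bare compare-and-select gets wrong when one operand is NaN; a reference implementation:

#include <cassert>
#include <cmath>

// IEEE-754 minNum: if exactly one operand is NaN, return the other
// operand; a plain (a < b ? a : b) would propagate the NaN instead.
double fminnum(double A, double B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A < B ? A : B;
}

int main() {
  assert(fminnum(1.0, std::nan("")) == 1.0); // the NaN is ignored
  assert(fminnum(std::nan(""), 2.0) == 2.0);
  assert(fminnum(1.0, 2.0) == 1.0);
}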
if (isa<InlineAsm>(I.getCalledValue())) { @@ -5919,6 +6175,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FABS)) return; break; + case LibFunc::fmin: + case LibFunc::fminf: + case LibFunc::fminl: + if (visitBinaryFloatCall(I, ISD::FMINNUM)) + return; + break; + case LibFunc::fmax: + case LibFunc::fmaxf: + case LibFunc::fmaxl: + if (visitBinaryFloatCall(I, ISD::FMAXNUM)) + return; + break; case LibFunc::sin: case LibFunc::sinf: case LibFunc::sinl: @@ -6025,7 +6293,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { Callee = getValue(I.getCalledValue()); else Callee = DAG.getExternalSymbol(RenameFn, - TM.getTargetLowering()->getPointerTy()); + DAG.getTargetLoweringInfo().getPointerTy()); // Check if we can potentially perform a tail call. More detailed checking is // done within LowerCallTo, after more information about the call is known. @@ -6220,9 +6488,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { /// ConstraintOperands - Information about all of the constraints. SDISelAsmOperandInfoVector ConstraintOperands; - const TargetLowering *TLI = TM.getTargetLowering(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::AsmOperandInfoVector - TargetConstraints = TLI->ParseConstraints(CS); + TargetConstraints = TLI.ParseConstraints(CS); bool hasMemory = false; @@ -6247,10 +6515,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // corresponding argument. assert(!CS.getType()->isVoidTy() && "Bad inline asm!"); if (StructType *STy = dyn_cast<StructType>(CS.getType())) { - OpVT = TLI->getSimpleValueType(STy->getElementType(ResNo)); + OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); - OpVT = TLI->getSimpleValueType(CS.getType()); + OpVT = TLI.getSimpleValueType(CS.getType()); } ++ResNo; break; @@ -6271,8 +6539,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { OpInfo.CallOperand = getValue(OpInfo.CallOperandVal); } - OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), *TLI, DL). - getSimpleVT(); + OpVT = + OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, DL).getSimpleVT(); } OpInfo.ConstraintVT = OpVT; @@ -6283,7 +6551,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { else { for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) { TargetLowering::ConstraintType - CType = TLI->getConstraintType(OpInfo.Codes[j]); + CType = TLI.getConstraintType(OpInfo.Codes[j]); if (CType == TargetLowering::C_Memory) { hasMemory = true; break; @@ -6315,10 +6583,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { if (OpInfo.ConstraintVT != Input.ConstraintVT) { std::pair<unsigned, const TargetRegisterClass*> MatchRC = - TLI->getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); std::pair<unsigned, const TargetRegisterClass*> InputRC = - TLI->getRegForInlineAsmConstraint(Input.ConstraintCode, + TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || @@ -6332,7 +6600,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Compute the constraint code and ConstraintType to use.
- TLI->ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); + TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.Type == InlineAsm::isClobber) @@ -6360,16 +6628,16 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) || isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) { OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal), - TLI->getPointerTy()); + TLI.getPointerTy()); } else { // Otherwise, create a stack slot and emit a store to it before the // asm. Type *Ty = OpVal->getType(); - uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty); - unsigned Align = TLI->getDataLayout()->getPrefTypeAlignment(Ty); + uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty); + unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(Ty); MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI->getPointerTy()); + SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy()); Chain = DAG.getStore(Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot, MachinePointerInfo::getFixedStack(SSFI), @@ -6387,7 +6655,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // If this constraint is for a specific register, allocate it before // anything else. if (OpInfo.ConstraintType == TargetLowering::C_Register) - GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo); + GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo); } // Second pass - Loop over all of the operands, assigning virtual or physregs @@ -6398,7 +6666,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // C_Register operands have already been allocated, Other/Memory don't need // to be. if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass) - GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo); + GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo); } // AsmNodeOperands - The operands for the ISD::INLINEASM node. @@ -6406,7 +6674,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { AsmNodeOperands.push_back(SDValue()); // reserve space for input chain AsmNodeOperands.push_back( DAG.getTargetExternalSymbol(IA->getAsmString().c_str(), - TLI->getPointerTy())); + TLI.getPointerTy())); // If we have a !srcloc metadata node associated with it, we want to attach // this to the ultimately generated inline asm machineinstr. To do this, we @@ -6429,7 +6697,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; // Compute the constraint code and ConstraintType to use. - TLI->ComputeConstraintToUse(OpInfo, SDValue()); + TLI.ComputeConstraintToUse(OpInfo, SDValue()); // Ideally, we would only check against memory constraints. However, the // meaning of an other constraint can be target-specific and we can't easily @@ -6447,7 +6715,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo, - TLI->getPointerTy())); + TLI.getPointerTy())); // Loop over all of the inputs, copying the operand values into the // appropriate registers and processing the output regs. @@ -6469,7 +6737,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // Add information to the INLINEASM node to know about this output. 
unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, - TLI->getPointerTy())); + TLI.getPointerTy())); AsmNodeOperands.push_back(OpInfo.CallOperand); break; } @@ -6549,7 +6817,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag); i != e; ++i) { - if (const TargetRegisterClass *RC = TLI->getRegClassFor(RegVT)) + if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC)); else { LLVMContext &Ctx = *DAG.getContext(); @@ -6576,7 +6844,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag, OpInfo.getMatchedOperand()); AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag, - TLI->getPointerTy())); + TLI.getPointerTy())); AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]); break; } @@ -6588,7 +6856,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { if (OpInfo.ConstraintType == TargetLowering::C_Other) { std::vector<SDValue> Ops; - TLI->LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, + TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode, Ops, DAG); if (Ops.empty()) { LLVMContext &Ctx = *DAG.getContext(); @@ -6602,20 +6870,20 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size()); AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType, - TLI->getPointerTy())); + TLI.getPointerTy())); AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end()); break; } if (OpInfo.ConstraintType == TargetLowering::C_Memory) { assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!"); - assert(InOperandVal.getValueType() == TLI->getPointerTy() && + assert(InOperandVal.getValueType() == TLI.getPointerTy() && "Memory operands expect pointer values"); // Add information to the INLINEASM node to know about this input. unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1); AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType, - TLI->getPointerTy())); + TLI.getPointerTy())); AsmNodeOperands.push_back(InOperandVal); break; } @@ -6678,7 +6946,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // FIXME: Why don't we do this for inline asms with MRVs? if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) { - EVT ResultType = TLI->getValueType(CS.getType()); + EVT ResultType = TLI.getValueType(CS.getType()); // If any of the results of the inline asm is a vector, it may have the // wrong width/num elts. 
This can happen for register classes that can @@ -6743,9 +7011,9 @@ void SelectionDAGBuilder::visitVAStart(const CallInst &I) { } void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { - const TargetLowering *TLI = TM.getTargetLowering(); - const DataLayout &DL = *TLI->getDataLayout(); - SDValue V = DAG.getVAArg(TLI->getValueType(I.getType()), getCurSDLoc(), + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const DataLayout &DL = *TLI.getDataLayout(); + SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurSDLoc(), getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)), DL.getABITypeAlignment(I.getType())); @@ -6777,18 +7045,19 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { /// convention or require stack pointer adjustment. Only a subset of the /// intrinsic's operands need to participate in the calling convention. std::pair<SDValue, SDValue> -SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx, +SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, - bool useVoidTy) { + bool UseVoidTy, + MachineBasicBlock *LandingPad, + bool IsPatchPoint) { TargetLowering::ArgListTy Args; Args.reserve(NumArgs); // Populate the argument list. // Attributes for args start at offset 1, after the return attribute. - ImmutableCallSite CS(&CI); for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1; ArgI != ArgE; ++ArgI) { - const Value *V = CI.getOperand(ArgI); + const Value *V = CS->getOperand(ArgI); assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic."); @@ -6799,14 +7068,13 @@ SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx, Args.push_back(Entry); } - Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType(); + Type *retTy = UseVoidTy ? Type::getVoidTy(*DAG.getContext()) : CS->getType(); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) - .setCallee(CI.getCallingConv(), retTy, Callee, std::move(Args), NumArgs) - .setDiscardResult(!CI.use_empty()); + .setCallee(CS.getCallingConv(), retTy, Callee, std::move(Args), NumArgs) + .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint); - const TargetLowering *TLI = TM.getTargetLowering(); - return TLI->LowerCallTo(CLI); + return lowerInvokable(CLI, LandingPad); } /// \brief Add a stack map intrinsic call's live variable operands to a stackmap @@ -6826,11 +7094,11 @@ SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx, /// assumption made by the llvm.gcroot intrinsic). If the alloca's location were /// only available in a register, then the runtime would need to trap when /// execution reaches the StackMap in order to read the alloca's location. 
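addStackMapLiveVars, documented above and redefined just below over an ImmutableCallSite so stackmaps and patchpoints can share it, appends each live value either as a StackMaps::ConstantOp marker plus immediate or as an operand whose register or stack location is filled in after allocation. A compact model of that encoding (the marker value and layout here are illustrative):

#include <cstdint>
#include <vector>

// Stand-in for StackMaps::ConstantOp from llvm/CodeGen/StackMaps.h.
constexpr std::uint64_t ConstantOpMarker = 0xFFFFFFFFu;

struct LiveValue {
  bool IsConstant;
  std::uint64_t ImmOrLoc; // immediate value, or an opaque location id
};

// Constants become [marker, imm] pairs; anything else is forwarded so
// that its final location is recorded after register allocation.
std::vector<std::uint64_t> encodeLiveVars(const std::vector<LiveValue> &Vals) {
  std::vector<std::uint64_t> Ops;
  for (const LiveValue &V : Vals) {
    if (V.IsConstant) {
      Ops.push_back(ConstantOpMarker);
      Ops.push_back(V.ImmOrLoc);
    } else {
      Ops.push_back(V.ImmOrLoc);
    }
  }
  return Ops;
}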
-static void addStackMapLiveVars(const CallInst &CI, unsigned StartIdx, +static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx, SmallVectorImpl<SDValue> &Ops, SelectionDAGBuilder &Builder) { - for (unsigned i = StartIdx, e = CI.getNumArgOperands(); i != e; ++i) { - SDValue OpVal = Builder.getValue(CI.getArgOperand(i)); + for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) { + SDValue OpVal = Builder.getValue(CS.getArgument(i)); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) { Ops.push_back( Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64)); @@ -6881,7 +7149,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { cast<ConstantSDNode>(NBytesVal)->getZExtValue(), MVT::i32)); // Push live variables for the stack map. - addStackMapLiveVars(CI, 2, Ops, *this); + addStackMapLiveVars(&CI, 2, Ops, *this); // We are not pushing any register mask info here on the operands list, // because the stackmap doesn't clobber anything. @@ -6908,7 +7176,8 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { } /// \brief Lower llvm.experimental.patchpoint directly to its target opcode. -void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { +void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, + MachineBasicBlock *LandingPad) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>, // i32 <numBytes>, // i8* <target>, @@ -6916,32 +7185,29 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { // [Args...], // [live variables...]) - CallingConv::ID CC = CI.getCallingConv(); - bool isAnyRegCC = CC == CallingConv::AnyReg; - bool hasDef = !CI.getType()->isVoidTy(); - SDValue Callee = getValue(CI.getOperand(2)); // <target> + CallingConv::ID CC = CS.getCallingConv(); + bool IsAnyRegCC = CC == CallingConv::AnyReg; + bool HasDef = !CS->getType()->isVoidTy(); + SDValue Callee = getValue(CS->getOperand(2)); // <target> // Get the real number of arguments participating in the call <numArgs> - SDValue NArgVal = getValue(CI.getArgOperand(PatchPointOpers::NArgPos)); + SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos)); unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue(); // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs> // Intrinsics include all meta-operands up to but not including CC. unsigned NumMetaOpers = PatchPointOpers::CCPos; - assert(CI.getNumArgOperands() >= NumMetaOpers + NumArgs && + assert(CS.arg_size() >= NumMetaOpers + NumArgs && "Not enough arguments provided to the patchpoint intrinsic"); // For AnyRegCC the arguments are lowered later on manually. - unsigned NumCallArgs = isAnyRegCC ? 0 : NumArgs; + unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; std::pair<SDValue, SDValue> Result = - LowerCallOperands(CI, NumMetaOpers, NumCallArgs, Callee, isAnyRegCC); - - // Set the root to the target-lowered call chain. - SDValue Chain = Result.second; - DAG.setRoot(Chain); + lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, + LandingPad, true); - SDNode *CallEnd = Chain.getNode(); - if (hasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) + SDNode *CallEnd = Result.second.getNode(); + if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) CallEnd = CallEnd->getOperand(0).getNode(); /// Get a call instruction from the call sequence chain. 
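visitPatchpoint above decodes the fixed operand layout of @llvm.experimental.patchpoint: <id>, <numBytes>, <target>, <numArgs>, then the call arguments, then trailing live values. A small stand-alone parser over that layout (the struct and the flat integer encoding are illustrative, not the SDNode representation):

#include <cstddef>
#include <cstdint>
#include <vector>

struct PatchPointInfo {
  std::uint64_t ID;
  std::uint32_t NumBytes; // size of the nop shadow to reserve
  std::uint64_t Target;   // call target
  std::vector<std::uint64_t> CallArgs;
  std::vector<std::uint64_t> LiveVars;
};

// Operand layout: [id, numBytes, target, numArgs, args..., live vars...].
// Assumes Ops is well formed (at least the four meta operands present).
PatchPointInfo parsePatchPoint(const std::vector<std::uint64_t> &Ops) {
  const std::size_t NumMetaOpers = 4; // <id>, <numBytes>, <target>, <numArgs>
  PatchPointInfo PPI;
  PPI.ID = Ops[0];
  PPI.NumBytes = static_cast<std::uint32_t>(Ops[1]);
  PPI.Target = Ops[2];
  std::size_t NumArgs = static_cast<std::size_t>(Ops[3]);
  for (std::size_t I = 0; I != NumArgs; ++I)
    PPI.CallArgs.push_back(Ops[NumMetaOpers + I]);
  for (std::size_t I = NumMetaOpers + NumArgs; I != Ops.size(); ++I)
    PPI.LiveVars.push_back(Ops[I]); // everything after the args is live
  return PPI;
}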
@@ -6949,16 +7215,16 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "Expected a callseq node."); SDNode *Call = CallEnd->getOperand(0).getNode(); - bool hasGlue = Call->getGluedNode(); + bool HasGlue = Call->getGluedNode(); // Replace the target specific call node with the patchable intrinsic. SmallVector<SDValue, 8> Ops; // Add the <id> and <numBytes> constants. - SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos)); + SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos)); Ops.push_back(DAG.getTargetConstant( cast<ConstantSDNode>(IDVal)->getZExtValue(), MVT::i64)); - SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos)); + SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos)); Ops.push_back(DAG.getTargetConstant( cast<ConstantSDNode>(NBytesVal)->getZExtValue(), MVT::i32)); @@ -6971,8 +7237,8 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { // Adjust <numArgs> to account for any arguments that have been passed on the // stack instead. // Call Node: Chain, Target, {Args}, RegMask, [Glue] - unsigned NumCallRegArgs = Call->getNumOperands() - (hasGlue ? 4 : 3); - NumCallRegArgs = isAnyRegCC ? NumArgs : NumCallRegArgs; + unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3); + NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs; Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32)); // Add the calling convention @@ -6980,20 +7246,20 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { // Add the arguments we omitted previously. The register allocator should // place these in any free register. - if (isAnyRegCC) + if (IsAnyRegCC) for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) - Ops.push_back(getValue(CI.getArgOperand(i))); + Ops.push_back(getValue(CS.getArgument(i))); // Push the arguments from the call instruction up to the register mask. - SDNode::op_iterator e = hasGlue ? Call->op_end()-2 : Call->op_end()-1; + SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1; for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i) Ops.push_back(*i); // Push live variables for the stack map. - addStackMapLiveVars(CI, NumMetaOpers + NumArgs, Ops, *this); + addStackMapLiveVars(CS, NumMetaOpers + NumArgs, Ops, *this); // Push the register mask info. - if (hasGlue) + if (HasGlue) Ops.push_back(*(Call->op_end()-2)); else Ops.push_back(*(Call->op_end()-1)); @@ -7003,15 +7269,15 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { Ops.push_back(*(Call->op_begin())); // Push the glue flag (last operand). - if (hasGlue) + if (HasGlue) Ops.push_back(*(Call->op_end()-1)); SDVTList NodeTys; - if (isAnyRegCC && hasDef) { + if (IsAnyRegCC && HasDef) { // Create the return types based on the intrinsic definition const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SmallVector<EVT, 3> ValueVTs; - ComputeValueVTs(TLI, CI.getType(), ValueVTs); + ComputeValueVTs(TLI, CS->getType(), ValueVTs); assert(ValueVTs.size() == 1 && "Expected only one return value type."); // There is always a chain and a glue type at the end @@ -7026,18 +7292,18 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { getCurSDLoc(), NodeTys, Ops); // Update the NodeMap. 
- if (hasDef) { - if (isAnyRegCC) - setValue(&CI, SDValue(MN, 0)); + if (HasDef) { + if (IsAnyRegCC) + setValue(CS.getInstruction(), SDValue(MN, 0)); else - setValue(&CI, Result.first); + setValue(CS.getInstruction(), Result.first); } // Fixup the consumers of the intrinsic. The chain and glue may be used in the // call sequence. Furthermore the location of the chain and glue can change // when the AnyReg calling convention is used and the intrinsic returns a // value. - if (isAnyRegCC && hasDef) { + if (IsAnyRegCC && HasDef) { SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)}; SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)}; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); @@ -7186,8 +7452,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } if (Args[i].isNest) Flags.setNest(); - if (NeedsRegBlock) + if (NeedsRegBlock) { Flags.setInConsecutiveRegs(); + if (Value == NumValues - 1) + Flags.setInConsecutiveRegsLast(); + } Flags.setOrigAlign(OriginalAlignment); MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); @@ -7233,10 +7502,6 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (j != 0) MyFlags.Flags.setOrigAlign(1); - // Only mark the end at the last register of the last value. - if (NeedsRegBlock && Value == NumValues - 1 && j == NumParts - 1) - MyFlags.Flags.setInConsecutiveRegsLast(); - CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); } @@ -7349,10 +7614,15 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { "Copy from a reg to the same reg!"); assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg"); - const TargetLowering *TLI = TM.getTargetLowering(); - RegsForValue RFV(V->getContext(), *TLI, Reg, V->getType()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + RegsForValue RFV(V->getContext(), TLI, Reg, V->getType()); SDValue Chain = DAG.getEntryNode(); - RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V); + + ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) == + FuncInfo.PreferredExtendType.end()) + ? ISD::ANY_EXTEND + : FuncInfo.PreferredExtendType[V]; + RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType); PendingExports.push_back(Chain); } @@ -7378,15 +7648,13 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { void SelectionDAGISel::LowerArguments(const Function &F) { SelectionDAG &DAG = SDB->DAG; SDLoc dl = SDB->getCurSDLoc(); - const TargetLowering *TLI = getTargetLowering(); const DataLayout *DL = TLI->getDataLayout(); SmallVector<ISD::InputArg, 16> Ins; if (!FuncInfo->CanLowerReturn) { // Put in an sret pointer parameter before all the other parameters. SmallVector<EVT, 1> ValueVTs; - ComputeValueVTs(*getTargetLowering(), - PointerType::getUnqual(F.getReturnType()), ValueVTs); + ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); // NOTE: Assuming that a pointer will never break down to more than one VT // or one register. 
@@ -7451,8 +7719,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
-      if (NeedsRegBlock)
+      if (NeedsRegBlock) {
         Flags.setInConsecutiveRegs();
+        if (Value == NumValues - 1)
+          Flags.setInConsecutiveRegsLast();
+      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7465,11 +7736,6 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
         // if it isn't the first piece, alignment must be 1
         else if (i > 0)
           MyFlags.Flags.setOrigAlign(1);
-
-        // Only mark the end at the last register of the last value.
-        if (NeedsRegBlock && Value == NumValues - 1 && i == NumRegs - 1)
-          MyFlags.Flags.setInConsecutiveRegsLast();
-
         Ins.push_back(MyFlags);
       }
       PartBase += VT.getStoreSize();
@@ -7478,9 +7744,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
 
   // Call the target to set up the argument values.
   SmallVector<SDValue, 8> InVals;
-  SDValue NewRoot = TLI->LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
-                                              F.isVarArg(), Ins,
-                                              dl, DAG, InVals);
+  SDValue NewRoot = TLI->LowerFormalArguments(
+      DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);
 
   // Verify that the target's LowerFormalArguments behaved as expected.
   assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
@@ -7517,8 +7782,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     MachineRegisterInfo& RegInfo = MF.getRegInfo();
     unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
     FuncInfo->DemoteRegister = SRetReg;
-    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(),
-                                    SRetReg, ArgValue);
+    NewRoot =
+        SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
     DAG.setRoot(NewRoot);
 
     // i indexes lowered arguments. Bump it past the hidden sret argument.
@@ -7633,7 +7898,8 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
 
     // If this terminator has multiple identical successors (common for
     // switches), only handle each succ once.
-    if (!SuccsHandled.insert(SuccMBB)) continue;
+    if (!SuccsHandled.insert(SuccMBB).second)
+      continue;
 
     MachineBasicBlock::iterator MBBI = SuccMBB->begin();
@@ -7676,11 +7942,11 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Remember that this register needs to be added to the machine PHI node
       // as the input for this MBB.
       SmallVector<EVT, 4> ValueVTs;
-      const TargetLowering *TLI = TM.getTargetLowering();
-      ComputeValueVTs(*TLI, PN->getType(), ValueVTs);
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
       for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
         EVT VT = ValueVTs[vti];
-        unsigned NumRegisters = TLI->getNumRegisters(*DAG.getContext(), VT);
+        unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
         for (unsigned i = 0, e = NumRegisters; i != e; ++i)
           FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
         Reg += NumRegisters;
@@ -7697,6 +7963,7 @@ MachineBasicBlock *
 SelectionDAGBuilder::StackProtectorDescriptor::
 AddSuccessorMBB(const BasicBlock *BB,
                 MachineBasicBlock *ParentMBB,
+                bool IsLikely,
                 MachineBasicBlock *SuccMBB) {
   // If SuccBB has not been created yet, create it.
   if (!SuccMBB) {
@@ -7706,6 +7973,7 @@ AddSuccessorMBB(const BasicBlock *BB,
     MF->insert(++BBI, SuccMBB);
   }
   // Add it as a successor of ParentMBB.
-  ParentMBB->addSuccessor(SuccMBB);
+  ParentMBB->addSuccessor(
+      SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely));
   return SuccMBB;
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 84679f98d84e..eba98b8086b7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -11,9 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SELECTIONDAGBUILDER_H
-#define SELECTIONDAGBUILDER_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 
+#include "StatepointLowering.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -21,6 +22,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
 #include <vector>
 
 namespace llvm {
@@ -114,6 +116,10 @@ public:
   /// get simple disambiguation between loads without worrying about alias
   /// analysis.
   SmallVector<SDValue, 8> PendingLoads;
+
+  /// State used while lowering a statepoint sequence (gc_statepoint,
+  /// gc_relocate, and gc_result). See StatepointLowering.h/.cpp for details.
+  StatepointLoweringState StatepointLowering;
 private:
 
   /// PendingExports - CopyToReg nodes that copy values to virtual registers
@@ -200,7 +206,7 @@ private:
     }
   };
 
-  size_t Clusterify(CaseVector &Cases, const SwitchInst &SI);
+  void Clusterify(CaseVector &Cases, const SwitchInst &SI);
 
   /// CaseBlock - This structure is used to communicate between
   /// SelectionDAGBuilder and SDISel for the code generation of additional basic
@@ -276,9 +282,9 @@ private:
     BitTestBlock(APInt F, APInt R, const Value* SV,
                  unsigned Rg, MVT RgVT, bool E,
                  MachineBasicBlock* P, MachineBasicBlock* D,
-                 const BitTestInfo& C):
+                 BitTestInfo C):
       First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E),
-      Parent(P), Default(D), Cases(C) { }
+      Parent(P), Default(D), Cases(std::move(C)) { }
     APInt First;
     APInt Range;
     const Value *SValue;
@@ -397,7 +403,8 @@ private:
   class StackProtectorDescriptor {
   public:
     StackProtectorDescriptor() : ParentMBB(nullptr), SuccessMBB(nullptr),
-                                 FailureMBB(nullptr), Guard(nullptr) { }
+                                 FailureMBB(nullptr), Guard(nullptr),
+                                 GuardReg(0) { }
     ~StackProtectorDescriptor() { }
 
     /// Returns true if all fields of the stack protector descriptor are
@@ -415,8 +422,8 @@ private:
       assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
             "already initialized!");
       ParentMBB = MBB;
-      SuccessMBB = AddSuccessorMBB(BB, MBB);
-      FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+      SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
+      FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
       if (!Guard)
         Guard = StackProtCheckCall.getArgOperand(0);
     }
@@ -455,6 +462,9 @@ private:
     MachineBasicBlock *getFailureMBB() { return FailureMBB; }
     const Value *getGuard() { return Guard; }
 
+    unsigned getGuardReg() const { return GuardReg; }
+    void setGuardReg(unsigned R) { GuardReg = R; }
+
   private:
     /// The basic block for which we are generating the stack protector.
     ///
@@ -477,11 +487,15 @@ private:
     /// stack protector stack slot.
     const Value *Guard;
 
+    /// The virtual register holding the stack guard value.
+    unsigned GuardReg;
+
    /// Add a successor machine basic block to ParentMBB. If the successor mbb
    /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
-    /// block will be created.
+    /// block will be created. Assign a large weight if IsLikely is true.
     MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
                                        MachineBasicBlock *ParentMBB,
+                                       bool IsLikely,
                                        MachineBasicBlock *SuccMBB = nullptr);
   };
 
@@ -604,6 +618,13 @@ public:
     N = NewN;
   }
 
+  void removeValue(const Value *V) {
+    // This is to support a hack in lowerCallFromStatepoint.
+    // It should be removed when that hack is resolved.
+    if (NodeMap.count(V))
+      NodeMap.erase(V);
+  }
+
   void setUnusedArgValue(const Value *V, SDValue NewN) {
     SDValue &N = UnusedArgNodeMap[V];
     assert(!N.getNode() && "Already set a value for this node!");
@@ -626,17 +647,24 @@ public:
   void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
                    MachineBasicBlock *LandingPad = nullptr);
 
-  std::pair<SDValue, SDValue> LowerCallOperands(const CallInst &CI,
-                                                unsigned ArgIdx,
-                                                unsigned NumArgs,
-                                                SDValue Callee,
-                                                bool useVoidTy = false);
+  std::pair<SDValue, SDValue> lowerCallOperands(
+          ImmutableCallSite CS,
+          unsigned ArgIdx,
+          unsigned NumArgs,
+          SDValue Callee,
+          bool UseVoidTy = false,
+          MachineBasicBlock *LandingPad = nullptr,
+          bool IsPatchPoint = false);
 
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that need to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
 
 private:
+  std::pair<SDValue, SDValue> lowerInvokable(
+          TargetLowering::CallLoweringInfo &CLI,
+          MachineBasicBlock *LandingPad);
+
   // Terminator instructions.
   void visitRet(const ReturnInst &I);
   void visitBr(const BranchInst &I);
@@ -658,7 +686,6 @@ private:
   bool handleBTSplitSwitchCase(CaseRec& CR,
                                CaseRecVector& WorkList,
                                const Value* SV,
-                               MachineBasicBlock* Default,
                                MachineBasicBlock *SwitchBB);
   bool handleBitTestsSwitchCase(CaseRec& CR,
                                 CaseRecVector& WorkList,
@@ -686,6 +713,8 @@ public:
   void visitJumpTable(JumpTable &JT);
   void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
                             MachineBasicBlock *SwitchBB);
+  unsigned visitLandingPadClauseBB(GlobalValue *ClauseGV,
+                                   MachineBasicBlock *LPadMBB);
 
 private:
   // These all get lowered before this pass.
@@ -743,6 +772,8 @@ private:
   void visitAlloca(const AllocaInst &I);
   void visitLoad(const LoadInst &I);
   void visitStore(const StoreInst &I);
+  void visitMaskedLoad(const CallInst &I);
+  void visitMaskedStore(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
   void visitAtomicRMW(const AtomicRMWInst &I);
   void visitFence(const FenceInst &I);
@@ -755,6 +786,7 @@ private:
   bool visitStrLenCall(const CallInst &I);
   bool visitStrNLenCall(const CallInst &I);
   bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
+  bool visitBinaryFloatCall(const CallInst &I, unsigned Opcode);
   void visitAtomicLoad(const LoadInst &I);
   void visitAtomicStore(const StoreInst &I);
@@ -767,7 +799,13 @@ private:
   void visitVAEnd(const CallInst &I);
   void visitVACopy(const CallInst &I);
   void visitStackmap(const CallInst &I);
-  void visitPatchpoint(const CallInst &I);
+  void visitPatchpoint(ImmutableCallSite CS,
+                       MachineBasicBlock *LandingPad = nullptr);
+
+  // These three are implemented in StatepointLowering.cpp
+  void visitStatepoint(const CallInst &I);
+  void visitGCRelocate(const CallInst &I);
+  void visitGCResult(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -784,7 +822,7 @@ private:
   /// EmitFuncArgumentDbgValue - If V is a function argument then create
   /// corresponding DBG_VALUE machine instruction for it now. At the end of
   /// instruction selection, they will be inserted to the entry BB.
-  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
+  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, MDNode *Expr,
                                 int64_t Offset, bool IsIndirect,
                                 const SDValue &N);
 };
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index a71cc6859ea0..e8577d898c2d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 std::string SDNode::getOperationName(const SelectionDAG *G) const {
@@ -36,7 +37,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
     return "<<Unknown DAG Node>>";
   if (isMachineOpcode()) {
     if (G)
-      if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo())
+      if (const TargetInstrInfo *TII = G->getSubtarget().getInstrInfo())
         if (getMachineOpcode() < TII->getNumOpcodes())
           return TII->getName(getMachineOpcode());
     return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
@@ -140,6 +141,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
 
   // Unary operators
   case ISD::FABS:    return "fabs";
+  case ISD::FMINNUM: return "fminnum";
+  case ISD::FMAXNUM: return "fmaxnum";
   case ISD::FNEG:    return "fneg";
   case ISD::FSQRT:   return "fsqrt";
   case ISD::FSIN:    return "fsin";
@@ -266,6 +269,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   // Other operators
   case ISD::LOAD:               return "load";
   case ISD::STORE:              return "store";
+  case ISD::MLOAD:              return "masked_load";
+  case ISD::MSTORE:             return "masked_store";
   case ISD::VAARG:              return "vaarg";
   case ISD::VACOPY:             return "vacopy";
   case ISD::VAEND:              return "vaend";
@@ -433,7 +438,8 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
       OS << LBB->getName() << " ";
     OS << (const void*)BBDN->getBasicBlock() << ">";
   } else if (const
RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) { - OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :nullptr); + OS << ' ' << PrintReg(R->getReg(), + G ? G->getSubtarget().getRegisterInfo() : nullptr); } else if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(this)) { OS << "'" << ES->getSymbol() << "'"; @@ -565,7 +571,7 @@ void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { - if (!once.insert(N)) // If we've been here before, return now. + if (!once.insert(N).second) // If we've been here before, return now. return; // Dump the current SDNode, but don't end the line yet. diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 57e22e21c371..4f031d3ff7e7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -40,6 +41,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -181,6 +183,10 @@ UseMBPI("use-mbpi", cl::init(true), cl::Hidden); #ifndef NDEBUG +static cl::opt<std::string> +FilterDAGBasicBlockName("filter-view-dags", cl::Hidden, + cl::desc("Only display the basic block whose name " + "matches this for all view-*-dags options")); static cl::opt<bool> ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden, cl::desc("Pop up a window to show dags before the first " @@ -284,8 +290,8 @@ namespace llvm { /// for the target. ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS, CodeGenOpt::Level OptLevel) { - const TargetLowering *TLI = IS->getTargetLowering(); - const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>(); + const TargetLowering *TLI = IS->TLI; + const TargetSubtargetInfo &ST = IS->MF->getSubtarget(); if (OptLevel == CodeGenOpt::None || ST.useMachineScheduler() || TLI->getSchedulingPreference() == Sched::Source) @@ -336,7 +342,7 @@ void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, CodeGenOpt::Level OL) : MachineFunctionPass(ID), TM(tm), - FuncInfo(new FunctionLoweringInfo(TM)), + FuncInfo(new FunctionLoweringInfo()), CurDAG(new SelectionDAG(tm, OL)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), GFI(), @@ -411,32 +417,32 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { "-fast-isel-abort requires -fast-isel"); const Function &Fn = *mf.getFunction(); - const TargetInstrInfo &TII = *TM.getInstrInfo(); - const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); - const TargetLowering *TLI = TM.getTargetLowering(); - MF = &mf; - RegInfo = &MF->getRegInfo(); - AA = &getAnalysis<AliasAnalysis>(); - LibInfo = &getAnalysis<TargetLibraryInfo>(); - GFI = Fn.hasGC() ? 
&getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; - - TargetSubtargetInfo &ST = - const_cast<TargetSubtargetInfo&>(TM.getSubtarget<TargetSubtargetInfo>()); - ST.resetSubtargetFeatures(MF); - TM.resetTargetOptions(MF); + // Reset the target options before resetting the optimization + // level below. + // FIXME: This is a horrible hack and should be processed via + // codegen looking at the optimization level explicitly when + // it wants to look at it. + TM.resetTargetOptions(Fn); // Reset OptLevel to None for optnone functions. CodeGenOpt::Level NewOptLevel = OptLevel; if (Fn.hasFnAttribute(Attribute::OptimizeNone)) NewOptLevel = CodeGenOpt::None; OptLevelChanger OLC(*this, NewOptLevel); + TII = MF->getSubtarget().getInstrInfo(); + TLI = MF->getSubtarget().getTargetLowering(); + RegInfo = &MF->getRegInfo(); + AA = &getAnalysis<AliasAnalysis>(); + LibInfo = &getAnalysis<TargetLibraryInfo>(); + GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; + DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this); - CurDAG->init(*MF, TLI); + CurDAG->init(*MF); FuncInfo->set(Fn, *MF, CurDAG); if (UseMBPI && OptLevel != CodeGenOpt::None) @@ -454,7 +460,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // copied into vregs, emit the copies into the top of the block before // emitting the code for the block. MachineBasicBlock *EntryMBB = MF->begin(); - RegInfo->EmitLiveInCopies(EntryMBB, TRI, TII); + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII); DenseMap<unsigned, unsigned> LiveInMap; if (!FuncInfo->ArgDbgValues.empty()) @@ -489,15 +496,14 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { "- add if needed"); MachineInstr *Def = RegInfo->getVRegDef(LDI->second); MachineBasicBlock::iterator InsertPos = Def; - const MDNode *Variable = - MI->getOperand(MI->getNumOperands()-1).getMetadata(); + const MDNode *Variable = MI->getDebugVariable(); + const MDNode *Expr = MI->getDebugExpression(); bool IsIndirect = MI->isIndirectDebugValue(); unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0; // Def is never a terminator here, so it is ok to increment InsertPos. 
 BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
-              TII.get(TargetOpcode::DBG_VALUE),
-              IsIndirect,
-              LDI->second, Offset, Variable);
+              TII->get(TargetOpcode::DBG_VALUE), IsIndirect, LDI->second, Offset,
+              Variable, Expr);
 
       // If this vreg is directly copied into an exported register then
       // that COPY instruction also needs a DBG_VALUE, if it is the only
@@ -516,11 +522,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       }
       if (CopyUseMI) {
         MachineInstr *NewMI =
-          BuildMI(*MF, CopyUseMI->getDebugLoc(),
-                  TII.get(TargetOpcode::DBG_VALUE),
-                  IsIndirect,
-                  CopyUseMI->getOperand(0).getReg(),
-                  Offset, Variable);
+            BuildMI(*MF, CopyUseMI->getDebugLoc(),
+                    TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+                    CopyUseMI->getOperand(0).getReg(), Offset, Variable, Expr);
         MachineBasicBlock::iterator Pos = CopyUseMI;
         EntryMBB->insertAfter(Pos, NewMI);
       }
@@ -534,7 +538,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       break;
 
     for (const auto &MI : MBB) {
-      const MCInstrDesc &MCID = TM.getInstrInfo()->get(MI.getOpcode());
+      const MCInstrDesc &MCID = TII->get(MI.getOpcode());
       if ((MCID.isCall() && !MCID.isReturn()) ||
           MI.isStackAligningInlineAsm()) {
         MFI->setHasCalls(true);
@@ -617,7 +621,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
     SDNode *N = Worklist.pop_back_val();
 
     // If we've already seen this node, ignore it.
-    if (!VisitedNodes.insert(N))
+    if (!VisitedNodes.insert(N).second)
       continue;
 
     // Otherwise, add all chain operands to the worklist.
@@ -652,6 +656,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   std::string BlockName;
   int BlockNumber = -1;
   (void)BlockNumber;
+  bool MatchFilterBB = false; (void)MatchFilterBB;
+#ifndef NDEBUG
+  MatchFilterBB = (!FilterDAGBasicBlockName.empty() &&
+                   FilterDAGBasicBlockName ==
+                       FuncInfo->MBB->getBasicBlock()->getName().str());
+#endif
#ifdef NDEBUG
   if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
       ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
@@ -665,7 +675,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+  if (ViewDAGCombine1 && MatchFilterBB)
+    CurDAG->viewGraph("dag-combine1 input for " + BlockName);
 
   // Run the DAG combiner in pre-legalize mode.
   {
@@ -678,8 +689,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   // Second step, hack on the DAG until it only uses operations and types that
   // the target supports.
-  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
-                                               BlockName);
+  if (ViewLegalizeTypesDAGs && MatchFilterBB)
+    CurDAG->viewGraph("legalize-types input for " + BlockName);
 
   bool Changed;
   {
@@ -693,7 +704,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
     CurDAG->NewNodesMustHaveLegalTypes = true;
 
     if (Changed) {
-      if (ViewDAGCombineLT)
+      if (ViewDAGCombineLT && MatchFilterBB)
         CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
 
       // Run the DAG combiner in post-type-legalize mode.
@@ -719,7 +730,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
     CurDAG->LegalizeTypes();
   }
 
-  if (ViewDAGCombineLT)
+  if (ViewDAGCombineLT && MatchFilterBB)
     CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
   // Run the DAG combiner in post-type-legalize mode.
@@ -733,7 +744,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); } - if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName); + if (ViewLegalizeDAGs && MatchFilterBB) + CurDAG->viewGraph("legalize input for " + BlockName); { NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled); @@ -743,7 +755,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); - if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName); + if (ViewDAGCombine2 && MatchFilterBB) + CurDAG->viewGraph("dag-combine2 input for " + BlockName); // Run the DAG combiner in post-legalize mode. { @@ -757,7 +770,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { if (OptLevel != CodeGenOpt::None) ComputeLiveOutVRegInfo(); - if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName); + if (ViewISelDAGs && MatchFilterBB) + CurDAG->viewGraph("isel input for " + BlockName); // Third, instruction select all of the operations to machine code, adding the // code to the MachineBasicBlock. @@ -769,7 +783,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { DEBUG(dbgs() << "Selected selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); - if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName); + if (ViewSchedDAGs && MatchFilterBB) + CurDAG->viewGraph("scheduler input for " + BlockName); // Schedule machine code. ScheduleDAGSDNodes *Scheduler = CreateScheduler(); @@ -779,7 +794,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { Scheduler->Run(CurDAG, FuncInfo->MBB); } - if (ViewSUnitDAGs) Scheduler->viewGraph(); + if (ViewSUnitDAGs && MatchFilterBB) Scheduler->viewGraph(); // Emit machine code to BB. This can change 'BB' to the last block being // inserted into. @@ -894,6 +909,8 @@ void SelectionDAGISel::DoInstructionSelection() { void SelectionDAGISel::PrepareEHLandingPad() { MachineBasicBlock *MBB = FuncInfo->MBB; + const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy()); + // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MCSymbol *Label = MF->getMMI().addLandingPad(MBB); @@ -901,13 +918,70 @@ void SelectionDAGISel::PrepareEHLandingPad() { // Assign the call site to the landing pad's begin label. MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]); - const MCInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL); + const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL); BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); + if (TM.getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::MSVC) { + // Make virtual registers and a series of labels that fill in values for the + // clauses. + auto &RI = MF->getRegInfo(); + FuncInfo->ExceptionSelectorVirtReg = RI.createVirtualRegister(PtrRC); + + // Get all invoke BBs that will unwind into the clause BBs. + SmallVector<MachineBasicBlock *, 4> InvokeBBs(MBB->pred_begin(), + MBB->pred_end()); + + // Emit separate machine basic blocks with separate labels for each clause + // before the main landing pad block. 
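+    // Illustrative sketch of the control flow built below (not emitted
+    // code): each invoke that used to unwind directly to this landing pad
+    // now branches to a per-clause block, and each clause block defines a
+    // typeid vreg that feeds a PHI in the main landing pad:
+    //
+    //   invoke.bb --> clause0.bb (typeid0) --+
+    //   invoke.bb --> clause1.bb (typeid1) --+--> lpad.bb (selector = PHI)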
+    const BasicBlock *LLVMBB = MBB->getBasicBlock();
+    const LandingPadInst *LPadInst = LLVMBB->getLandingPadInst();
+    MachineInstrBuilder SelectorPHI = BuildMI(
+        *MBB, MBB->begin(), SDB->getCurDebugLoc(), TII->get(TargetOpcode::PHI),
+        FuncInfo->ExceptionSelectorVirtReg);
+    for (unsigned I = 0, E = LPadInst->getNumClauses(); I != E; ++I) {
+      MachineBasicBlock *ClauseBB = MF->CreateMachineBasicBlock(LLVMBB);
+      MF->insert(MBB, ClauseBB);
+
+      // Add the edge from the invoke to the clause.
+      for (MachineBasicBlock *InvokeBB : InvokeBBs)
+        InvokeBB->addSuccessor(ClauseBB);
+
+      // Mark the clause as a landing pad or MI passes will delete it.
+      ClauseBB->setIsLandingPad();
+
+      GlobalValue *ClauseGV = ExtractTypeInfo(LPadInst->getClause(I));
+
+      // Start the BB with a label.
+      MCSymbol *ClauseLabel = MF->getMMI().addClauseForLandingPad(MBB);
+      BuildMI(*ClauseBB, ClauseBB->begin(), SDB->getCurDebugLoc(), II)
+          .addSym(ClauseLabel);
+
+      // Construct a simple BB that defines a register with the typeid constant.
+      FuncInfo->MBB = ClauseBB;
+      FuncInfo->InsertPt = ClauseBB->end();
+      unsigned VReg = SDB->visitLandingPadClauseBB(ClauseGV, MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+
+      // Add the typeid virtual register to the phi in the main landing pad.
+      SelectorPHI.addReg(VReg).addMBB(ClauseBB);
+    }
+
+    // Remove the edge from the invoke to the lpad.
+    for (MachineBasicBlock *InvokeBB : InvokeBBs)
+      InvokeBB->removeSuccessor(MBB);
+
+    // Restore FuncInfo back to its previous state and select the main landing
+    // pad block.
+    FuncInfo->MBB = MBB;
+    FuncInfo->InsertPt = MBB->end();
+    return;
+  }
+
   // Mark exception register as live in.
-  const TargetLowering *TLI = getTargetLowering();
-  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
   if (unsigned Reg = TLI->getExceptionPointerRegister())
     FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
@@ -1042,7 +1116,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   // Initialize the Fast-ISel state, if needed.
   FastISel *FastIS = nullptr;
   if (TM.Options.EnableFastISel)
-    FastIS = getTargetLowering()->createFastISel(*FuncInfo, LibInfo);
+    FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
 
   // Iterate over all basic blocks in the function.
   ReversePostOrderTraversal<const Function*> RPOT(&Fn);
@@ -1096,7 +1170,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
         ++NumEntryBlocks;
 
         // Lower any arguments needed in this block if this is the entry block.
-        if (!FastIS->LowerArguments()) {
+        if (!FastIS->lowerArguments()) {
           // Fast isel failed to lower these arguments
           ++NumFastIselFailLowerArguments;
           if (EnableFastISelAbortArgs)
@@ -1134,7 +1208,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
         FastIS->recomputeInsertPt();
 
         // Try to select the instruction with FastISel.
-        if (FastIS->SelectInstruction(Inst)) {
+        if (FastIS->selectInstruction(Inst)) {
           --NumFastIselRemaining;
           ++NumFastIselSuccess;
           // If fast isel succeeded, skip over all the folded instructions, and
@@ -1729,7 +1803,7 @@ static SDNode *findGlueUse(SDNode *N) {
 /// This function recursively traverses up the operand chain, ignoring
 /// certain nodes.
 static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
-                          SDNode *Root, SmallPtrSet<SDNode*, 16> &Visited,
+                          SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
                           bool IgnoreChains) {
   // The NodeID's are given unique IDs where a node ID is guaranteed to be
   // greater than all of its (recursive) operands.  If we scan to a point where
@@ -1744,7 +1818,7 @@ static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
 
   // Don't revisit nodes if we already scanned it and didn't fail, we know we
   // won't fail if we scan it again.
-  if (!Visited.insert(Use))
+  if (!Visited.insert(Use).second)
     return false;
 
   for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
@@ -1861,8 +1935,8 @@ SDNode
   SDLoc dl(Op);
   MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(0));
   const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
-  unsigned Reg = getTargetLowering()->getRegisterByName(
-                 RegStr->getString().data(), Op->getValueType(0));
+  unsigned Reg =
+      TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0));
   SDValue New = CurDAG->getCopyFromReg(
                         CurDAG->getEntryNode(), dl, Reg, Op->getValueType(0));
   New->setNodeId(-1);
@@ -1874,8 +1948,8 @@ SDNode
   SDLoc dl(Op);
   MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
   const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
-  unsigned Reg = getTargetLowering()->getRegisterByName(
-                 RegStr->getString().data(), Op->getOperand(2).getValueType());
+  unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(),
+                                        Op->getOperand(2).getValueType());
   SDValue New = CurDAG->getCopyToReg(
                         CurDAG->getEntryNode(), dl, Reg, Op->getOperand(2));
   New->setNodeId(-1);
@@ -2375,7 +2449,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
     Result = !::CheckOpcode(Table, Index, N.getNode());
     return Index;
   case SelectionDAGISel::OPC_CheckType:
-    Result = !::CheckType(Table, Index, N, SDISel.getTargetLowering());
+    Result = !::CheckType(Table, Index, N, SDISel.TLI);
     return Index;
   case SelectionDAGISel::OPC_CheckChild0Type:
   case SelectionDAGISel::OPC_CheckChild1Type:
@@ -2385,14 +2459,15 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
   case SelectionDAGISel::OPC_CheckChild5Type:
   case SelectionDAGISel::OPC_CheckChild6Type:
   case SelectionDAGISel::OPC_CheckChild7Type:
-    Result = !::CheckChildType(Table, Index, N, SDISel.getTargetLowering(),
-                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Type);
+    Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
+                               Table[Index - 1] -
+                                   SelectionDAGISel::OPC_CheckChild0Type);
     return Index;
   case SelectionDAGISel::OPC_CheckCondCode:
     Result = !::CheckCondCode(Table, Index, N);
     return Index;
   case SelectionDAGISel::OPC_CheckValueType:
-    Result = !::CheckValueType(Table, Index, N, SDISel.getTargetLowering());
+    Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
     return Index;
   case SelectionDAGISel::OPC_CheckInteger:
     Result = !::CheckInteger(Table, Index, N);
@@ -2436,6 +2511,42 @@ struct MatchScope {
   bool HasChainNodesMatched, HasGlueResultNodesMatched;
 };
 
+/// \brief A DAG update listener to keep the matching state
+/// (i.e. RecordedNodes and MatchScope) up to date if the target is allowed to
+/// change the DAG while matching.  The X86 addressing mode matcher is an
+/// example of this.
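+/// (For example, a complex pattern callback that mutates the DAG may CSE or
+/// delete a node that is already referenced from RecordedNodes or from a
+/// MatchScope's NodeStack; the listener below patches those references so
+/// matching can continue safely.)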
+class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
+{
+  SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes;
+  SmallVectorImpl<MatchScope> &MatchScopes;
+public:
+  MatchStateUpdater(SelectionDAG &DAG,
+                    SmallVectorImpl<std::pair<SDValue, SDNode*> > &RN,
+                    SmallVectorImpl<MatchScope> &MS) :
+    SelectionDAG::DAGUpdateListener(DAG),
+    RecordedNodes(RN), MatchScopes(MS) { }
+
+  void NodeDeleted(SDNode *N, SDNode *E) {
+    // Some early-returns here to avoid the search if we deleted the node or
+    // if the update comes from MorphNodeTo (MorphNodeTo is the last thing we
+    // do, so it's unnecessary to update matching state at that point).
+    // Neither of these can occur currently because we only install this
+    // update listener while matching a complex pattern.
+    if (!E || E->isMachineOpcode())
+      return;
+    // Performing a linear search here does not matter because we almost
+    // never run this code.  You'd have to have a CSE during complex pattern
+    // matching.
+    for (auto &I : RecordedNodes)
+      if (I.first.getNode() == N)
+        I.first.setNode(E);
+
+    for (auto &I : MatchScopes)
+      for (auto &J : I.NodeStack)
+        if (J.getNode() == N)
+          J.setNode(E);
+  }
+};
 }
 
 SDNode *SelectionDAGISel::
 SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
@@ -2449,8 +2560,6 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
   case ISD::BasicBlock:
   case ISD::Register:
   case ISD::RegisterMask:
-  //case ISD::VALUETYPE:
-  //case ISD::CONDCODE:
   case ISD::HANDLENODE:
   case ISD::MDNODE_SDNODE:
   case ISD::TargetConstant:
@@ -2692,6 +2801,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       unsigned CPNum = MatcherTable[MatcherIndex++];
       unsigned RecNo = MatcherTable[MatcherIndex++];
       assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+
+      // If the target can modify the DAG during matching, keep the matching
+      // state consistent.
+      std::unique_ptr<MatchStateUpdater> MSU;
+      if (ComplexPatternFuncMutatesDAG())
+        MSU.reset(new MatchStateUpdater(*CurDAG, RecordedNodes,
+                                        MatchScopes));
+
       if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
                                RecordedNodes[RecNo].first, CPNum,
                                RecordedNodes))
@@ -2703,7 +2820,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       continue;
 
     case OPC_CheckType:
-      if (!::CheckType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+      if (!::CheckType(MatcherTable, MatcherIndex, N, TLI))
         break;
       continue;
 
@@ -2751,7 +2868,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
         MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
         if (CaseVT == MVT::iPTR)
-          CaseVT = getTargetLowering()->getPointerTy();
+          CaseVT = TLI->getPointerTy();
 
         // If the VT matches, then we will execute this case.
         if (CurNodeVT == CaseVT)
           break;
@@ -2773,7 +2890,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_CheckChild2Type: case OPC_CheckChild3Type:
     case OPC_CheckChild4Type: case OPC_CheckChild5Type:
     case OPC_CheckChild6Type: case OPC_CheckChild7Type:
-      if (!::CheckChildType(MatcherTable, MatcherIndex, N, getTargetLowering(),
+      if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
                             Opcode-OPC_CheckChild0Type))
         break;
       continue;
     case OPC_CheckCondCode:
       if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
       continue;
     case OPC_CheckValueType:
-      if (!::CheckValueType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+      if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI))
         break;
       continue;
     case OPC_CheckInteger:
@@ -2980,7 +3097,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       for (unsigned i = 0; i != NumVTs; ++i) {
         MVT::SimpleValueType VT =
           (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
-        if (VT == MVT::iPTR) VT = getTargetLowering()->getPointerTy().SimpleTy;
+        if (VT == MVT::iPTR)
+          VT = TLI->getPointerTy().SimpleTy;
         VTs.push_back(VT);
       }
 
@@ -3076,7 +3194,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       if (EmitNodeInfo & OPFL_MemRefs) {
         // Only attach load or store memory operands if the generated
         // instruction may load or store.
-        const MCInstrDesc &MCID = TM.getInstrInfo()->get(TargetOpc);
+        const MCInstrDesc &MCID = TII->get(TargetOpc);
         bool mayLoad = MCID.mayLoad();
         bool mayStore = MCID.mayStore();
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
new file mode 100644
index 000000000000..33c20d3f2195
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -0,0 +1,684 @@
+//===-- StatepointLowering.cpp - SDAGBuilder's statepoint code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
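+//
+// (Roughly, a statepoint sequence consists of a gc_statepoint call whose
+// token is consumed by gc_relocate calls, one per live GC pointer, and,
+// for calls that return a value, by a gc_result call that yields the
+// call's return value.)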
+//
+//===----------------------------------------------------------------------===//
+
+#include "StatepointLowering.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "statepoint-lowering"
+
+STATISTIC(NumSlotsAllocatedForStatepoints,
+          "Number of stack slots allocated for statepoints");
+STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
+STATISTIC(StatepointMaxSlotsRequired,
+          "Maximum number of stack slots required for a single statepoint");
+
+void
+StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
+  // Consistency check
+  assert(PendingGCRelocateCalls.empty() &&
+         "Trying to visit statepoint before finished processing previous one");
+  Locations.clear();
+  RelocLocations.clear();
+  NextSlotToAllocate = 0;
+  // Need to resize this on each safepoint - we need the two to stay in
+  // sync and the clear patterns of a SelectionDAGBuilder have no relation
+  // to FunctionLoweringInfo.
+  AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+  for (size_t i = 0; i < AllocatedStackSlots.size(); i++) {
+    AllocatedStackSlots[i] = false;
+  }
+}
+void StatepointLoweringState::clear() {
+  Locations.clear();
+  RelocLocations.clear();
+  AllocatedStackSlots.clear();
+  assert(PendingGCRelocateCalls.empty() &&
+         "cleared before statepoint sequence completed");
+}
+
+SDValue
+StatepointLoweringState::allocateStackSlot(EVT ValueType,
+                                           SelectionDAGBuilder &Builder) {
+
+  NumSlotsAllocatedForStatepoints++;
+
+  // The basic scheme here is to first look for a previously created stack slot
+  // which is not in use (accounting for the fact arbitrary slots may already
+  // be reserved), or to create a new stack slot and use it.
+
+  // If this doesn't succeed in 40000 iterations, something is seriously wrong
+  for (int i = 0; i < 40000; i++) {
+    assert(Builder.FuncInfo.StatepointStackSlots.size() ==
+               AllocatedStackSlots.size() &&
+           "broken invariant");
+    const size_t NumSlots = AllocatedStackSlots.size();
+    assert(NextSlotToAllocate <= NumSlots && "broken invariant");
+
+    if (NextSlotToAllocate >= NumSlots) {
+      assert(NextSlotToAllocate == NumSlots);
+      // record stats
+      if (NumSlots + 1 > StatepointMaxSlotsRequired) {
+        StatepointMaxSlotsRequired = NumSlots + 1;
+      }
+
+      SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+      const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+      AllocatedStackSlots.push_back(true);
+      return SpillSlot;
+    }
+    if (!AllocatedStackSlots[NextSlotToAllocate]) {
+      const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
+      AllocatedStackSlots[NextSlotToAllocate] = true;
+      return Builder.DAG.getFrameIndex(FI, ValueType);
+    }
+    // Note: We deliberately choose to advance this only on the failing path.
+    // Doing so on the succeeding path involves a bit of complexity that caused
+    // a minor bug previously.  Unless performance shows this matters, please
+    // keep this code as simple as possible.
+    NextSlotToAllocate++;
+  }
+  llvm_unreachable("infinite loop?");
+}
+
+/// Try to find existing copies of the incoming values in stack slots used for
+/// statepoint spilling.  If we can find a spill slot for the incoming value,
+/// mark that slot as allocated, and reuse the same slot for this safepoint.
+/// This helps to avoid series of loads and stores that only serve to reshuffle
+/// values on the stack between calls.
+static void reservePreviousStackSlotForValue(SDValue Incoming,
+                                             SelectionDAGBuilder &Builder) {
+
+  if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
+    // We won't need to spill this, so no need to check for previously
+    // allocated stack slots
+    return;
+  }
+
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+  if (Loc.getNode()) {
+    // duplicates in input
+    return;
+  }
+
+  // Search back for the load from a stack slot pattern to find the original
+  // slot we allocated for this value.  We could extend this to deal with
+  // simple modification patterns, but simply dealing with trivial load/store
+  // sequences helps a lot already.
+  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Incoming)) {
+    if (auto *FI = dyn_cast<FrameIndexSDNode>(Load->getBasePtr())) {
+      const int Index = FI->getIndex();
+      auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
+                           Builder.FuncInfo.StatepointStackSlots.end(), Index);
+      if (Itr == Builder.FuncInfo.StatepointStackSlots.end()) {
+        // not one of the lowering stack slots, can't reuse!
+        // TODO: Actually, we probably could reuse the stack slot if the value
+        // hasn't changed at all, but we'd need to look for intervening writes
+        return;
+      } else {
+        // This is one of our dedicated lowering slots
+        const int Offset =
+            std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
+        if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+          // stack slot already assigned to someone else, can't use it!
+          // TODO: currently we reserve space for gc arguments after doing
+          // normal allocation for deopt arguments.  We should reserve for
+          // _all_ deopt and gc arguments, then start allocating.  This
+          // will prevent some moves being inserted when vm state changes,
+          // but gc state doesn't between two calls.
+          return;
+        }
+        // Reserve this stack slot
+        Builder.StatepointLowering.reserveStackSlot(Offset);
+      }
+
+      // Cache this slot so we find it when going through the normal
+      // assignment loop.
+      SDValue Loc =
+          Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+      Builder.StatepointLowering.setLocation(Incoming, Loc);
+    }
+  }
+
+  // TODO: handle the case where a reloaded value flows through a phi to
+  // another safepoint.  e.g.
+  // bb1:
+  //  a' = relocated...
+  // bb2: % pred: bb1, bb3, bb4, etc.
+  //  a_phi = phi(a', ...)
+  // statepoint ... a_phi
+  // NOTE: This will require reasoning about cross basic block values.  This is
+  // decidedly non trivial and this might not be the right place to do it.  We
+  // don't really have the information we need here...
+
+  // TODO: handle simple updates.  If a value is modified and the original
+  // value is no longer live, it would be nice to put the modified value in the
+  // same slot.  This allows folding of the memory accesses for some
+  // instruction types (like an increment).
+  //   statepoint (i)
+  //   i1 = i+1
+  //   statepoint (i1)
+}
+
+/// Remove any duplicates (as SDValues) from the derived pointer pairs.  This
+/// is not required for correctness.  Its purpose is to reduce the size of
+/// the StackMap section.  It has no effect on the number of spill slots
+/// required or the actual lowering.
+static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases,
+                                   SmallVectorImpl<const Value *> &Ptrs,
+                                   SmallVectorImpl<const Value *> &Relocs,
+                                   SelectionDAGBuilder &Builder) {
+
+  // This is horribly inefficient, but I don't care right now
+  SmallSet<SDValue, 64> Seen;
+
+  SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs;
+  for (size_t i = 0; i < Ptrs.size(); i++) {
+    SDValue SD = Builder.getValue(Ptrs[i]);
+    // Only add non-duplicates
+    if (Seen.count(SD) == 0) {
+      NewBases.push_back(Bases[i]);
+      NewPtrs.push_back(Ptrs[i]);
+      NewRelocs.push_back(Relocs[i]);
+    }
+    Seen.insert(SD);
+  }
+  assert(Bases.size() >= NewBases.size());
+  assert(Ptrs.size() >= NewPtrs.size());
+  assert(Relocs.size() >= NewRelocs.size());
+  Bases = NewBases;
+  Ptrs = NewPtrs;
+  Relocs = NewRelocs;
+  assert(Ptrs.size() == Bases.size());
+  assert(Ptrs.size() == Relocs.size());
+}
+
+/// Extract the call from the statepoint, lower it, and return a pointer to
+/// the call node.  Also update NodeMap so that getValue(statepoint) will
+/// reference the lowered call result.
+static SDNode *lowerCallFromStatepoint(const CallInst &CI,
+                                       SelectionDAGBuilder &Builder) {
+
+  assert(Intrinsic::experimental_gc_statepoint ==
+             dyn_cast<IntrinsicInst>(&CI)->getIntrinsicID() &&
+         "function called must be the statepoint function");
+
+  ImmutableStatepoint StatepointOperands(&CI);
+
+  // Lower the actual call itself - This is a bit of a hack, but we want to
+  // avoid modifying the actual lowering code.  This is similar in intent to
+  // the LowerCallOperands mechanism used by PATCHPOINT, but is structured
+  // differently.  Hopefully, this is slightly more robust w.r.t. calling
+  // convention, return values, and other function attributes.
+  Value *ActualCallee = const_cast<Value *>(StatepointOperands.actualCallee());
+
+  std::vector<Value *> Args;
+  CallInst::const_op_iterator arg_begin = StatepointOperands.call_args_begin();
+  CallInst::const_op_iterator arg_end = StatepointOperands.call_args_end();
+  Args.insert(Args.end(), arg_begin, arg_end);
+  // TODO: remove the creation of a new instruction!  We should not be
+  // modifying the IR (even temporarily) at this point.
+  CallInst *Tmp = CallInst::Create(ActualCallee, Args);
+  Tmp->setTailCall(CI.isTailCall());
+  Tmp->setCallingConv(CI.getCallingConv());
+  Tmp->setAttributes(CI.getAttributes());
+  Builder.LowerCallTo(Tmp, Builder.getValue(ActualCallee), false);
+
+  // Handle the return value of the call, if any.
+  const bool HasDef = !Tmp->getType()->isVoidTy();
+  if (HasDef) {
+    // The value of the statepoint itself will be the value of the call
+    // itself.  We'll replace the actual call node shortly.  gc_result will
+    // grab this value.
+    Builder.setValue(&CI, Builder.getValue(Tmp));
+  } else {
+    // The token value is never used from here on, just generate a poison value
+    Builder.setValue(&CI, Builder.DAG.getIntPtrConstant(-1));
+  }
+  // Remove the fake entry we created so we don't have a hanging reference
+  // after we delete this node.
+  Builder.removeValue(Tmp);
+  delete Tmp;
+  Tmp = nullptr;
+
+  // Search for the call node
+  // The following code is essentially reverse engineering X86's
+  // LowerCallTo.
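+  // Illustrative shape of the sequence we expect to walk, with glue links
+  // written as ~>:  DAG root ~> ... ~> CALLSEQ_END ~> call node.  We follow
+  // glue links back from the root until we reach the CALLSEQ_END; the call
+  // node itself is glued directly inside it.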
+  SDNode *CallNode = nullptr;
+
+  // We just emitted a call, so it should be the last thing generated
+  SDValue Chain = Builder.DAG.getRoot();
+
+  // Find the closest CALLSEQ_END, walking back through lowered nodes if needed
+  SDNode *CallEnd = Chain.getNode();
+  int Sanity = 0;
+  while (CallEnd->getOpcode() != ISD::CALLSEQ_END) {
+    CallEnd = CallEnd->getGluedNode();
+    assert(CallEnd && "Cannot find call node");
+    assert(Sanity < 20 && "should have found call end already");
+    Sanity++;
+  }
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  assert(CallEnd->getGluedNode());
+
+  // Step back inside the CALLSEQ
+  CallNode = CallEnd->getGluedNode();
+  return CallNode;
+}
+
+/// Collect all gc pointers coming into the statepoint intrinsic, clean them
+/// up, and return three arrays:
+///   Bases - base pointers incoming to this statepoint
+///   Ptrs - derived pointers incoming to this statepoint
+///   Relocs - the gc_relocate corresponding to each base/ptr pair
+/// Elements of these arrays should be in one-to-one correspondence with each
+/// other, i.e. Bases[i] and Ptrs[i] are from the same gcrelocate call
+static void
+getIncomingStatepointGCValues(SmallVectorImpl<const Value *> &Bases,
+                              SmallVectorImpl<const Value *> &Ptrs,
+                              SmallVectorImpl<const Value *> &Relocs,
+                              ImmutableCallSite Statepoint,
+                              SelectionDAGBuilder &Builder) {
+  // Search for relocated pointers.  Note that working backwards from the
+  // gc_relocates ensures that we only get pairs which are actually relocated
+  // and used after the statepoint.
+  // TODO: This logic should probably become a utility function in Statepoint.h
+  for (const User *U : cast<CallInst>(Statepoint.getInstruction())->users()) {
+    if (!isGCRelocate(U)) {
+      continue;
+    }
+    GCRelocateOperands relocateOpers(U);
+    Relocs.push_back(cast<Value>(U));
+    Bases.push_back(relocateOpers.basePtr());
+    Ptrs.push_back(relocateOpers.derivedPtr());
+  }
+
+  // Remove any redundant llvm::Values which map to the same SDValue as another
+  // input.  Also has the effect of removing duplicates in the original
+  // llvm::Value input list as well.  This is a useful optimization for
+  // reducing the size of the StackMap section.  It has no other impact.
+  removeDuplicatesGCPtrs(Bases, Ptrs, Relocs, Builder);
+
+  assert(Bases.size() == Ptrs.size() && Ptrs.size() == Relocs.size());
+}
+
+/// Spill a value incoming to the statepoint.  It might be either part of the
+/// vmstate or the gcstate.  In both cases unconditionally spill it on the
+/// stack unless it is a null constant.
+/// Return a pair: the first element is the frame index containing the saved
+/// value, the second is the outgoing chain from the emitted store.
+static std::pair<SDValue, SDValue>
+spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
+                             SelectionDAGBuilder &Builder) {
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+
+  // Emit a new store if we didn't do it for this ptr before
+  if (!Loc.getNode()) {
+    Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
+                                                       Builder);
+    assert(isa<FrameIndexSDNode>(Loc));
+    int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+    // We use TargetFrameIndex so that isel will not select it into LEA
+    Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+    // TODO: We can create a TokenFactor node instead of
+    //       chaining stores one after another; this may allow
+    //       a bit more optimal scheduling for them
+    Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+                                 MachinePointerInfo::getFixedStack(Index),
+                                 false, false, 0);
+
+    Builder.StatepointLowering.setLocation(Incoming, Loc);
+  }
+
+  assert(Loc.getNode());
+  return std::make_pair(Loc, Chain);
+}
+
+/// Lower a single value incoming to a statepoint node.  This value can be
+/// either a deopt value or a gc value; the handling is the same.  We special
+/// case constants and allocas, then fall back to spilling if required.
+static void lowerIncomingStatepointValue(SDValue Incoming,
+                                         SmallVectorImpl<SDValue> &Ops,
+                                         SelectionDAGBuilder &Builder) {
+  SDValue Chain = Builder.getRoot();
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+    // If the original value was a constant, make sure it gets recorded as
+    // such in the stackmap.  This is required so that the consumer can
+    // parse any internal format to the deopt state.  It also handles null
+    // pointers and other constant pointers in GC states
+    Ops.push_back(
+        Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+    Ops.push_back(Builder.DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+  } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+    // This handles allocas as arguments to the statepoint
+    const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
+    Ops.push_back(
+        Builder.DAG.getTargetFrameIndex(FI->getIndex(), TLI.getPointerTy()));
+  } else {
+    // Otherwise, locate a spill slot and explicitly spill it so it
+    // can be found by the runtime later.  We currently do not support
+    // tracking values through callee saved registers to their eventual
+    // spill location.  This would be a useful optimization, but would
+    // need to be optional since it requires a lot of complexity on the
+    // runtime side which not all would support.
+    std::pair<SDValue, SDValue> Res =
+        spillIncomingStatepointValue(Incoming, Chain, Builder);
+    Ops.push_back(Res.first);
+    Chain = Res.second;
+  }
+
+  Builder.DAG.setRoot(Chain);
+}
+
+/// Lower deopt state and gc pointer arguments of the statepoint.  The actual
+/// lowering is described in lowerIncomingStatepointValue.  This function is
+/// responsible for lowering everything in the right position and playing some
+/// tricks to avoid redundant stack manipulation where possible.  On
+/// completion, 'Ops' will contain ready to use operands for machine code
+/// statepoint.  The chain nodes will have already been created and the DAG
+/// root will be set to the last value spilled (if any were).
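+/// (Illustrative layout of the operands appended to 'Ops' by this function:
+///    ConstantOp, <number of vm state values>, <lowered vm state values>...,
+///    base[0], derived[0], base[1], derived[1], ...)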
+static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+                                    ImmutableStatepoint Statepoint,
+                                    SelectionDAGBuilder &Builder) {
+
+  // Lower the deopt and gc arguments for this statepoint.  Layout will
+  // be: deopt argument length, deopt arguments.., gc arguments...
+
+  SmallVector<const Value *, 64> Bases, Ptrs, Relocations;
+  getIncomingStatepointGCValues(Bases, Ptrs, Relocations,
+                                Statepoint.getCallSite(), Builder);
+
+#ifndef NDEBUG
+  // Check that each of the gc pointers and bases we've gotten out of the
+  // safepoint is something the strategy thinks might be a pointer into the GC
+  // heap.  This is basically just here to help catch errors during statepoint
+  // insertion. TODO: This should actually be in the Verifier, but we can't get
+  // to the GCStrategy from there (yet).
+  if (Builder.GFI) {
+    GCStrategy &S = Builder.GFI->getStrategy();
+    for (const Value *V : Bases) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed base pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Ptrs) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed derived pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Relocations) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() && "non gc managed pointer relocated");
+      }
+    }
+  }
+#endif
+
+  // Before we actually start lowering (and allocating spill slots for values),
+  // reserve any stack slots which we judge to be profitable to reuse for a
+  // particular value.  This is purely an optimization over the code below and
+  // doesn't change semantics at all.  It is important for performance that we
+  // reserve slots for both deopt and gc values before lowering either.
+  for (auto I = Statepoint.vm_state_begin() + 1, E = Statepoint.vm_state_end();
+       I != E; ++I) {
+    Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+  for (unsigned i = 0; i < Bases.size() * 2; ++i) {
+    // Even elements will contain base, odd elements - derived ptr
+    const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2];
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+
+  // First, prefix the list with the number of unique values to be
+  // lowered.  Note that this is the number of *Values*, not the
+  // number of SDValues required to lower them.
+  const int NumVMSArgs = Statepoint.numTotalVMSArgs();
+  Ops.push_back(
+      Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+  Ops.push_back(Builder.DAG.getTargetConstant(NumVMSArgs, MVT::i64));
+
+  assert(NumVMSArgs + 1 == std::distance(Statepoint.vm_state_begin(),
+                                         Statepoint.vm_state_end()));
+
+  // The vm state arguments are lowered in an opaque manner.  We do
+  // not know what type of values are contained within.  We skip the
+  // first one since that happens to be the total number we lowered
+  // explicitly just above.  We could have left it in the loop and
+  // not done it explicitly, but it's far easier to understand this
+  // way.
+  for (auto I = Statepoint.vm_state_begin() + 1, E = Statepoint.vm_state_end();
+       I != E; ++I) {
+    const Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+  }
+
+  // Finally, go ahead and lower all the gc arguments.  There's no prefixed
+  // length for this one.  After lowering, we'll have the base and pointer
After lowering, we'll have the base and pointer + // arrays interwoven with each (lowered) base pointer immediately followed by + // its (lowered) derived pointer, i.e. + // (base[0], ptr[0], base[1], ptr[1], ...) + for (unsigned i = 0; i < Bases.size() * 2; ++i) { + // Even elements will contain base, odd elements - derived ptr + const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2]; + SDValue Incoming = Builder.getValue(V); + lowerIncomingStatepointValue(Incoming, Ops, Builder); + } +} + +void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) { + // The basic scheme here is that information about both the original call and + // the safepoint is encoded in the CallInst. We create a temporary call and + // lower it, then reverse engineer the calling sequence. + + // Check some preconditions for sanity + assert(isStatepoint(&CI) && + "function called must be the statepoint function"); + NumOfStatepoints++; + // Clear state + StatepointLowering.startNewStatepoint(*this); + +#ifndef NDEBUG + // Consistency check + for (const User *U : CI.users()) { + const CallInst *Call = cast<CallInst>(U); + if (isGCRelocate(Call)) + StatepointLowering.scheduleRelocCall(*Call); + } +#endif + + ImmutableStatepoint ISP(&CI); +#ifndef NDEBUG + // If this is a malformed statepoint, report it early to simplify debugging. + // This should catch any IR level mistake that's made when constructing or + // transforming statepoints. + ISP.verify(); + + // Check that the associated GCStrategy expects to encounter statepoints. + // TODO: This if should become an assert. For now, we allow the GCStrategy + // to be optional for backwards compatibility. This will only last a short + // period (i.e. a couple of weeks). + if (GFI) { + assert(GFI->getStrategy().useStatepoints() && + "GCStrategy does not expect to encounter statepoints"); + } +#endif + + // Lower statepoint vmstate and gcstate arguments + SmallVector<SDValue, 10> LoweredArgs; + lowerStatepointMetaArgs(LoweredArgs, ISP, *this); + + // Get the call node; we will replace it later with the statepoint + SDNode *CallNode = lowerCallFromStatepoint(CI, *this); + + // Construct the actual STATEPOINT node with all the appropriate arguments + // and return values. + + // TODO: Currently, all of these operands are being marked as read/write in + // PrologEpilogInserter.cpp; we should special case the VMState arguments + // and flags to be read-only. + SmallVector<SDValue, 40> Ops; + + // Calculate and push starting position of vmstate arguments + // Call Node: Chain, Target, {Args}, RegMask, [Glue] + SDValue Glue; + if (CallNode->getGluedNode()) { + // Glue is always last operand + Glue = CallNode->getOperand(CallNode->getNumOperands() - 1); + } + // Get number of arguments incoming directly into call node + unsigned NumCallRegArgs = + CallNode->getNumOperands() - (Glue.getNode() ?
4 : 3); + Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32)); + + // Add call target + SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0); + Ops.push_back(CallTarget); + + // Add call arguments + // Get position of register mask in the call + SDNode::op_iterator RegMaskIt; + if (Glue.getNode()) + RegMaskIt = CallNode->op_end() - 2; + else + RegMaskIt = CallNode->op_end() - 1; + Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt); + + // Add a leading constant argument with the Flags and the calling convention + // masked together + CallingConv::ID CallConv = CI.getCallingConv(); + int Flags = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + assert(Flags == 0 && "not expected to be used"); + Ops.push_back(DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64)); + Ops.push_back( + DAG.getTargetConstant(Flags | ((unsigned)CallConv << 1), MVT::i64)); + + // Insert all vmstate and gcstate arguments + Ops.insert(Ops.end(), LoweredArgs.begin(), LoweredArgs.end()); + + // Add register mask from call node + Ops.push_back(*RegMaskIt); + + // Add chain + Ops.push_back(CallNode->getOperand(0)); + + // Same for the glue, but we add it only if the original call had it + if (Glue.getNode()) + Ops.push_back(Glue); + + // Compute return values + SmallVector<EVT, 21> ValueVTs; + ValueVTs.push_back(MVT::Other); + ValueVTs.push_back(MVT::Glue); // provide a glue output since we consume one + // as input. This allows someone else to chain + // off us as needed. + SDVTList NodeTys = DAG.getVTList(ValueVTs); + + SDNode *StatepointMCNode = DAG.getMachineNode(TargetOpcode::STATEPOINT, + getCurSDLoc(), NodeTys, Ops); + + // Replace the original call + DAG.ReplaceAllUsesWith(CallNode, StatepointMCNode); // This may update Root + // Remove the original call node + DAG.DeleteNode(CallNode); + + // DON'T set the root - under the assumption that it's already set past the + // inserted node we created. + + // TODO: A better future implementation would be to emit a single variable + // argument, variable return value STATEPOINT node here and then hook up the + // return value of each gc.relocate to the respective output of the + // previously emitted STATEPOINT value. Unfortunately, this doesn't appear + // to actually be possible today. +} + +void SelectionDAGBuilder::visitGCResult(const CallInst &CI) { + // The result value of the gc_result is simply the result of the actual + // call. We've already emitted this, so just grab the value. + Instruction *I = cast<Instruction>(CI.getArgOperand(0)); + assert(isStatepoint(I) && + "first argument must be a statepoint token"); + + setValue(&CI, getValue(I)); +} + +void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { +#ifndef NDEBUG + // Consistency check + StatepointLowering.relocCallVisited(CI); +#endif + + GCRelocateOperands relocateOpers(&CI); + SDValue SD = getValue(relocateOpers.derivedPtr()); + + if (isa<ConstantSDNode>(SD) || isa<FrameIndexSDNode>(SD)) { + // We didn't need to spill these special cases (constants and allocas). + // See the handling in spillIncomingValueForStatepoint for detail.
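// Illustrative note: a null gc pointer reaches this path as a ConstantSDNode
// and an alloca as a FrameIndexSDNode; lowerIncomingStatepointValue passed
// both through unspilled, so the relocate can simply reuse the same SDValue.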
+ setValue(&CI, SD); + return; + } + + SDValue Loc = StatepointLowering.getRelocLocation(SD); + // Emit a new load if we did not emit it before + if (!Loc.getNode()) { + SDValue SpillSlot = StatepointLowering.getLocation(SD); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + + // Be conservative: flush all pending loads + // TODO: Probably we can be less restrictive on this; + // it may allow more scheduling opportunities + SDValue Chain = getRoot(); + + Loc = DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, + SpillSlot, MachinePointerInfo::getFixedStack(FI), false, + false, false, 0); + + StatepointLowering.setRelocLocation(SD, Loc); + + // Again, be conservative, don't emit pending loads + DAG.setRoot(Loc.getValue(1)); + } + + assert(Loc.getNode()); + setValue(&CI, Loc); +} diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.h b/lib/CodeGen/SelectionDAG/StatepointLowering.h new file mode 100644 index 000000000000..673112cf8bb5 --- /dev/null +++ b/lib/CodeGen/SelectionDAG/StatepointLowering.h @@ -0,0 +1,138 @@ +//===-- StatepointLowering.h - SDAGBuilder's statepoint code -*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes support code used by SelectionDAGBuilder when lowering a +// statepoint sequence in SelectionDAG IR. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include <vector> + +namespace llvm { +class SelectionDAGBuilder; + +/// This class tracks both per-statepoint and per-selectiondag information. +/// For each statepoint it tracks the locations of its gc values (incoming and +/// relocated) and the list of gc.relocate calls scheduled for visiting (this +/// is used for a debug mode consistency check only). The spill slot tracking +/// works in concert with information in FunctionLoweringInfo. +class StatepointLoweringState { +public: + StatepointLoweringState() : NextSlotToAllocate(0) { + } + + /// Reset all state tracking for a newly encountered safepoint. Also + /// performs some consistency checking. + void startNewStatepoint(SelectionDAGBuilder &Builder); + + /// Clear the memory usage of this object. This is called from + /// SelectionDAGBuilder::clear. We require this is never called in the + /// midst of processing a statepoint sequence. + void clear(); + + /// Returns the spill location of a value incoming to the current + /// statepoint. Will return SDValue() if this value hasn't been + /// spilled. Otherwise, the value has already been spilled and no + /// further action is required by the caller. + SDValue getLocation(SDValue val) { + if (!Locations.count(val)) + return SDValue(); + return Locations[val]; + } + void setLocation(SDValue val, SDValue Location) { + assert(!Locations.count(val) && + "Trying to allocate already allocated location"); + Locations[val] = Location; + } + + /// Returns the relocated value for a given input pointer. Will + /// return SDValue() if this value hasn't yet been reloaded from + /// its stack slot after the statepoint.
Otherwise, the value + /// has already been reloaded and the SDValue of that reload will + /// be returned. Note that VMState values are spilled but not + /// reloaded (since they don't change at the safepoint unless + /// also listed in the GC pointer section) and will thus never + /// be in this map. + SDValue getRelocLocation(SDValue val) { + if (!RelocLocations.count(val)) + return SDValue(); + return RelocLocations[val]; + } + void setRelocLocation(SDValue val, SDValue Location) { + assert(!RelocLocations.count(val) && + "Trying to allocate already allocated location"); + RelocLocations[val] = Location; + } + + /// Record the fact that we expect to encounter a given gc_relocate + /// before the next statepoint. If we don't see it, we'll report + /// an assertion. + void scheduleRelocCall(const CallInst &RelocCall) { + PendingGCRelocateCalls.push_back(&RelocCall); + } + /// Remove this gc_relocate from the list we're expecting to see + /// before the next statepoint. If we weren't expecting to see + /// it, we'll report an assertion. + void relocCallVisited(const CallInst &RelocCall) { + SmallVectorImpl<const CallInst *>::iterator itr = + std::find(PendingGCRelocateCalls.begin(), PendingGCRelocateCalls.end(), + &RelocCall); + assert(itr != PendingGCRelocateCalls.end() && + "Visited unexpected gcrelocate call"); + PendingGCRelocateCalls.erase(itr); + } + + // TODO: Should add consistency tracking to ensure we encounter + // expected gc_result calls too. + + /// Get a stack slot we can use to store a value of type ValueType. This + /// will hopefully be a recycled slot from another statepoint. + SDValue allocateStackSlot(EVT ValueType, SelectionDAGBuilder &Builder); + + void reserveStackSlot(int Offset) { + assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() && + "out of bounds"); + assert(!AllocatedStackSlots[Offset] && "already reserved!"); + assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!"); + AllocatedStackSlots[Offset] = true; + } + bool isStackSlotAllocated(int Offset) { + assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() && + "out of bounds"); + return AllocatedStackSlots[Offset]; + } + +private: + /// Maps a pre-relocation value (a gc pointer directly incoming into the + /// statepoint) to its location (currently only stack slots) + DenseMap<SDValue, SDValue> Locations; + /// Maps a pre-relocated value to its new relocated location + DenseMap<SDValue, SDValue> RelocLocations; + + /// A boolean indicator for each slot listed in the FunctionInfo as to + /// whether it has been used in the current statepoint. Since we try to + /// preserve stack slots across safepoints, there can be gaps in which + /// slots have been allocated.
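// Illustrative scenario (an assumption, not from the patch): if an earlier
// statepoint used slots 0-2 and reserveStackSlot(1) pins slot 1 for a value
// this statepoint shares with it, allocateStackSlot can recycle 0 and 2 but
// must skip 1 - hence a per-slot bitmap rather than a simple high-water mark.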
+ SmallVector<bool, 50> AllocatedStackSlots; + + /// Points just beyond the last slot known to have been allocated + unsigned NextSlotToAllocate; + + /// Keep track of pending gcrelocate calls for consistency check + SmallVector<const CallInst *, 10> PendingGCRelocateCalls; +}; +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 05ace413bfdf..72e0aca84080 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -31,13 +31,13 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <cctype> using namespace llvm; -/// NOTE: The constructor takes ownership of TLOF. -TargetLowering::TargetLowering(const TargetMachine &tm, - const TargetLoweringObjectFile *tlof) - : TargetLoweringBase(tm, tlof) {} +/// NOTE: The TargetMachine owns TLOF. +TargetLowering::TargetLowering(const TargetMachine &tm) + : TargetLoweringBase(tm) {} const char *TargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; @@ -1283,36 +1283,53 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // (zext x) == C --> x == (trunc C) - if (DCI.isBeforeLegalize() && N0->hasOneUse() && - (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { + // (sext x) == C --> x == (trunc C) + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + DCI.isBeforeLegalize() && N0->hasOneUse()) { unsigned MinBits = N0.getValueSizeInBits(); - SDValue PreZExt; + SDValue PreExt; + bool Signed = false; if (N0->getOpcode() == ISD::ZERO_EXTEND) { // ZExt MinBits = N0->getOperand(0).getValueSizeInBits(); - PreZExt = N0->getOperand(0); + PreExt = N0->getOperand(0); } else if (N0->getOpcode() == ISD::AND) { // DAGCombine turns costly ZExts into ANDs if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) if ((C->getAPIntValue()+1).isPowerOf2()) { MinBits = C->getAPIntValue().countTrailingOnes(); - PreZExt = N0->getOperand(0); + PreExt = N0->getOperand(0); } + } else if (N0->getOpcode() == ISD::SIGN_EXTEND) { + // SExt + MinBits = N0->getOperand(0).getValueSizeInBits(); + PreExt = N0->getOperand(0); + Signed = true; } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) { - // ZEXTLOAD + // ZEXTLOAD / SEXTLOAD if (LN0->getExtensionType() == ISD::ZEXTLOAD) { MinBits = LN0->getMemoryVT().getSizeInBits(); - PreZExt = N0; + PreExt = N0; + } else if (LN0->getExtensionType() == ISD::SEXTLOAD) { + Signed = true; + MinBits = LN0->getMemoryVT().getSizeInBits(); + PreExt = N0; } } + // Figure out how many bits we need to preserve this constant. + unsigned ReqdBits = Signed ? + C1.getBitWidth() - C1.getNumSignBits() + 1 : + C1.getActiveBits(); + // Make sure we're not losing bits from the constant. if (MinBits > 0 && - MinBits < C1.getBitWidth() && MinBits >= C1.getActiveBits()) { + MinBits < C1.getBitWidth() && + MinBits >= ReqdBits) { EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits); if (isTypeDesirableForOp(ISD::SETCC, MinVT)) { // Will get folded away. 
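// Worked example (illustrative): for (sext i8 %x) == i32 -1, C1 is all ones,
// so C1.getNumSignBits() == 32 and ReqdBits == 32 - 32 + 1 == 1. MinBits is
// 8, which is >= ReqdBits and < 32, so the compare is rewritten below as
// (%x == i8 -1) on the narrower type.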
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt); SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT); return DAG.getSetCC(dl, VT, Trunc, C, Cond); } @@ -2177,7 +2194,8 @@ getRegForInlineAsmConstraint(const std::string &Constraint, std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr)); // Figure out which register class contains this reg. - const TargetRegisterInfo *RI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *RI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), E = RI->regclass_end(); RCI != E; ++RCI) { const TargetRegisterClass *RC = *RCI; @@ -2239,14 +2257,11 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( // Do a prepass over the constraints, canonicalizing them, and building up the // ConstraintOperands list. - InlineAsm::ConstraintInfoVector - ConstraintInfos = IA->ParseConstraints(); - unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. unsigned ResNo = 0; // ResNo - The result number of the next output. - for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) { - ConstraintOperands.push_back(AsmOperandInfo(ConstraintInfos[i])); + for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) { + ConstraintOperands.emplace_back(std::move(CI)); AsmOperandInfo &OpInfo = ConstraintOperands.back(); // Update multiple alternative constraint count. @@ -2325,7 +2340,7 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( } // If we have multiple alternative constraints, select the best alternative. - if (ConstraintInfos.size()) { + if (ConstraintOperands.size()) { if (maCount) { unsigned bestMAIndex = 0; int bestWeight = -1; @@ -2641,11 +2656,13 @@ SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl, /// \brief Given an ISD::SDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by -/// multiplying by a magic number. See: -/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html> +/// multiplying by a magic number. +/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, bool IsAfterLegalization, std::vector<SDNode *> *Created) const { + assert(Created && "No vector to hold sdiv ops."); + EVT VT = N->getValueType(0); SDLoc dl(N); @@ -2673,38 +2690,36 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor, // If d > 0 and m < 0, add the numerator if (Divisor.isStrictlyPositive() && magics.m.isNegative()) { Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0)); - if (Created) - Created->push_back(Q.getNode()); + Created->push_back(Q.getNode()); } // If d < 0 and m > 0, subtract the numerator. 
if (Divisor.isNegative() && magics.m.isStrictlyPositive()) { Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0)); - if (Created) - Created->push_back(Q.getNode()); + Created->push_back(Q.getNode()); } // Shift right algebraic if shift value is nonzero if (magics.s > 0) { Q = DAG.getNode(ISD::SRA, dl, VT, Q, DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType()))); - if (Created) - Created->push_back(Q.getNode()); + Created->push_back(Q.getNode()); } // Extract the sign bit and add it to the quotient SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(VT.getScalarSizeInBits() - 1, getShiftAmountTy(Q.getValueType()))); - if (Created) - Created->push_back(T.getNode()); + Created->push_back(T.getNode()); return DAG.getNode(ISD::ADD, dl, VT, Q, T); } /// \brief Given an ISD::UDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by -/// multiplying by a magic number. See: -/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html> +/// multiplying by a magic number. +/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, bool IsAfterLegalization, std::vector<SDNode *> *Created) const { + assert(Created && "No vector to hold udiv ops."); + EVT VT = N->getValueType(0); SDLoc dl(N); @@ -2725,8 +2740,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor, unsigned Shift = Divisor.countTrailingZeros(); Q = DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(Shift, getShiftAmountTy(Q.getValueType()))); - if (Created) - Created->push_back(Q.getNode()); + Created->push_back(Q.getNode()); // Get magic number for the shifted divisor. magics = Divisor.lshr(Shift).magicu(Shift); @@ -2744,8 +2758,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor, DAG.getConstant(magics.m, VT)).getNode(), 1); else return SDValue(); // No mulhu or equivalent - if (Created) - Created->push_back(Q.getNode()); + + Created->push_back(Q.getNode()); if (magics.a == 0) { assert(magics.s < Divisor.getBitWidth() && @@ -2754,15 +2768,12 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor, DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType()))); } else { SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q); - if (Created) - Created->push_back(NPQ.getNode()); + Created->push_back(NPQ.getNode()); NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ, DAG.getConstant(1, getShiftAmountTy(NPQ.getValueType()))); - if (Created) - Created->push_back(NPQ.getNode()); + Created->push_back(NPQ.getNode()); NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q); - if (Created) - Created->push_back(NPQ.getNode()); + Created->push_back(NPQ.getNode()); return DAG.getNode(ISD::SRL, dl, VT, NPQ, DAG.getConstant(magics.s-1, getShiftAmountTy(NPQ.getValueType()))); } @@ -2785,7 +2796,7 @@ verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const { bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, SelectionDAG &DAG, SDValue LL, SDValue LH, - SDValue RL, SDValue RH) const { + SDValue RL, SDValue RH) const { EVT VT = N->getValueType(0); SDLoc dl(N); @@ -2818,8 +2829,8 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, // The inputs are both zero-extended. if (HasUMUL_LOHI) { // We can emit a umul_lohi.
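// A standalone sketch of the multiply-by-magic-number lowering that
// BuildSDIV/BuildUDIV above emit, specialized to unsigned divide-by-7 in 32
// bits. The constant follows Hacker's Delight; this illustrates the technique
// only and is not LLVM's code path.
#include <cstdint>

static uint32_t udiv7(uint32_t N) {
  const uint32_t Magic = 0x24924925u;                 // low 32 bits of ceil(2^35 / 7)
  uint32_t Q = uint32_t((uint64_t(N) * Magic) >> 32); // MULHU N, Magic
  uint32_t NPQ = ((N - Q) >> 1) + Q;                  // the magics.a != 0 fixup above
  return NPQ >> 2;                                    // final shift by magics.s - 1
}
// e.g. udiv7(100) == 14 and udiv7(0xFFFFFFFFu) == 0xFFFFFFFFu / 7.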
- Lo = DAG.getNode(ISD::UMUL_LOHI, dl, - DAG.getVTList(HiLoVT, HiLoVT), LL, RL); + Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL, + RL); Hi = SDValue(Lo.getNode(), 1); return true; } @@ -2834,8 +2845,8 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, // The input values are both sign-extended. if (HasSMUL_LOHI) { // We can emit a smul_lohi. - Lo = DAG.getNode(ISD::SMUL_LOHI, dl, - DAG.getVTList(HiLoVT, HiLoVT), LL, RL); + Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL, + RL); Hi = SDValue(Lo.getNode(), 1); return true; } diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index f7c64dac6124..0be00f0e7ce5 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -144,7 +144,7 @@ namespace { LLVMContext &C = F.getContext(); BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F); Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), - Type::getInt32Ty(C), NULL); + Type::getInt32Ty(C), nullptr); Constant *PersFn = F.getParent()-> getOrInsertFunction("__gcc_personality_v0", diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index b0950ded270a..7fd8107707c8 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -98,7 +99,7 @@ bool SjLjEHPrepare::doInitialization(Module &M) { VoidPtrTy, // __personality VoidPtrTy, // __lsda ArrayType::get(VoidPtrTy, 5), // __jbuf - NULL); + nullptr); RegisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), PointerType::getUnqual(FunctionContextTy), (Type *)nullptr); @@ -138,8 +139,8 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) { /// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until /// we reach blocks we've already seen. static void MarkBlocksLiveIn(BasicBlock *BB, - SmallPtrSet<BasicBlock *, 64> &LiveBBs) { - if (!LiveBBs.insert(BB)) + SmallPtrSetImpl<BasicBlock *> &LiveBBs) { + if (!LiveBBs.insert(BB).second) return; // already been here. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) @@ -190,7 +191,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an alloca // because the value needs to be added to the global context list. - const TargetLowering *TLI = TM->getTargetLowering(); + const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering(); unsigned Align = TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy); FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", @@ -350,10 +351,8 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, continue; // Demote the PHIs to the stack. - for (SmallPtrSet<PHINode *, 8>::iterator I = PHIsToDemote.begin(), - E = PHIsToDemote.end(); - I != E; ++I) - DemotePHIToStack(*I); + for (PHINode *PN : PHIsToDemote) + DemotePHIToStack(PN); // Move the landingpad instruction back to the top of the landing pad block.
LPI->moveBefore(UnwindBlock->begin()); diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 24e94d11f888..97a5424aa560 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -37,6 +37,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" +#include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -60,27 +61,6 @@ void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -namespace { -static BlockFrequency Threshold; -} - -/// Decision threshold. A node gets the output value 0 if the weighted sum of -/// its inputs falls in the open interval (-Threshold;Threshold). -static BlockFrequency getThreshold() { return Threshold; } - -/// \brief Set the threshold for a given entry frequency. -/// -/// Set the threshold relative to \c Entry. Since the threshold is used as a -/// bound on the open interval (-Threshold;Threshold), 1 is the minimum -/// threshold. -static void setThreshold(const BlockFrequency &Entry) { - // Apparently 2 is a good threshold when Entry==2^14, but we need to scale - // it. Divide by 2^13, rounding as appropriate. - uint64_t Freq = Entry.getFrequency(); - uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12)); - Threshold = std::max(UINT64_C(1), Scaled); -} - /// Node - Each edge bundle corresponds to a Hopfield node. /// /// The node contains precomputed frequency data that only depends on the CFG, @@ -126,9 +106,9 @@ struct SpillPlacement::Node { /// clear - Reset per-query data, but preserve frequencies that only depend on // the CFG. - void clear() { + void clear(const BlockFrequency &Threshold) { BiasN = BiasP = Value = 0; - SumLinkWeights = getThreshold(); + SumLinkWeights = Threshold; Links.clear(); } @@ -166,7 +146,7 @@ struct SpillPlacement::Node { /// update - Recompute Value from Bias and Links. Return true when node /// preference changes. - bool update(const Node nodes[]) { + bool update(const Node nodes[], const BlockFrequency &Threshold) { // Compute the weighted sum of inputs. BlockFrequency SumN = BiasN; BlockFrequency SumP = BiasP; @@ -186,9 +166,9 @@ struct SpillPlacement::Node { // 2. It helps tame rounding errors when the links nominally sum to 0. // bool Before = preferReg(); - if (SumN >= SumP + getThreshold()) + if (SumN >= SumP + Threshold) Value = -1; - else if (SumP >= SumN + getThreshold()) + else if (SumP >= SumN + Threshold) Value = 1; else Value = 0; @@ -227,7 +207,7 @@ void SpillPlacement::activate(unsigned n) { if (ActiveNodes->test(n)) return; ActiveNodes->set(n); - nodes[n].clear(); + nodes[n].clear(Threshold); // Very large bundles usually come from big switches, indirect branches, // landing pads, or loops with many 'continue' statements. It is difficult to @@ -244,6 +224,18 @@ void SpillPlacement::activate(unsigned n) { } } +/// \brief Set the threshold for a given entry frequency. +/// +/// Set the threshold relative to \c Entry. Since the threshold is used as a +/// bound on the open interval (-Threshold;Threshold), 1 is the minimum +/// threshold. +void SpillPlacement::setThreshold(const BlockFrequency &Entry) { + // Apparently 2 is a good threshold when Entry==2^14, but we need to scale + // it. Divide by 2^13, rounding as appropriate. + uint64_t Freq = Entry.getFrequency(); + uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12)); + Threshold = std::max(UINT64_C(1), Scaled); +} /// addConstraints - Compute node biases and weights from a set of constraints. 
/// Set a bit in NodeMask for each active node. @@ -310,7 +302,7 @@ bool SpillPlacement::scanActiveBundles() { Linked.clear(); RecentPositive.clear(); for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) { - nodes[n].update(nodes); + nodes[n].update(nodes, Threshold); // A node that must spill, or a node without any links is not going to // change its value ever again, so exclude it from iterations. if (nodes[n].mustSpill()) @@ -330,7 +322,7 @@ void SpillPlacement::iterate() { // First update the recently positive nodes. They have likely received new // negative bias that will turn them off. while (!RecentPositive.empty()) - nodes[RecentPositive.pop_back_val()].update(nodes); + nodes[RecentPositive.pop_back_val()].update(nodes, Threshold); if (Linked.empty()) return; @@ -349,7 +341,7 @@ void SpillPlacement::iterate() { iteration == 0 ? Linked.rbegin() : std::next(Linked.rbegin()), E = Linked.rend(); I != E; ++I) { unsigned n = *I; - if (nodes[n].update(nodes)) { + if (nodes[n].update(nodes, Threshold)) { Changed = true; if (nodes[n].preferReg()) RecentPositive.push_back(n); @@ -363,7 +355,7 @@ void SpillPlacement::iterate() { for (SmallVectorImpl<unsigned>::const_iterator I = std::next(Linked.begin()), E = Linked.end(); I != E; ++I) { unsigned n = *I; - if (nodes[n].update(nodes)) { + if (nodes[n].update(nodes, Threshold)) { Changed = true; if (nodes[n].preferReg()) RecentPositive.push_back(n); diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h index 43fc7f50cef9..622361e7e80c 100644 --- a/lib/CodeGen/SpillPlacement.h +++ b/lib/CodeGen/SpillPlacement.h @@ -24,8 +24,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SPILLPLACEMENT_H -#define LLVM_CODEGEN_SPILLPLACEMENT_H +#ifndef LLVM_LIB_CODEGEN_SPILLPLACEMENT_H +#define LLVM_LIB_CODEGEN_SPILLPLACEMENT_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -40,7 +40,7 @@ class MachineBasicBlock; class MachineLoopInfo; class MachineBlockFrequencyInfo; -class SpillPlacement : public MachineFunctionPass { +class SpillPlacement : public MachineFunctionPass { struct Node; const MachineFunction *MF; const EdgeBundles *bundles; @@ -60,7 +60,11 @@ class SpillPlacement : public MachineFunctionPass { SmallVector<unsigned, 8> RecentPositive; // Block frequencies are computed once. Indexed by block number. - SmallVector<BlockFrequency, 4> BlockFrequencies; + SmallVector<BlockFrequency, 8> BlockFrequencies; + + /// Decision threshold. A node gets the output value 0 if the weighted sum of + /// its inputs falls in the open interval (-Threshold;Threshold). + BlockFrequency Threshold; public: static char ID; // Pass identification, replacement for typeid. @@ -152,6 +156,7 @@ private: void releaseMemory() override; void activate(unsigned); + void setThreshold(const BlockFrequency &Entry); }; } // end namespace llvm diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp deleted file mode 100644 index 064944894162..000000000000 --- a/lib/CodeGen/Spiller.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===-- llvm/CodeGen/Spiller.cpp - Spiller -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "Spiller.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveRangeEdit.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "spiller" - -namespace { - enum SpillerName { trivial, inline_ }; -} - -static cl::opt<SpillerName> -spillerOpt("spiller", - cl::desc("Spiller to use: (default: standard)"), - cl::Prefix, - cl::values(clEnumVal(trivial, "trivial spiller"), - clEnumValN(inline_, "inline", "inline spiller"), - clEnumValEnd), - cl::init(trivial)); - -// Spiller virtual destructor implementation. -Spiller::~Spiller() {} - -namespace { - -/// Utility class for spillers. -class SpillerBase : public Spiller { -protected: - MachineFunctionPass *pass; - MachineFunction *mf; - VirtRegMap *vrm; - LiveIntervals *lis; - MachineFrameInfo *mfi; - MachineRegisterInfo *mri; - const TargetInstrInfo *tii; - const TargetRegisterInfo *tri; - - /// Construct a spiller base. - SpillerBase(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) - : pass(&pass), mf(&mf), vrm(&vrm) - { - lis = &pass.getAnalysis<LiveIntervals>(); - mfi = mf.getFrameInfo(); - mri = &mf.getRegInfo(); - tii = mf.getTarget().getInstrInfo(); - tri = mf.getTarget().getRegisterInfo(); - } - - /// Add spill ranges for every use/def of the live interval, inserting loads - /// immediately before each use, and stores after each def. No folding or - /// remat is attempted. - void trivialSpillEverywhere(LiveRangeEdit& LRE) { - LiveInterval* li = &LRE.getParent(); - - DEBUG(dbgs() << "Spilling everywhere " << *li << "\n"); - - assert(li->weight != llvm::huge_valf && - "Attempting to spill already spilled value."); - - assert(!TargetRegisterInfo::isStackSlot(li->reg) && - "Trying to spill a stack slot."); - - DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n"); - - const TargetRegisterClass *trc = mri->getRegClass(li->reg); - unsigned ss = vrm->assignVirt2StackSlot(li->reg); - - // Iterate over reg uses/defs. - for (MachineRegisterInfo::reg_instr_iterator - regItr = mri->reg_instr_begin(li->reg); - regItr != mri->reg_instr_end();) { - - // Grab the use/def instr. - MachineInstr *mi = &*regItr; - - DEBUG(dbgs() << " Processing " << *mi); - - // Step regItr to the next use/def instr. - ++regItr; - - // Collect uses & defs for this instr. - SmallVector<unsigned, 2> indices; - bool hasUse = false; - bool hasDef = false; - for (unsigned i = 0; i != mi->getNumOperands(); ++i) { - MachineOperand &op = mi->getOperand(i); - if (!op.isReg() || op.getReg() != li->reg) - continue; - hasUse |= mi->getOperand(i).isUse(); - hasDef |= mi->getOperand(i).isDef(); - indices.push_back(i); - } - - // Create a new virtual register for the load and/or store. - unsigned NewVReg = LRE.create(); - - // Update the reg operands & kill flags. 
- for (unsigned i = 0; i < indices.size(); ++i) { - unsigned mopIdx = indices[i]; - MachineOperand &mop = mi->getOperand(mopIdx); - mop.setReg(NewVReg); - if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) { - mop.setIsKill(true); - } - } - assert(hasUse || hasDef); - - // Insert reload if necessary. - MachineBasicBlock::iterator miItr(mi); - if (hasUse) { - MachineInstrSpan MIS(miItr); - - tii->loadRegFromStackSlot(*mi->getParent(), miItr, NewVReg, ss, trc, - tri); - lis->InsertMachineInstrRangeInMaps(MIS.begin(), miItr); - } - - // Insert store if necessary. - if (hasDef) { - MachineInstrSpan MIS(miItr); - - tii->storeRegToStackSlot(*mi->getParent(), std::next(miItr), NewVReg, - true, ss, trc, tri); - lis->InsertMachineInstrRangeInMaps(std::next(miItr), MIS.end()); - } - } - } -}; - -} // end anonymous namespace - -namespace { - -/// Spills any live range using the spill-everywhere method with no attempt at -/// folding. -class TrivialSpiller : public SpillerBase { -public: - - TrivialSpiller(MachineFunctionPass &pass, MachineFunction &mf, - VirtRegMap &vrm) - : SpillerBase(pass, mf, vrm) {} - - void spill(LiveRangeEdit &LRE) override { - // Ignore spillIs - we don't use it. - trivialSpillEverywhere(LRE); - } -}; - -} // end anonymous namespace - -void Spiller::anchor() { } - -llvm::Spiller* llvm::createSpiller(MachineFunctionPass &pass, - MachineFunction &mf, - VirtRegMap &vrm) { - switch (spillerOpt) { - case trivial: return new TrivialSpiller(pass, mf, vrm); - case inline_: return createInlineSpiller(pass, mf, vrm); - } - llvm_unreachable("Invalid spiller optimization"); -} diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h index b7d5beaab1b2..08f99ec78adc 100644 --- a/lib/CodeGen/Spiller.h +++ b/lib/CodeGen/Spiller.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SPILLER_H -#define LLVM_CODEGEN_SPILLER_H +#ifndef LLVM_LIB_CODEGEN_SPILLER_H +#define LLVM_LIB_CODEGEN_SPILLER_H namespace llvm { @@ -31,11 +31,6 @@ namespace llvm { }; - /// Create and return a spiller object, as specified on the command line. - Spiller* createSpiller(MachineFunctionPass &pass, - MachineFunction &mf, - VirtRegMap &vrm); - /// Create and return a spiller that will insert spill code directly instead /// of deferring though VirtRegMap. Spiller *createInlineSpiller(MachineFunctionPass &pass, diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 7d4f568df27d..4c8801a3611a 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -40,16 +40,11 @@ STATISTIC(NumRepairs, "Number of invalid live ranges repaired"); // Split Analysis //===----------------------------------------------------------------------===// -SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, - const LiveIntervals &lis, +SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis, const MachineLoopInfo &mli) - : MF(vrm.getMachineFunction()), - VRM(vrm), - LIS(lis), - Loops(mli), - TII(*MF.getTarget().getInstrInfo()), - CurLI(nullptr), - LastSplitPoint(MF.getNumBlockIDs()) {} + : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli), + TII(*MF.getSubtarget().getInstrInfo()), CurLI(nullptr), + LastSplitPoint(MF.getNumBlockIDs()) {} void SplitAnalysis::clear() { UseSlots.clear(); @@ -125,10 +120,9 @@ void SplitAnalysis::analyzeUses() { // First get all the defs from the interval values. This provides the correct // slots for early clobbers. 
- for (LiveInterval::const_vni_iterator I = CurLI->vni_begin(), - E = CurLI->vni_end(); I != E; ++I) - if (!(*I)->isPHIDef() && !(*I)->isUnused()) - UseSlots.push_back((*I)->def); + for (const VNInfo *VNI : CurLI->valnos) + if (!VNI->isPHIDef() && !VNI->isUnused()) + UseSlots.push_back(VNI->def); // Get use slots from the use-def chain. const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -321,22 +315,14 @@ void SplitAnalysis::analyze(const LiveInterval *li) { //===----------------------------------------------------------------------===// /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. -SplitEditor::SplitEditor(SplitAnalysis &sa, - LiveIntervals &lis, - VirtRegMap &vrm, +SplitEditor::SplitEditor(SplitAnalysis &sa, LiveIntervals &lis, VirtRegMap &vrm, MachineDominatorTree &mdt, MachineBlockFrequencyInfo &mbfi) - : SA(sa), LIS(lis), VRM(vrm), - MRI(vrm.getMachineFunction().getRegInfo()), - MDT(mdt), - TII(*vrm.getMachineFunction().getTarget().getInstrInfo()), - TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()), - MBFI(mbfi), - Edit(nullptr), - OpenIdx(0), - SpillMode(SM_Partition), - RegAssign(Allocator) -{} + : SA(sa), LIS(lis), VRM(vrm), MRI(vrm.getMachineFunction().getRegInfo()), + MDT(mdt), TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()), + TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()), + MBFI(mbfi), Edit(nullptr), OpenIdx(0), SpillMode(SM_Partition), + RegAssign(Allocator) {} void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { Edit = &LRE; @@ -740,9 +726,7 @@ void SplitEditor::hoistCopiesForSize() { // Find the nearest common dominator for parent values with multiple // back-copies. If a single back-copy dominates, put it in DomPair.second. - for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); - VI != VE; ++VI) { - VNInfo *VNI = *VI; + for (VNInfo *VNI : LI->valnos) { if (VNI->isUnused()) continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); @@ -815,9 +799,7 @@ void SplitEditor::hoistCopiesForSize() { // Remove redundant back-copies that are now known to be dominated by another // def with the same value. SmallVector<VNInfo*, 8> BackCopies; - for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); - VI != VE; ++VI) { - VNInfo *VNI = *VI; + for (VNInfo *VNI : LI->valnos) { if (VNI->isUnused()) continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); @@ -836,16 +818,15 @@ void SplitEditor::hoistCopiesForSize() { bool SplitEditor::transferValues() { bool Skipped = false; RegAssignMap::const_iterator AssignI = RegAssign.begin(); - for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(), - ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) { - DEBUG(dbgs() << " blit " << *ParentI << ':'); - VNInfo *ParentVNI = ParentI->valno; + for (const LiveRange::Segment &S : Edit->getParent()) { + DEBUG(dbgs() << " blit " << S << ':'); + VNInfo *ParentVNI = S.valno; // RegAssign has holes where RegIdx 0 should be used.
- SlotIndex Start = ParentI->start; + SlotIndex Start = S.start; AssignI.advanceTo(Start); do { unsigned RegIdx; - SlotIndex End = ParentI->end; + SlotIndex End = S.end; if (!AssignI.valid()) { RegIdx = 0; } else if (AssignI.start() <= Start) { @@ -930,7 +911,7 @@ bool SplitEditor::transferValues() { ++MBB; } Start = End; - } while (Start != ParentI->end); + } while (Start != S.end); DEBUG(dbgs() << '\n'); } @@ -943,9 +924,7 @@ bool SplitEditor::transferValues() { void SplitEditor::extendPHIKillRanges() { // Extend live ranges to be live-out for successor PHI values. - for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(), - E = Edit->getParent().vni_end(); I != E; ++I) { - const VNInfo *PHIVNI = *I; + for (const VNInfo *PHIVNI : Edit->getParent().valnos) { if (PHIVNI->isUnused() || !PHIVNI->isPHIDef()) continue; unsigned RegIdx = RegAssign.lookup(PHIVNI->def); @@ -1019,12 +998,11 @@ void SplitEditor::deleteRematVictims() { SmallVector<MachineInstr*, 8> Dead; for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){ LiveInterval *LI = &LIS.getInterval(*I); - for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end(); - LII != LIE; ++LII) { + for (const LiveRange::Segment &S : LI->segments) { // Dead defs end at the dead slot. - if (LII->end != LII->valno->def.getDeadSlot()) + if (S.end != S.valno->def.getDeadSlot()) continue; - MachineInstr *MI = LIS.getInstructionFromIndex(LII->valno->def); + MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def); assert(MI && "Missing instruction for dead def"); MI->addRegisterDead(LI->reg, &TRI); @@ -1049,9 +1027,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { // the inserted copies. // Add the original defs from the parent interval. - for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(), - E = Edit->getParent().vni_end(); I != E; ++I) { - const VNInfo *ParentVNI = *I; + for (const VNInfo *ParentVNI : Edit->getParent().valnos) { if (ParentVNI->isUnused()) continue; unsigned RegIdx = RegAssign.lookup(ParentVNI->def); diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index 7048ee37631b..2e60c1445b81 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SPLITKIT_H -#define LLVM_CODEGEN_SPLITKIT_H +#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H +#define LLVM_LIB_CODEGEN_SPLITKIT_H #include "LiveRangeCalc.h" #include "llvm/ADT/ArrayRef.h" diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 370430c4c82a..faf94b67fe77 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -228,7 +228,7 @@ void StackColoring::dump() const { unsigned StackColoring::collectMarkers(unsigned NumSlot) { unsigned MarkersFound = 0; // Scan the function to find all lifetime markers. - // NOTE: We use the a reverse-post-order iteration to ensure that we obtain a + // NOTE: We use a reverse-post-order iteration to ensure that we obtain a // deterministic numbering, and because we'll need a post-order iteration // later for solving the liveness dataflow problem. 
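// Illustrative note: a depth-first walk from the entry block visits blocks in
// an order fixed entirely by each block's successor lists, so the lifetime
// marker numbering computed below is stable from run to run, independent of
// pointer values or allocation order.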
for (MachineBasicBlock *MBB : depth_first(MF)) { @@ -463,7 +463,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { if (!VI.Var) continue; if (SlotRemap.count(VI.Slot)) { - DEBUG(dbgs()<<"Remapping debug info for ["<<VI.Var->getName()<<"].\n"); + DEBUG(dbgs() << "Remapping debug info for [" + << DIVariable(VI.Var).getName() << "].\n"); VI.Slot = SlotRemap[VI.Slot]; FixedDbg++; } diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp index 3ba502fff695..767f43a420fd 100644 --- a/lib/CodeGen/StackMapLivenessAnalysis.cpp +++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp @@ -21,7 +21,7 @@ #include "llvm/CodeGen/StackMapLivenessAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" - +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -67,7 +67,7 @@ bool StackMapLiveness::runOnMachineFunction(MachineFunction &_MF) { DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: " << _MF.getName() << " **********\n"); MF = &_MF; - TRI = MF->getTarget().getRegisterInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); ++NumStackMapFuncVisited; // Skip this function if there are no patchpoints to process. @@ -123,5 +123,7 @@ uint32_t *StackMapLiveness::createRegisterMask() const { for (LivePhysRegs::const_iterator RI = LiveRegs.begin(), RE = LiveRegs.end(); RI != RE; ++RI) Mask[*RI / 32] |= 1U << (*RI % 32); + + TRI->adjustStackMapLiveOutMask(Mask); return Mask; } diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp index 1473fc184f84..f1d1160c9528 100644 --- a/lib/CodeGen/StackMaps.cpp +++ b/lib/CodeGen/StackMaps.cpp @@ -24,6 +24,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <iterator> using namespace llvm; @@ -83,7 +84,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, switch (MOI->getImm()) { default: llvm_unreachable("Unrecognized operand type."); case StackMaps::DirectMemRefOp: { - unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits(); + unsigned Size = + AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits(); assert((Size % 8) == 0 && "Need pointer size in bytes."); Size /= 8; unsigned Reg = (++MOI)->getReg(); @@ -122,7 +124,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, assert(TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) && "Virtreg operands should have been rewritten before now."); const TargetRegisterClass *RC = - AP.TM.getRegisterInfo()->getMinimalPhysRegClass(MOI->getReg()); + AP.TM.getSubtargetImpl()->getRegisterInfo()->getMinimalPhysRegClass( + MOI->getReg()); assert(!MOI->getSubReg() && "Physical subreg still around."); Locs.push_back( Location(Location::Register, RC->getSize(), MOI->getReg(), 0)); @@ -158,7 +161,7 @@ StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const { StackMaps::LiveOutVec StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const { assert(Mask && "No register mask specified"); - const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = AP.TM.getSubtargetImpl()->getRegisterInfo(); LiveOutVec LiveOuts; // Create a LiveOutReg for each bit that is set in the register mask. @@ -217,9 +220,18 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID, I != E; ++I) { // Constants are encoded as sign-extended integers. 
// -1 is directly encoded as .long 0xFFFFFFFF with no constant pool. - if (I->LocType == Location::Constant && - ((I->Offset + (int64_t(1)<<31)) >> 32) != 0) { + if (I->LocType == Location::Constant && !isInt<32>(I->Offset)) { I->LocType = Location::ConstantIndex; + // ConstPool is intentionally a MapVector of 'uint64_t's (as + // opposed to 'int64_t's). We should never be in a situation + // where we have to insert either the tombstone or the empty + // keys into a map, and for a DenseMap<uint64_t, T> these are + // (uint64_t)0 and (uint64_t)-1. They can be and are + // represented using 32 bit integers. + + assert((uint64_t)I->Offset != DenseMapInfo<uint64_t>::getEmptyKey() && + (uint64_t)I->Offset != DenseMapInfo<uint64_t>::getTombstoneKey() && + "empty and tombstone keys should fit in 32 bits!"); auto Result = ConstPool.insert(std::make_pair(I->Offset, I->Offset)); I->Offset = Result.first - ConstPool.begin(); } @@ -229,15 +241,19 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID, // entry. const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub( MCSymbolRefExpr::Create(MILabel, OutContext), - MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext), + MCSymbolRefExpr::Create(AP.CurrentFnSymForSize, OutContext), OutContext); - CSInfos.push_back(CallsiteInfo(CSOffsetExpr, ID, Locations, LiveOuts)); + CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations), + std::move(LiveOuts)); // Record the stack size of the current function. const MachineFrameInfo *MFI = AP.MF->getFrameInfo(); + const TargetRegisterInfo *RegInfo = AP.MF->getSubtarget().getRegisterInfo(); + const bool DynamicFrameSize = MFI->hasVarSizedObjects() || + RegInfo->needsStackRealignment(*(AP.MF)); FnStackSize[AP.CurrentFnSym] = - MFI->hasVarSizedObjects() ? UINT64_MAX : MFI->getStackSize(); + DynamicFrameSize ? UINT64_MAX : MFI->getStackSize(); } void StackMaps::recordStackMap(const MachineInstr &MI) { @@ -270,6 +286,18 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) { } #endif } +void StackMaps::recordStatepoint(const MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::STATEPOINT && + "expected statepoint"); + + StatepointOpers opers(&MI); + // Record all the deopt and gc operands (they're contiguous and run from the + // initial index to the end of the operand list) + const unsigned StartIdx = opers.getVarIdx(); + recordStackMapOpers(MI, 0xABCDEF00, + MI.operands_begin() + StartIdx, MI.operands_end(), + false); +} /// Emit the stackmap header. /// @@ -485,7 +513,7 @@ void StackMaps::serializeToStackMapSection() { MCContext &OutContext = AP.OutStreamer.getContext(); MCStreamer &OS = AP.OutStreamer; - const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = AP.TM.getSubtargetImpl()->getRegisterInfo(); // Create the section. 
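// Illustrative example for the ConstantIndex rewrite earlier in
// recordStackMapOpers: a Constant location holding 0x100000000 fails
// isInt<32>(), so its full 64-bit value is appended to ConstPool and the
// location keeps only the pool index, while a small constant such as -1 stays
// inline as a sign-extended 32-bit immediate.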
const MCSection *StackMapSection = diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index accfe7be18d9..a132805ac845 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/Passes.h" @@ -31,8 +32,10 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <cstdlib> using namespace llvm; @@ -85,7 +88,7 @@ bool StackProtector::runOnFunction(Function &Fn) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; - TLI = TM->getTargetLowering(); + TLI = TM->getSubtargetImpl()->getTargetLowering(); Attribute Attr = Fn.getAttributes().getAttribute( AttributeSet::FunctionIndex, "stack-protector-buffer-size"); @@ -168,7 +171,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) { } else if (const PHINode *PN = dyn_cast<PHINode>(U)) { // Keep track of what PHI nodes we have already visited to ensure // they are only visited once. - if (VisitedPHIs.insert(PN)) + if (VisitedPHIs.insert(PN).second) if (HasAddressTaken(PN)) return true; } else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { @@ -209,20 +212,16 @@ bool StackProtector::RequiresStackProtector() { Attribute::StackProtect)) return false; - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - BasicBlock *BB = I; - - for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; - ++II) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) { + for (const BasicBlock &BB : *F) { + for (const Instruction &I : BB) { + if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { if (AI->isArrayAllocation()) { // SSP-Strong: Enable protectors for any call to alloca, regardless // of size. if (Strong) return true; - if (const ConstantInt *CI = - dyn_cast<ConstantInt>(AI->getArraySize())) { + if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) { if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) { // A call to alloca with size >= SSPBufferSize requires // stack protectors. @@ -334,7 +333,7 @@ static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI, /// Returns true if the platform/triple supports the stackprotectorcreate pseudo /// node. 
static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI, - const TargetLoweringBase *TLI, const Triple &Trip, + const TargetLoweringBase *TLI, const Triple &TT, AllocaInst *&AI, Value *&StackGuardVar) { bool SupportsSelectionDAGSP = false; PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext()); @@ -343,9 +342,10 @@ static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI, Constant *OffsetVal = ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset); - StackGuardVar = ConstantExpr::getIntToPtr( - OffsetVal, PointerType::get(PtrTy, AddressSpace)); - } else if (Trip.getOS() == llvm::Triple::OpenBSD) { + StackGuardVar = + ConstantExpr::getIntToPtr(OffsetVal, PointerType::get(PtrTy, + AddressSpace)); + } else if (TT.isOSOpenBSD()) { StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy); cast<GlobalValue>(StackGuardVar) ->setVisibility(GlobalValue::HiddenVisibility); @@ -398,14 +398,13 @@ bool StackProtector::InsertStackProtectors() { InsertionPt = RI; // At this point we know that BB has a return statement so it *DOES* // have a terminator. - assert(InsertionPt != nullptr && "BB must have a terminator instruction at " - "this point."); + assert(InsertionPt != nullptr && + "BB must have a terminator instruction at this point."); } Function *Intrinsic = Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck); CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt); - } else { // If we do not support SelectionDAG based tail calls, generate IR level // tail calls. @@ -458,11 +457,17 @@ bool StackProtector::InsertStackProtectors() { LoadInst *LI1 = B.CreateLoad(StackGuardVar); LoadInst *LI2 = B.CreateLoad(AI); Value *Cmp = B.CreateICmpEQ(LI1, LI2); - B.CreateCondBr(Cmp, NewBB, FailBB); + unsigned SuccessWeight = + BranchProbabilityInfo::getBranchWeightStackProtector(true); + unsigned FailureWeight = + BranchProbabilityInfo::getBranchWeightStackProtector(false); + MDNode *Weights = MDBuilder(F->getContext()) + .createBranchWeights(SuccessWeight, FailureWeight); + B.CreateCondBr(Cmp, NewBB, FailBB, Weights); } } - // Return if we didn't modify any basic blocks. I.e., there are no return + // Return if we didn't modify any basic blocks. i.e., there are no return // statements in the function. 
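// For reference (an illustration, not part of the patch; %ok is a
// hypothetical block name): the weights added above mark the guard compare's
// success edge as overwhelmingly likely, conceptually
//   br i1 %cmp, label %ok, label %CallStackCheckFailBlk, !prof !N
//   !N = !{!"branch_weights", i32 SuccessWeight, i32 FailureWeight}
// so block placement keeps the failure path out of the hot code.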
if (!HasPrologue) return false; @@ -476,15 +481,17 @@ BasicBlock *StackProtector::CreateFailBB() { LLVMContext &Context = F->getContext(); BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F); IRBuilder<> B(FailBB); - if (Trip.getOS() == llvm::Triple::OpenBSD) { - Constant *StackChkFail = M->getOrInsertFunction( - "__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context), NULL); + if (Trip.isOSOpenBSD()) { + Constant *StackChkFail = + M->getOrInsertFunction("__stack_smash_handler", + Type::getVoidTy(Context), + Type::getInt8PtrTy(Context), nullptr); B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { - Constant *StackChkFail = M->getOrInsertFunction( - "__stack_chk_fail", Type::getVoidTy(Context), NULL); + Constant *StackChkFail = + M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context), + nullptr); B.CreateCall(StackChkFail); } B.CreateUnreachable(); diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 791168f5c05f..cc72e5e51f2c 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -28,7 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <vector> using namespace llvm; @@ -422,7 +422,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) { }); MFI = MF.getFrameInfo(); - TII = MF.getTarget().getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); LS = &getAnalysis<LiveStacks>(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); diff --git a/lib/CodeGen/StatepointExampleGC.cpp b/lib/CodeGen/StatepointExampleGC.cpp new file mode 100644 index 000000000000..802cf132b34d --- /dev/null +++ b/lib/CodeGen/StatepointExampleGC.cpp @@ -0,0 +1,54 @@ +//===-- StatepointExampleGC.cpp - An example statepoint GC strategy ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a GCStrategy which serves as an example of the usage +// of a statepoint based lowering strategy. This GCStrategy is intended to be +// suitable as a default implementation usable with any collector which can +// consume the standard stackmap format generated by statepoints, uses the +// default address space to distinguish between gc managed and non-gc managed +// pointers, and has reasonable relocation semantics. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GCStrategy.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Value.h" + +using namespace llvm; + +namespace { +class StatepointGC : public GCStrategy { +public: + StatepointGC() { + UseStatepoints = true; + // These options are all gc.root specific; we specify them so that the + // gc.root lowering code doesn't run. + InitRoots = false; + NeededSafePoints = 0; + UsesMetadata = false; + CustomRoots = false; + CustomSafePoints = false; + } + Optional<bool> isGCManagedPointer(const Value *V) const override { + // Method is only valid on pointer typed values. + PointerType *PT = cast<PointerType>(V->getType()); + // For the sake of this example GC, we arbitrarily pick addrspace(1) as our + // GC managed heap.
We know that a pointer into this heap needs to be + // updated and that no other pointer does. + return (1 == PT->getAddressSpace()); + } +}; +} + +static GCRegistry::Add<StatepointGC> +X("statepoint-example", "an example strategy for statepoint"); + +namespace llvm { +void linkStatepointExampleGC() {} +} diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 723a6297f36a..4377236323a7 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "tailduplication" @@ -135,8 +136,8 @@ bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); MMI = getAnalysisIfAvailable<MachineModuleInfo>(); MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); @@ -798,11 +799,9 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, RS->enterBasicBlock(PredBB); if (!PredBB->empty()) RS->forward(std::prev(PredBB->end())); - BitVector RegsLiveAtExit(TRI->getNumRegs()); - RS->getRegsUsed(RegsLiveAtExit, false); for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(), E = TailBB->livein_end(); I != E; ++I) { - if (!RegsLiveAtExit[*I]) + if (!RS->isRegUsed(*I, false)) // If a register is previously livein to the tail but it's not live // at the end of predecessor BB, then it should be added to its // livein list. diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index 883e9d1846d9..1557d10238e9 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -14,8 +14,8 @@ #include "llvm/Target/TargetFrameLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <cstdlib> using namespace llvm; @@ -26,7 +26,7 @@ TargetFrameLowering::~TargetFrameLowering() { /// the stack frame of the specified index. This is the default implementation /// which is overridden for some targets. int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { + int FI) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); return MFI->getObjectOffset(FI) + MFI->getStackSize() - getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); @@ -34,7 +34,7 @@ int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); // By default, assume all frame indices are referenced via whatever // getFrameRegister() says. 
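The StatepointExampleGC file above doubles as a template for clients. A minimal sketch of registering one's own strategy the same way; MyGC and the "my-gc" name are hypothetical:

#include "llvm/CodeGen/GCStrategy.h"
using namespace llvm;

namespace {
// Hypothetical strategy following the StatepointGC pattern above.
class MyGC : public GCStrategy {
public:
  MyGC() {
    UseStatepoints = true; // take the statepoint lowering path
    InitRoots = false;     // keep gc.root lowering out of the way
  }
};
}

// Static registration: the strategy becomes selectable by name, as in
// defining a function with the attribute gc "my-gc".
static GCRegistry::Add<MyGC> Y("my-gc", "a hypothetical example strategy");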
The target can override this if it's doing diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 83966bd0c208..608b8068759e 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -290,13 +291,15 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, Offset = 0; return true; } - unsigned BitSize = TM->getRegisterInfo()->getSubRegIdxSize(SubIdx); + unsigned BitSize = + TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIdxSize(SubIdx); // Convert bit size to byte size to be consistent with // MCRegisterClass::getSize(). if (BitSize % 8) return false; - int BitOffset = TM->getRegisterInfo()->getSubRegIdxOffset(SubIdx); + int BitOffset = + TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; @@ -305,7 +308,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); - if (!TM->getDataLayout()->isLittleEndian()) { + if (!TM->getSubtargetImpl()->getDataLayout()->isLittleEndian()) { Offset = RC->getSize() - (Offset + Size); } return true; @@ -370,6 +373,10 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI, return nullptr; } +void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + llvm_unreachable("Not a MachO target"); +} + bool TargetInstrInfo:: canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops) const { @@ -498,7 +505,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, const MachineOperand &MO = MI->getOperand(1-Ops[0]); MachineBasicBlock::iterator Pos = MI; - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags == MachineMemOperand::MOStore) storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI); @@ -562,8 +569,6 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, AliasAnalysis *AA) const { const MachineFunction &MF = *MI->getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetMachine &TM = MF.getTarget(); - const TargetInstrInfo &TII = *TM.getInstrInfo(); // Remat clients assume operand 0 is the defined register. if (!MI->getNumOperands() || !MI->getOperand(0).isReg()) @@ -582,7 +587,7 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, // redundant with subsequent checks, but it's target-independent, // simple, and a common case. 
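One note on the getNoopForMachoTarget() default added in the TargetInstrInfo.cpp hunk above: the base implementation now traps, so Mach-O targets are expected to override it. A hypothetical override, where MyTargetInstrInfo and MyTarget::NOP stand in for a real backend's names:

// Sketch of a target override; a real backend supplies its NOP encoding.
void MyTargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
  NopInst.setOpcode(MyTarget::NOP); // illustrative opcode name
}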
int FrameIdx = 0; - if (TII.isLoadFromStackSlot(MI, FrameIdx) && + if (isLoadFromStackSlot(MI, FrameIdx) && MF.getFrameInfo()->isImmutableObjectIndex(FrameIdx)) return true; @@ -640,6 +645,28 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, return true; } +int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + bool StackGrowsDown = + TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + int FrameSetupOpcode = getCallFrameSetupOpcode(); + int FrameDestroyOpcode = getCallFrameDestroyOpcode(); + + if (MI->getOpcode() != FrameSetupOpcode && + MI->getOpcode() != FrameDestroyOpcode) + return 0; + + int SPAdj = MI->getOperand(0).getImm(); + + if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) || + (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode)) + SPAdj = -SPAdj; + + return SPAdj; +} + /// isSchedulingBoundary - Test if the given instruction should be /// considered a scheduling boundary. This primarily includes labels /// and terminators. @@ -655,8 +682,8 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI, // saves compile time, because it doesn't require every single // stack slot reference to depend on the instruction that does the // modification. - const TargetLowering &TLI = *MF.getTarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI)) return true; @@ -746,14 +773,14 @@ TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, } /// Return the default expected latency for a def based on its opcode. -unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel, +unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel, const MachineInstr *DefMI) const { if (DefMI->isTransient()) return 0; if (DefMI->mayLoad()) - return SchedModel->LoadLatency; + return SchedModel.LoadLatency; if (isHighLatencyDef(DefMI->getOpcode())) - return SchedModel->HighLatency; + return SchedModel.HighLatency; return 1; } @@ -852,3 +879,77 @@ computeOperandLatency(const InstrItineraryData *ItinData, defaultDefLatency(ItinData->SchedModel, DefMI)); return InstrLatency; } + +bool TargetInstrInfo::getRegSequenceInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const { + assert((MI.isRegSequence() || + MI.isRegSequenceLike()) && "Instruction does not have the proper type"); + + if (!MI.isRegSequence()) + return getRegSequenceLikeInputs(MI, DefIdx, InputRegs); + + // We are looking at: + // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... + assert(DefIdx == 0 && "REG_SEQUENCE only has one def"); + for (unsigned OpIdx = 1, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx; + OpIdx += 2) { + const MachineOperand &MOReg = MI.getOperand(OpIdx); + const MachineOperand &MOSubIdx = MI.getOperand(OpIdx + 1); + assert(MOSubIdx.isImm() && + "One of the subindices of the reg_sequence is not an immediate"); + // Record Reg:SubReg, SubIdx.
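Stepping back to the new TargetInstrInfo::getSPAdjust() hook above: it returns the signed stack-pointer delta for call-frame setup/destroy pseudos and 0 for every other instruction, so a caller can simply accumulate it. A usage sketch under that assumption:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// Net SP adjustment across one block; ordinary instructions contribute 0.
static int blockSPAdjust(const MachineBasicBlock &MBB,
                         const TargetInstrInfo &TII) {
  int SPAdj = 0;
  for (const MachineInstr &MI : MBB)
    SPAdj += TII.getSPAdjust(&MI);
  return SPAdj;
}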
+ InputRegs.push_back(RegSubRegPairAndIdx(MOReg.getReg(), MOReg.getSubReg(), + (unsigned)MOSubIdx.getImm())); + } + return true; +} + +bool TargetInstrInfo::getExtractSubregInputs( + const MachineInstr &MI, unsigned DefIdx, + RegSubRegPairAndIdx &InputReg) const { + assert((MI.isExtractSubreg() || + MI.isExtractSubregLike()) && "Instruction does not have the proper type"); + + if (!MI.isExtractSubreg()) + return getExtractSubregLikeInputs(MI, DefIdx, InputReg); + + // We are looking at: + // Def = EXTRACT_SUBREG v0.sub1, sub0. + assert(DefIdx == 0 && "EXTRACT_SUBREG only has one def"); + const MachineOperand &MOReg = MI.getOperand(1); + const MachineOperand &MOSubIdx = MI.getOperand(2); + assert(MOSubIdx.isImm() && + "The subindex of the extract_subreg is not an immediate"); + + InputReg.Reg = MOReg.getReg(); + InputReg.SubReg = MOReg.getSubReg(); + InputReg.SubIdx = (unsigned)MOSubIdx.getImm(); + return true; +} + +bool TargetInstrInfo::getInsertSubregInputs( + const MachineInstr &MI, unsigned DefIdx, + RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const { + assert((MI.isInsertSubreg() || + MI.isInsertSubregLike()) && "Instruction does not have the proper type"); + + if (!MI.isInsertSubreg()) + return getInsertSubregLikeInputs(MI, DefIdx, BaseReg, InsertedReg); + + // We are looking at: + // Def = INSERT_SUBREG v0, v1, sub0. + assert(DefIdx == 0 && "INSERT_SUBREG only has one def"); + const MachineOperand &MOBaseReg = MI.getOperand(1); + const MachineOperand &MOInsertedReg = MI.getOperand(2); + const MachineOperand &MOSubIdx = MI.getOperand(3); + assert(MOSubIdx.isImm() && + "The subindex of the insert_subreg is not an immediate"); + BaseReg.Reg = MOBaseReg.getReg(); + BaseReg.SubReg = MOBaseReg.getSubReg(); + + InsertedReg.Reg = MOInsertedReg.getReg(); + InsertedReg.SubReg = MOInsertedReg.getSubReg(); + InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm(); + return true; +} diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index e80ef7176c21..9b2fdffa87c1 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -34,6 +34,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <cctype> using namespace llvm; @@ -205,6 +206,16 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FLOOR_F80] = "floorl"; Names[RTLIB::FLOOR_F128] = "floorl"; Names[RTLIB::FLOOR_PPCF128] = "floorl"; + Names[RTLIB::FMIN_F32] = "fminf"; + Names[RTLIB::FMIN_F64] = "fmin"; + Names[RTLIB::FMIN_F80] = "fminl"; + Names[RTLIB::FMIN_F128] = "fminl"; + Names[RTLIB::FMIN_PPCF128] = "fminl"; + Names[RTLIB::FMAX_F32] = "fmaxf"; + Names[RTLIB::FMAX_F64] = "fmax"; + Names[RTLIB::FMAX_F80] = "fmaxl"; + Names[RTLIB::FMAX_F128] = "fmaxl"; + Names[RTLIB::FMAX_PPCF128] = "fmaxl"; Names[RTLIB::ROUND_F32] = "roundf"; Names[RTLIB::ROUND_F64] = "round"; Names[RTLIB::ROUND_F80] = "roundl"; @@ -403,7 +414,7 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::SINCOS_PPCF128] = nullptr; } - if (TT.getOS() != Triple::OpenBSD) { + if (!TT.isOSOpenBSD()) { Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail"; } else { // These are generally not available. @@ -683,10 +694,9 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) { CCs[RTLIB::O_F128] = ISD::SETEQ; } -/// NOTE: The constructor takes ownership of TLOF.
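Before the TargetLoweringBase.cpp changes continue, a usage sketch for the getRegSequenceInputs() family added above; the helper below is hypothetical and assumes MI satisfies isRegSequence() || isRegSequenceLike():

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// Print each (Reg:SubReg -> SubIdx) input feeding a REG_SEQUENCE-like def.
static void dumpRegSequenceInputs(const MachineInstr &MI,
                                  const TargetInstrInfo &TII,
                                  const TargetRegisterInfo *TRI) {
  SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> Inputs;
  if (!TII.getRegSequenceInputs(MI, /*DefIdx=*/0, Inputs))
    return;
  for (const TargetInstrInfo::RegSubRegPairAndIdx &In : Inputs)
    dbgs() << PrintReg(In.Reg, TRI, In.SubReg) << " -> subidx " << In.SubIdx
           << '\n';
}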
-TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, - const TargetLoweringObjectFile *tlof) - : TM(tm), DL(TM.getDataLayout()), TLOF(*tlof) { +/// NOTE: The TargetMachine owns TLOF. +TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) + : TM(tm), DL(TM.getSubtargetImpl()->getDataLayout()) { initActions(); // Perform these initializations only once. @@ -700,10 +710,13 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; IntDivIsCheap = false; - Pow2DivIsCheap = false; + FsqrtIsCheap = false; + Pow2SDivIsCheap = false; JumpIsExpensive = false; PredictableSelectIsExpensive = false; MaskAndBranchFoldingIsLegal = false; + EnableExtLdPromotion = false; + HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; ExceptionPointerRegister = 0; ExceptionSelectorRegister = 0; @@ -718,7 +731,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, PrefLoopAlignment = 0; MinStackArgumentAlignment = 1; InsertFencesForAtomic = false; - SupportJumpTables = true; MinimumJumpTableEntries = 4; InitLibcallNames(LibcallRoutineNames, Triple(TM.getTargetTriple())); @@ -726,10 +738,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, InitLibcallCallingConvs(LibcallCallingConvs); } -TargetLoweringBase::~TargetLoweringBase() { - delete &TLOF; -} - void TargetLoweringBase::initActions() { // All operations default to being supported. memset(OpActions, 0, sizeof(OpActions)); @@ -741,35 +749,32 @@ void TargetLoweringBase::initActions() { memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray)); // Set default actions for various operations. - for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) { + for (MVT VT : MVT::all_valuetypes()) { // Default all indexed load / store to expand. for (unsigned IM = (unsigned)ISD::PRE_INC; IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { - setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand); - setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand); + setIndexedLoadAction(IM, VT, Expand); + setIndexedStoreAction(IM, VT, Expand); } // Most backends expect to see the node which just returns the value loaded. - setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, - (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand); // These operations default to expand. - setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FGETSIGN, VT, Expand); + setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); // These library functions default to expand. - setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FROUND, VT, Expand); // These operations default to expand for vector types. 
- if (VT >= MVT::FIRST_VECTOR_VALUETYPE && - VT <= MVT::LAST_VECTOR_VALUETYPE) { - setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, - (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, - (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, - (MVT::SimpleValueType)VT, Expand); + if (VT.isVector()) { + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } } @@ -792,6 +797,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::FEXP , MVT::f16, Expand); setOperationAction(ISD::FEXP2, MVT::f16, Expand); setOperationAction(ISD::FFLOOR, MVT::f16, Expand); + setOperationAction(ISD::FMINNUM, MVT::f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f16, Expand); setOperationAction(ISD::FCEIL, MVT::f16, Expand); setOperationAction(ISD::FRINT, MVT::f16, Expand); @@ -803,6 +810,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::FEXP , MVT::f32, Expand); setOperationAction(ISD::FEXP2, MVT::f32, Expand); setOperationAction(ISD::FFLOOR, MVT::f32, Expand); + setOperationAction(ISD::FMINNUM, MVT::f32, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f32, Expand); setOperationAction(ISD::FCEIL, MVT::f32, Expand); setOperationAction(ISD::FRINT, MVT::f32, Expand); @@ -814,6 +823,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::FEXP , MVT::f64, Expand); setOperationAction(ISD::FEXP2, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); + setOperationAction(ISD::FMINNUM, MVT::f64, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FCEIL, MVT::f64, Expand); setOperationAction(ISD::FRINT, MVT::f64, Expand); @@ -825,6 +836,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::FEXP , MVT::f128, Expand); setOperationAction(ISD::FEXP2, MVT::f128, Expand); setOperationAction(ISD::FFLOOR, MVT::f128, Expand); + setOperationAction(ISD::FMINNUM, MVT::f128, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f128, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f128, Expand); setOperationAction(ISD::FCEIL, MVT::f128, Expand); setOperationAction(ISD::FRINT, MVT::f128, Expand); @@ -976,11 +989,16 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, // Add a new memory operand for this FI. const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - TM.getDataLayout()->getPointerSize(), - MFI.getObjectAlignment(FI)); + + unsigned Flags = MachineMemOperand::MOLoad; + if (MI->getOpcode() == TargetOpcode::STATEPOINT) { + Flags |= MachineMemOperand::MOStore; + Flags |= MachineMemOperand::MOVolatile; + } + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI), Flags, + TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), + MFI.getObjectAlignment(FI)); MIB->addMemOperand(MF, MMO); // Replace the instruction and update the operand index. 
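The initActions() hunks above default FMINNUM/FMAXNUM to Expand for every type, which pairs with the fminf/fmin/fminl libcall names registered earlier. A target with native min/max instructions opts back in from its own TargetLowering constructor; a hypothetical fragment:

// Inside a hypothetical target's TargetLowering constructor: claim native
// support for f32/f64 and let every other type keep the Expand default.
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);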
@@ -996,7 +1014,8 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, /// of the register class for the specified type and its associated "cost". std::pair<const TargetRegisterClass*, uint8_t> TargetLoweringBase::findRepresentativeClass(MVT VT) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; if (!RC) return std::make_pair(RC, 0); @@ -1023,8 +1042,8 @@ TargetLoweringBase::findRepresentativeClass(MVT VT) const { /// computeRegisterProperties - Once all of the register classes are added, /// this allows us to compute derived properties we expose. void TargetLoweringBase::computeRegisterProperties() { - assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE && - "Too many value types for ValueTypeActions to hold!"); + static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE, + "Too many value types for ValueTypeActions to hold!"); // Everything defaults to needing one register. for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) { @@ -1179,8 +1198,12 @@ void TargetLoweringBase::computeRegisterProperties() { TransformToType[i] = MVT::Other; if (PreferredAction == TypeScalarizeVector) ValueTypeActions.setTypeAction(VT, TypeScalarizeVector); - else + else if (PreferredAction == TypeSplitVector) ValueTypeActions.setTypeAction(VT, TypeSplitVector); + else + // Set type action according to the number of elements. + ValueTypeActions.setTypeAction(VT, NElts == 1 ? TypeScalarizeVector + : TypeSplitVector); } else { TransformToType[i] = NVT; ValueTypeActions.setTypeAction(VT, TypeWidenVector); diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f59efa35031c..9f1e06b4725d 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -37,6 +37,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; using namespace dwarf; @@ -72,9 +73,10 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, Flags, SectionKind::getDataRel(), 0, Label->getName()); - unsigned Size = TM.getDataLayout()->getPointerSize(); + unsigned Size = TM.getSubtargetImpl()->getDataLayout()->getPointerSize(); Streamer.SwitchSection(Sec); - Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment()); + Streamer.EmitValueToAlignment( + TM.getSubtargetImpl()->getDataLayout()->getPointerABIAlignment()); Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject); const MCExpr *E = MCConstantExpr::Create(Size, getContext()); Streamer.EmitELFSize(Label, E); @@ -287,7 +289,8 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, // FIXME: this is getting the alignment of the character, not the // alignment of the global! 
unsigned Align = - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)); + TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment( + cast<GlobalVariable>(GV)); const char *SizeSpec = ".rodata.str1."; if (Kind.isMergeable2ByteCString()) @@ -355,44 +358,59 @@ TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, return DataRelROSection; } -const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection( - unsigned Priority, const MCSymbol *KeySym) const { - // The default scheme is .ctor / .dtor, so we have to invert the priority - // numbering. - if (Priority == 65535) - return StaticCtorSection; +static const MCSectionELF *getStaticStructorSection(MCContext &Ctx, + bool UseInitArray, + bool IsCtor, + unsigned Priority, + const MCSymbol *KeySym) { + std::string Name; + unsigned Type; + unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE; + SectionKind Kind = SectionKind::getDataRel(); + StringRef COMDAT = KeySym ? KeySym->getName() : ""; + + if (KeySym) + Flags |= ELF::SHF_GROUP; if (UseInitArray) { - std::string Name = std::string(".init_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); + if (IsCtor) { + Type = ELF::SHT_INIT_ARRAY; + Name = ".init_array"; + } else { + Type = ELF::SHT_FINI_ARRAY; + Name = ".fini_array"; + } + if (Priority != 65535) { + Name += '.'; + Name += utostr(Priority); + } } else { - std::string Name = std::string(".ctors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); + // The default scheme is .ctor / .dtor, so we have to invert the priority + // numbering. + if (IsCtor) + Name = ".ctors"; + else + Name = ".dtors"; + if (Priority != 65535) { + Name += '.'; + Name += utostr(65535 - Priority); + } + Type = ELF::SHT_PROGBITS; } + + return Ctx.getELFSection(Name, Type, Flags, Kind, 0, COMDAT); } -const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection( +const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection( unsigned Priority, const MCSymbol *KeySym) const { - // The default scheme is .ctor / .dtor, so we have to invert the priority - // numbering. 
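The getStaticStructorSection() helper above folds four near-duplicate code paths into one. Its name computation, distilled into a standalone sketch (the real helper also selects the section type, flags, and COMDAT group):

#include <string>

// Section name for a static ctor/dtor of a given priority; 65535 is the
// default-priority sentinel and keeps the bare name.
static std::string structorSectionName(bool UseInitArray, bool IsCtor,
                                       unsigned Priority) {
  std::string Name = UseInitArray ? (IsCtor ? ".init_array" : ".fini_array")
                                  : (IsCtor ? ".ctors" : ".dtors");
  if (Priority != 65535)
    // .ctors/.dtors run in reverse order, so that scheme inverts priorities.
    Name += '.' + std::to_string(UseInitArray ? Priority : 65535 - Priority);
  return Name;
}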
- if (Priority == 65535) - return StaticDtorSection; + return getStaticStructorSection(getContext(), UseInitArray, true, Priority, + KeySym); +} - if (UseInitArray) { - std::string Name = std::string(".fini_array.") + utostr(Priority); - return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY, - ELF::SHF_ALLOC | ELF::SHF_WRITE, - SectionKind::getDataRel()); - } else { - std::string Name = std::string(".dtors.") + utostr(65535 - Priority); - return getContext().getELFSection(Name, ELF::SHT_PROGBITS, - ELF::SHF_ALLOC |ELF::SHF_WRITE, - SectionKind::getDataRel()); - } +const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection( + unsigned Priority, const MCSymbol *KeySym) const { + return getStaticStructorSection(getContext(), UseInitArray, false, Priority, + KeySym); } void @@ -446,14 +464,15 @@ emitModuleFlags(MCStreamer &Streamer, continue; StringRef Key = MFE.Key->getString(); - Value *Val = MFE.Val; + Metadata *Val = MFE.Val; if (Key == "Objective-C Image Info Version") { - VersionVal = cast<ConstantInt>(Val)->getZExtValue(); + VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue(); } else if (Key == "Objective-C Garbage Collection" || Key == "Objective-C GC Only" || - Key == "Objective-C Is Simulated") { - ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue(); + Key == "Objective-C Is Simulated" || + Key == "Objective-C Image Swift Version") { + ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue(); } else if (Key == "Objective-C Image Info Section") { SectionVal = cast<MDString>(Val)->getString(); } else if (Key == "Linker Options") { @@ -554,41 +573,6 @@ const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal( return S; } -bool TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols( - const MCSection &Section) const { - const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - - // Sections holding 1 byte strings are atomized based on the data - // they contain. - // Sections holding 2 byte strings require symbols in order to be - // atomized. - // There is no dedicated section for 4 byte strings. - if (SMO.getKind().isMergeable1ByteCString()) - return false; - - if (SMO.getSegmentName() == "__DATA" && - SMO.getSectionName() == "__cfstring") - return false; - - switch (SMO.getType()) { - default: - return true; - - // These sections are atomized at the element boundaries without using - // symbols. - case MachO::S_4BYTE_LITERALS: - case MachO::S_8BYTE_LITERALS: - case MachO::S_16BYTE_LITERALS: - case MachO::S_LITERAL_POINTERS: - case MachO::S_NON_LAZY_SYMBOL_POINTERS: - case MachO::S_LAZY_SYMBOL_POINTERS: - case MachO::S_MOD_INIT_FUNC_POINTERS: - case MachO::S_MOD_TERM_FUNC_POINTERS: - case MachO::S_INTERPOSING: - return false; - } -} - const MCSection *TargetLoweringObjectFileMachO:: SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang, const TargetMachine &TM) const { @@ -611,17 +595,21 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, // FIXME: Alignment check should be handled by section classifier. if (Kind.isMergeable1ByteCString() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return CStringSection; // Do not put 16-bit arrays in the UString section if they have an // externally visible label, this runs into issues with certain linker // versions. 
if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return UStringSection; - if (Kind.isMergeableConst()) { + // With MachO only variables whose corresponding symbol starts with 'l' or + // 'L' can be merged, so we only try merging GVs with private linkage. + if (GV->hasPrivateLinkage() && Kind.isMergeableConst()) { if (Kind.isMergeableConst4()) return FourByteConstantSection; if (Kind.isMergeableConst8()) @@ -739,7 +727,7 @@ getCOFFSectionFlags(SectionKind K) { COFF::IMAGE_SCN_MEM_EXECUTE | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_CNT_CODE; - else if (K.isBSS ()) + else if (K.isBSS()) Flags |= COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | @@ -749,7 +737,7 @@ getCOFFSectionFlags(SectionKind K) { COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE; - else if (K.isReadOnly()) + else if (K.isReadOnly() || K.isReadOnlyWithRel()) Flags |= COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ; @@ -774,7 +762,7 @@ static const GlobalValue *getComdatGVForCOFF(const GlobalValue *GV) { if (ComdatGV->getComdat() != C) report_fatal_error("Associative COMDAT symbol '" + ComdatGVName + - "' is not a key for it's COMDAT."); + "' is not a key for its COMDAT."); return ComdatGV; } @@ -843,9 +831,9 @@ static const char *getCOFFSectionNameForUniqueGlobal(SectionKind Kind) { return ".bss"; if (Kind.isThreadLocal()) return ".tls$"; - if (Kind.isWriteable()) - return ".data"; - return ".rdata"; + if (Kind.isReadOnly() || Kind.isReadOnlyWithRel()) + return ".rdata"; + return ".data"; } @@ -893,7 +881,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, if (Kind.isThreadLocal()) return TLSDataSection; - if (Kind.isReadOnly()) + if (Kind.isReadOnly() || Kind.isReadOnlyWithRel()) return ReadOnlySection; // Note: we claim that common symbols are put in BSSSection, but they are @@ -924,7 +912,7 @@ emitModuleFlags(MCStreamer &Streamer, i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) { const Module::ModuleFlagEntry &MFE = *i; StringRef Key = MFE.Key->getString(); - Value *Val = MFE.Val; + Metadata *Val = MFE.Val; if (Key == "Linker Options") { LinkerOptions = cast<MDNode>(Val); break; @@ -944,7 +932,7 @@ emitModuleFlags(MCStreamer &Streamer, StringRef Op = MDOption->getString(); // Lead with a space for consistency with our dllexport implementation. std::string Escaped(" "); - if (Op.find(" ") != StringRef::npos) { + if (!Op.startswith("\"") && (Op.find(" ") != StringRef::npos)) { // The PE-COFF spec says args with spaces must be quoted. It doesn't say // how to escape quotes, but it probably uses this algorithm: // http://msdn.microsoft.com/en-us/library/17w5ykft(v=vs.85).aspx @@ -960,29 +948,14 @@ emitModuleFlags(MCStreamer &Streamer, } } -static const MCSection *getAssociativeCOFFSection(MCContext &Ctx, - const MCSection *Sec, - const MCSymbol *KeySym) { - // Return the normal section if we don't have to be associative. - if (!KeySym) - return Sec; - - // Make an associative section with the same name and kind as the normal - // section. 
- const MCSectionCOFF *SecCOFF = cast<MCSectionCOFF>(Sec); - unsigned Characteristics = - SecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT; - return Ctx.getCOFFSection(SecCOFF->getSectionName(), Characteristics, - SecCOFF->getKind(), KeySym->getName(), - COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE); -} - const MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection( unsigned Priority, const MCSymbol *KeySym) const { - return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym); + return getContext().getAssociativeCOFFSection( + cast<MCSectionCOFF>(StaticCtorSection), KeySym); } const MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection( unsigned Priority, const MCSymbol *KeySym) const { - return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym); + return getContext().getAssociativeCOFFSection( + cast<MCSectionCOFF>(StaticDtorSection), KeySym); } diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp index 3ca2017550cf..618d903a0904 100644 --- a/lib/CodeGen/TargetOptionsImpl.cpp +++ b/lib/CodeGen/TargetOptionsImpl.cpp @@ -51,3 +51,10 @@ bool TargetOptions::HonorSignDependentRoundingFPMath() const { StringRef TargetOptions::getTrapFunctionName() const { return TrapFuncName; } + +/// getCFIFuncName - If this returns a non-empty string, then it is the name of +/// the function that gets called on CFI violations in CFI non-enforcing mode +/// (!TargetOptions::CFIEnforcing). +StringRef TargetOptions::getCFIFuncName() const { + return CFIFuncName; +} diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index a3a4fb3f02b9..61a66b623928 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -108,7 +109,7 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const { /// register of the given type, picking the most sub register class of /// the right type that contains this physreg. const TargetRegisterClass * -TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const { +TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { assert(isPhysicalRegister(reg) && "reg must be a physical register"); // Pick the most sub register class of the right type that contains @@ -293,3 +294,11 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, // All clear, tell the register allocator to prefer this register. 
Hints.push_back(Phys); } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void +TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, + const TargetRegisterInfo *TRI) { + dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n"; +} +#endif diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index b0f2ca68884b..ef2dab1287fa 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -16,7 +16,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -157,7 +156,7 @@ unsigned TargetSchedModel::computeOperandLatency( const MachineInstr *UseMI, unsigned UseOperIdx) const { if (!hasInstrSchedModel() && !hasInstrItineraries()) - return TII->defaultDefLatency(&SchedModel, DefMI); + return TII->defaultDefLatency(SchedModel, DefMI); if (hasInstrItineraries()) { int OperLatency = 0; @@ -181,7 +180,7 @@ unsigned TargetSchedModel::computeOperandLatency( // applicable to the InstrItins model. InstrSchedModel should model all // special cases without TII hooks. InstrLatency = std::max(InstrLatency, - TII->defaultDefLatency(&SchedModel, DefMI)); + TII->defaultDefLatency(SchedModel, DefMI)); return InstrLatency; } // hasInstrSchedModel() @@ -222,7 +221,29 @@ unsigned TargetSchedModel::computeOperandLatency( // FIXME: Automatically giving all implicit defs defaultDefLatency is // undesirable. We should only do it for defs that are known to the MC // desc like flags. Truly implicit defs should get 1 cycle latency. - return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI); + return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, DefMI); +} + +unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { + assert(hasInstrSchedModel() && "Only call this function with a SchedModel"); + + unsigned SCIdx = TII->get(Opcode).getSchedClass(); + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SCIdx); + unsigned Latency = 0; + + if (SCDesc->isValid() && !SCDesc->isVariant()) { + for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries; + DefIdx != DefEnd; ++DefIdx) { + // Lookup the definition's write latency in SubtargetInfo. + const MCWriteLatencyEntry *WLEntry = + STI->getWriteLatencyEntry(SCDesc, DefIdx); + Latency = std::max(Latency, capLatency(WLEntry->Cycles)); + } + return Latency; + } + + assert(Latency && "No MI sched latency"); + return 0; } unsigned @@ -248,7 +269,7 @@ TargetSchedModel::computeInstrLatency(const MachineInstr *MI, return Latency; } } - return TII->defaultDefLatency(&SchedModel, MI); + return TII->defaultDefLatency(SchedModel, MI); } unsigned TargetSchedModel:: @@ -268,7 +289,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, // for predicated defs. 
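Backing up briefly to the TargetSchedule.cpp hunk above: the new opcode-only computeInstrLatency() overload asserts hasInstrSchedModel(), so callers have to guard the query. A usage sketch; the 1-cycle fallback is an assumption, not part of the patch:

#include "llvm/CodeGen/TargetSchedule.h"
using namespace llvm;

// Latency by opcode when an MI scheduling model exists; otherwise fall
// back to a conservative default.
static unsigned latencyOf(const TargetSchedModel &SM, unsigned Opcode) {
  return SM.hasInstrSchedModel() ? SM.computeInstrLatency(Opcode) : 1;
}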
unsigned Reg = DefMI->getOperand(DefOperIdx).getReg(); const MachineFunction &MF = *DefMI->getParent()->getParent(); - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(DepMI)) return computeInstrLatency(DefMI); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index f42d47bebda7..e218a8387c53 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -48,6 +48,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "twoaddrinstr" @@ -544,10 +545,21 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, if (ToRegA) { unsigned FromRegB = getMappedReg(regB, SrcRegMap); unsigned FromRegC = getMappedReg(regC, SrcRegMap); - bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI); - bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI); - if (BComp != CComp) - return !BComp && CComp; + bool CompB = FromRegB && regsAreCompatible(FromRegB, ToRegA, TRI); + bool CompC = FromRegC && regsAreCompatible(FromRegC, ToRegA, TRI); + + // Compute if any of the following are true: + // -RegB is not tied to a register and RegC is compatible with RegA. + // -RegB is tied to the wrong physical register, but RegC is. + // -RegB is tied to the wrong physical register, and RegC isn't tied. + if ((!FromRegB && CompC) || (FromRegB && !CompB && (!FromRegC || CompC))) + return true; + // Don't compute if any of the following are true: + // -RegC is not tied to a register and RegB is compatible with RegA. + // -RegC is tied to the wrong physical register, but RegB is. + // -RegC is tied to the wrong physical register, and RegB isn't tied. + if ((!FromRegC && CompB) || (FromRegC && !CompC && (!FromRegB || CompB))) + return false; } // If there is a use of regC between its last def (could be livein) and this @@ -666,7 +678,7 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) { unsigned Reg = DstReg; while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy, NewReg, IsDstPhys)) { - if (IsCopy && !Processed.insert(UseMI)) + if (IsCopy && !Processed.insert(UseMI).second) break; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI); @@ -1503,9 +1515,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const TargetMachine &TM = MF->getTarget(); MRI = &MF->getRegInfo(); - TII = TM.getInstrInfo(); - TRI = TM.getRegisterInfo(); - InstrItins = TM.getInstrItineraryData(); + TII = TM.getSubtargetImpl()->getInstrInfo(); + TRI = TM.getSubtargetImpl()->getRegisterInfo(); + InstrItins = TM.getSubtargetImpl()->getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); LIS = getAnalysisIfAvailable<LiveIntervals>(); AA = &getAnalysis<AliasAnalysis>(); diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index 2e220820b921..7824f9218575 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -64,9 +64,8 @@ bool UnreachableBlockElim::runOnFunction(Function &F) { SmallPtrSet<BasicBlock*, 8> Reachable; // Mark all reachable blocks. 
- for (df_ext_iterator<Function*, SmallPtrSet<BasicBlock*, 8> > I = - df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); I != E; ++I) - /* Mark all reachable blocks */; + for (BasicBlock *BB : depth_first_ext(&F, Reachable)) + (void)BB/* Mark all reachable blocks */; // Loop over all dead blocks, remembering them and deleting all instructions // in them. @@ -125,10 +124,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>(); // Mark all reachable blocks. - for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> > - I = df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); - I != E; ++I) - /* Mark all reachable blocks */; + for (MachineBasicBlock *BB : depth_first_ext(&F, Reachable)) + (void)BB/* Mark all reachable blocks */; // Loop over all dead blocks, remembering them and deleting all instructions // in them. diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 704736f07945..7d3b0cea1c15 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -36,6 +36,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> using namespace llvm; @@ -54,8 +55,8 @@ INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false) bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) { MRI = &mf.getRegInfo(); - TII = mf.getTarget().getInstrInfo(); - TRI = mf.getTarget().getRegisterInfo(); + TII = mf.getSubtarget().getInstrInfo(); + TRI = mf.getSubtarget().getRegisterInfo(); MF = &mf; Virt2PhysMap.clear(); @@ -123,7 +124,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const { if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) { OS << '[' << PrintReg(Reg, TRI) << " -> " << PrintReg(Virt2PhysMap[Reg], TRI) << "] " - << MRI->getRegClass(Reg)->getName() << "\n"; + << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n"; } } @@ -131,7 +132,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) { OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg] - << "] " << MRI->getRegClass(Reg)->getName() << "\n"; + << "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n"; } } OS << '\n'; @@ -205,8 +206,8 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { MF = &fn; TM = &MF->getTarget(); - TRI = TM->getRegisterInfo(); - TII = TM->getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + TII = MF->getSubtarget().getInstrInfo(); MRI = &MF->getRegInfo(); Indexes = &getAnalysis<SlotIndexes>(); LIS = &getAnalysis<LiveIntervals>(); @@ -251,20 +252,41 @@ void VirtRegRewriter::addMBBLiveIns() { unsigned PhysReg = VRM->getPhys(VirtReg); assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); - // Scan the segments of LI. 
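A note on the UnreachableBlockElim cleanup above: the loop body is intentionally empty, since depth_first_ext() exists only to populate the external visited set. The idiom in isolation:

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Populate 'Reachable' with every block reachable from the entry; the
// traversal itself is the entire effect.
static void markReachable(Function &F,
                          SmallPtrSet<BasicBlock *, 8> &Reachable) {
  for (BasicBlock *BB : depth_first_ext(&F, Reachable))
    (void)BB; // nothing to do per block
}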
- for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E; - ++I) { - if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn)) - continue; - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) - if (!LiveIn[i]->isLiveIn(PhysReg)) - LiveIn[i]->addLiveIn(PhysReg); - LiveIn.clear(); + if (LI.hasSubRanges()) { + for (LiveInterval::SubRange &S : LI.subranges()) { + for (const auto &Seg : S.segments) { + if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) + continue; + for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) { + unsigned SubReg = SR.getSubReg(); + unsigned SubRegIndex = SR.getSubRegIndex(); + unsigned SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex); + if ((SubRegLaneMask & S.LaneMask) == 0) + continue; + for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) { + if (!LiveIn[i]->isLiveIn(SubReg)) + LiveIn[i]->addLiveIn(SubReg); + } + } + LiveIn.clear(); + } + } + } else { + // Scan the segments of LI. + for (const auto &Seg : LI.segments) { + if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) + continue; + for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) + if (!LiveIn[i]->isLiveIn(PhysReg)) + LiveIn[i]->addLiveIn(PhysReg); + LiveIn.clear(); + } } } } void VirtRegRewriter::rewrite() { + bool NoSubRegLiveness = !MRI->tracksSubRegLiveness(); SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; @@ -346,7 +368,8 @@ void VirtRegRewriter::rewrite() { // A virtual register kill refers to the whole register, so we may // have to add <imp-use,kill> operands for the super-register. A // partial redef always kills and redefines the super-register. - if (MO.readsReg() && (MO.isDef() || MO.isKill())) + if (NoSubRegLiveness && MO.readsReg() + && (MO.isDef() || MO.isKill())) SuperKills.push_back(PhysReg); if (MO.isDef()) { @@ -357,10 +380,12 @@ void VirtRegRewriter::rewrite() { MO.setIsUndef(false); // Also add implicit defs for the super-register. - if (MO.isDead()) - SuperDeads.push_back(PhysReg); - else - SuperDefs.push_back(PhysReg); + if (NoSubRegLiveness) { + if (MO.isDead()) + SuperDeads.push_back(PhysReg); + else + SuperDefs.push_back(PhysReg); + } } // PhysReg operands cannot have subregister indexes. |
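Finally, on the VirtRegRewriter::addMBBLiveIns() change that closes this section: with sub-register liveness tracking, a physical sub-register becomes a block live-in only when its lane mask overlaps the live subrange's mask. The filter in isolation, as a fragment; PhysReg, TRI, S, and MBB are assumed from the surrounding loop:

// For each sub-register of PhysReg, add it as a live-in only if its lanes
// intersect the subrange S that is live into MBB.
for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) {
  unsigned LaneMask = TRI->getSubRegIndexLaneMask(SR.getSubRegIndex());
  if ((LaneMask & S.LaneMask) == 0)
    continue; // this sub-register carries no live lanes of S
  if (!MBB->isLiveIn(SR.getSubReg()))
    MBB->addLiveIn(SR.getSubReg());
}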