author     Dimitry Andric <dim@FreeBSD.org>    2015-08-07 23:01:33 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2015-08-07 23:01:33 +0000
commit     ee8648bdac07986a0f1ec897b02ec82a2f144d46 (patch)
tree       52d1861acda1205241ee35a94aa63129c604d469 /lib/Target
parent     1a82d4c088707c791c792f6822f611b47a12bdfe (diff)
Diffstat (limited to 'lib/Target')
251 files changed, 12046 insertions, 3562 deletions
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index bffd9e6e8c76..79a84ad8c6c5 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -510,9 +510,17 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, if (J.isRegMask()) AvailableRegs.clearBitsNotInMask(J.getRegMask()); - if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) { - assert(J.isDead() && "Non-dead def should have been removed by now!"); - AvailableRegs.reset(J.getReg()); + if (J.isReg() && J.isDef()) { + MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true); + if (J.isDead()) + for (; AI.isValid(); ++AI) + AvailableRegs.reset(*AI); +#ifndef NDEBUG + else + for (; AI.isValid(); ++AI) + assert(!AvailableRegs[*AI] && + "Non-dead def should have been removed by now!"); +#endif } } } @@ -585,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 4691e949838d..815ebef177d8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -40,6 +40,11 @@ def CC_AArch64_AAPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + // The 'nest' parameter, if any, is passed in X18. + // Darwin uses X18 as the platform register and hence 'nest' isn't currently + // supported there. + CCIfNest<CCAssignToReg<[X18]>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index c19fcdc4bb18..072819836bb3 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -310,7 +310,7 @@ CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { } unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 && "Alloca should always return a pointer."); // Don't handle dynamic allocas. @@ -420,7 +420,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); - EVT DestEVT = TLI.getValueType(GV->getType(), true); + EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) return 0; @@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { } unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -538,13 +538,14 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) } case Instruction::IntToPtr: { // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; } case Instruction::PtrToInt: { // Look past no-op ptrtoints. 
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; } @@ -879,13 +880,13 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && - TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. - if (InMBB && - TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; } @@ -906,7 +907,7 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) @@ -1390,7 +1391,7 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) { Type *Ty = LHS->getType(); - EVT EVT = TLI.getValueType(Ty, true); + EVT EVT = TLI.getValueType(DL, Ty, true); if (!EVT.isSimple()) return false; MVT VT = EVT.getSimpleVT(); @@ -2761,7 +2762,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { if (SrcReg == 0) return false; - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); if (SrcVT == MVT::f128) return false; @@ -2797,7 +2798,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { return false; bool SrcIsKill = hasTrivialKill(I->getOperand(0)); - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); // Handle sign-extension. if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { @@ -2856,7 +2857,7 @@ bool AArch64FastISel::fastLowerArguments() { if (ArgTy->isStructTy() || ArgTy->isArrayTy()) return false; - EVT ArgVT = TLI.getValueType(ArgTy); + EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; @@ -2898,7 +2899,7 @@ bool AArch64FastISel::fastLowerArguments() { unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { - MVT VT = TLI.getSimpleValueType(Arg.getType()); + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); unsigned SrcReg; const TargetRegisterClass *RC; if (VT >= MVT::i1 && VT <= MVT::i32) { @@ -3689,7 +3690,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; @@ -3724,7 +3725,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; @@ -3772,8 +3773,8 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) { Value *Op = I->getOperand(0); Type *SrcTy = Op->getType(); - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); + EVT SrcEVT = TLI.getValueType(DL, SrcTy, true); + EVT DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -4459,7 +4460,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) { } bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); + EVT DestEVT = TLI.getValueType(DL, I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -4825,7 +4826,7 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { bool IdxNIsKill = hasTrivialKill(Idx); // If the index is smaller or larger than intptr_t, truncate or extend it. - MVT PtrVT = TLI.getPointerTy(); + MVT PtrVT = TLI.getPointerTy(DL); EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); if (IdxVT.bitsLT(PtrVT)) { IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); @@ -4849,7 +4850,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { // into a single N = N + TotalOffset. uint64_t TotalOffs = 0; Type *Ty = I->getOperand(0)->getType(); - MVT VT = TLI.getPointerTy(); + MVT VT = TLI.getPointerTy(DL); for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { const Value *Idx = *OI; if (auto *StTy = dyn_cast<StructType>(Ty)) { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3ba7e70a102d..a7817f4f67dd 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -349,12 +349,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Allocate space for the rest of the frame. const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = (Alignment > 16); + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; - if (NeedsRealignment) { - // Use the first callee-saved register as a scratch register - assert(MF.getRegInfo().isPhysRegUsed(AArch64::X9) && - "No scratch register to align SP!"); + if (NumBytes && NeedsRealignment) { + // Use the first callee-saved register as a scratch register. 
scratchSPReg = AArch64::X9; } @@ -366,9 +364,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); - assert(!(NeedsRealignment && NumBytes==0) && - "NumBytes should never be 0 when realignment is needed"); - if (NumBytes && NeedsRealignment) { const unsigned NrBitsToZero = countTrailingZeros(Alignment); assert(NrBitsToZero > 1); @@ -881,28 +876,34 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } -void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { +void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); SmallVector<unsigned, 4> UnspilledCSGPRs; SmallVector<unsigned, 4> UnspilledCSFPRs; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { - MRI->setPhysRegUsed(AArch64::FP); - MRI->setPhysRegUsed(AArch64::LR); + SavedRegs.set(AArch64::FP); + SavedRegs.set(AArch64::LR); } // Spill the BasePtr if it's used. Do this first thing so that the // getCalleeSavedRegs() below will get the right answer. if (RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(RegInfo->getBaseRegister()); + SavedRegs.set(RegInfo->getBaseRegister()); if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(AArch64::X9); + SavedRegs.set(AArch64::X9); // If any callee-saved registers are used, the frame cannot be eliminated. unsigned NumGPRSpilled = 0; @@ -924,8 +925,8 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( AArch64::FPR64RegClass.contains(EvenReg)) && "Register class mismatch!"); - const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); - const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); + const bool OddRegUsed = SavedRegs.test(OddReg); + const bool EvenRegUsed = SavedRegs.test(EvenReg); // Early exit if none of the registers in the register pair is actually // used. @@ -946,7 +947,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( if (OddRegUsed ^ EvenRegUsed) { // Find out which register is the additional spill. Reg = OddRegUsed ? 
EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); + SavedRegs.set(Reg); } DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); @@ -1001,7 +1002,7 @@ void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( UnspilledCSGPRs.pop_back(); DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); + SavedRegs.set(Reg); ExtraCSSpill = true; ++Count; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index b496fccba349..731f031ff855 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -59,8 +59,8 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1ea4abcf05fa..772e894f4f0a 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -610,10 +610,11 @@ static bool isWorthFoldingADDlow(SDValue N) { bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm) { SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); const TargetLowering *TLI = getTargetLowering(); if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); return true; } @@ -628,10 +629,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); Type *Ty = GV->getType()->getElementType(); if (Alignment == 0 && Ty->isSized()) - Alignment = DL->getABITypeAlignment(Ty); + Alignment = DL.getABITypeAlignment(Ty); if (Alignment >= Size) return true; @@ -645,7 +645,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); } OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); return true; @@ -688,7 +688,8 @@ bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64); return true; @@ -1494,7 +1495,7 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, } static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, + unsigned &Immr, unsigned &Imms, bool BiggerPattern) { assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "N must be a SHR/SRA operation to call this function"); @@ -1508,7 
+1509,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, "Type checking must have been done before calling this function"); // Check for AND + SRL doing several bits extract. - if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) return true; // we're looking for a shift of a shift @@ -1548,13 +1549,9 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && "bad amount in shift node!"); - // Note: The width operand is encoded as width-1. - unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; - int sLSB = Srl_imm - Shl_imm; - if (sLSB < 0) - return false; - LSB = sLSB; - MSB = LSB + Width; + int immr = Srl_imm - Shl_imm; + Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; + Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; // SRA requires a signed extraction if (VT == MVT::i32) Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; @@ -1564,7 +1561,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, } static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, - SDValue &Opd0, unsigned &LSB, unsigned &MSB, + SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, bool BiggerPattern = false) { if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) @@ -1576,11 +1573,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, return false; break; case ISD::AND: - return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms, NumberOfIgnoredLowBits, BiggerPattern); case ISD::SRL: case ISD::SRA: - return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); } unsigned NOpc = N->getMachineOpcode(); @@ -1593,8 +1590,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case AArch64::UBFMXri: Opc = NOpc; Opd0 = N->getOperand(0); - LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); return true; } // Unreachable @@ -1602,9 +1599,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, } SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { - unsigned Opc, LSB, MSB; + unsigned Opc, Immr, Imms; SDValue Opd0; - if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) return nullptr; EVT VT = N->getValueType(0); @@ -1613,8 +1610,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { // If the bit extract operation is 64bit but the original type is 32bit, we // need to add one EXTRACT_SUBREG. 
if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { - SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, MVT::i64), - CurDAG->getTargetConstant(MSB, dl, MVT::i64)}; + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64), + CurDAG->getTargetConstant(Imms, dl, MVT::i64)}; SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); @@ -1624,8 +1621,8 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { return Node; } - SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, dl, VT), - CurDAG->getTargetConstant(MSB, dl, VT)}; + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } @@ -2351,7 +2348,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { int FI = cast<FrameIndexSDNode>(Node)->getIndex(); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); const TargetLowering *TLI = getTargetLowering(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); SDLoc DL(Node); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index f3242cdd971d..3e8f46cf1ecd 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -705,7 +705,8 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v4i32); } -EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -774,7 +775,8 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } } -MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { +MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, + EVT) const { return MVT::i64; } @@ -1710,7 +1712,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); @@ -2089,7 +2092,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), + /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) @@ -2111,7 +2115,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. 
- EVT PtrTy = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; @@ -2119,7 +2123,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // case. It should also work for fundamental types too. unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; @@ -2186,7 +2190,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) @@ -2265,6 +2269,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SmallVector<SDValue, 8> MemOps; @@ -2279,7 +2284,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (GPRSaveSize != 0) { GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); @@ -2288,8 +2293,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 8), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, DL, getPointerTy())); + FIN = + DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); @@ -2307,7 +2312,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, if (FPRSaveSize != 0) { FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); @@ -2317,8 +2322,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 16), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, DL, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, + DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); @@ -2614,7 +2619,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + EVT ActualVT = getValueType(DAG.getDataLayout(), + CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? 
ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; @@ -2674,10 +2680,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, true), DL); - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, + getPointerTy(DAG.getDataLayout())); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; @@ -2743,13 +2751,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(FI); // Make sure any stack arguments overlapping with where we're storing @@ -2759,7 +2767,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(LocMemOffset); } @@ -2809,25 +2817,24 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const GlobalValue *GV = G->getGlobal(); bool InternalLinkage = GV->hasInternalLinkage(); if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -2977,7 +2984,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalAddressSDNode *GN = 
cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); @@ -3069,7 +3076,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); - MVT PtrVT = getPointerTy(); + MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue TLVPAddr = @@ -3124,7 +3131,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -3159,7 +3166,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, } SDValue TPOff; - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); @@ -3786,7 +3793,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && @@ -3812,7 +3819,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { @@ -3853,7 +3860,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { @@ -3879,8 +3886,8 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); @@ -3892,6 +3899,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // Standard, section B.3. 
MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); @@ -3900,8 +3908,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SmallVector<SDValue, 4> MemOps; // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), false, false, 8)); @@ -3910,12 +3917,12 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, if (GPRSize > 0) { SDValue GRTop, GRTopAddr; - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, DL, getPointerTy())); + GRTopAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, DL, getPointerTy())); + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); + GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, + DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); @@ -3925,28 +3932,28 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, DL, getPointerTy())); + VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(16, DL, PtrVT)); - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, DL, getPointerTy())); + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); + VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, + DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, DL, getPointerTy())); + SDValue GROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, DL, getPointerTy())); + SDValue VROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, @@ -3987,21 +3994,22 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), + false, false, false, 0); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) 
&& "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, DL, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, DL, getPointerTy())); + VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Align - 1, DL, PtrVT)); + VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the @@ -4016,8 +4024,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, DL, getPointerTy())); + SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); @@ -4057,8 +4065,8 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) .Default(0); @@ -4079,7 +4087,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, DL, getPointerTy()); + SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); @@ -4232,7 +4240,7 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
AArch64TargetLowering::ConstraintType -AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { +AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: @@ -4283,8 +4291,7 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> AArch64TargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -4320,10 +4327,9 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { + int RegNo; + bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); + if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. @@ -6429,6 +6435,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { + auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: @@ -6444,7 +6451,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6470,7 +6477,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6488,7 +6495,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; @@ -6501,7 +6508,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; @@ -6572,7 +6579,8 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return true; const TargetOptions &Options = getTargetMachine().Options; - EVT VT = getValueType(User->getOperand(0)->getType()); + const DataLayout &DL = I->getModule()->getDataLayout(); + EVT VT = getValueType(DL, User->getOperand(0)->getType()); if (isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && @@ -6637,6 +6645,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); + auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()); Type *IdxTy = *GTI; // This extension will end up with a shift because of the scaling factor. @@ -6644,7 +6653,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -6708,10 +6717,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - const DataLayout *DL = getDataLayout(); + const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); // Skip illegal vector types. if (VecSize != 64 && VecSize != 128) @@ -6721,8 +6730,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad( // load integer vectors first and then convert to pointer vectors. 
Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isPointerTy()) - VecTy = VectorType::get(DL->getIntPtrType(EltTy), - VecTy->getVectorNumElements()); + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; @@ -6796,8 +6805,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); - const DataLayout *DL = getDataLayout(); - unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy); + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); // Skip illegal vector types. if (SubVecSize != 64 && SubVecSize != 128) @@ -6810,7 +6819,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { - Type *IntTy = DL->getIntPtrType(EltTy); + Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = dyn_cast<VectorType>(Op0->getType())->getVectorNumElements(); @@ -6894,8 +6903,8 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // AArch64 has five basic addressing modes: // reg @@ -6916,7 +6925,7 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; @@ -6946,8 +6955,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, return false; } -int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty, +int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency @@ -6956,7 +6965,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, <extend> #imm] | - if (isLegalAddressingMode(AM, Ty, AS)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 46298c0e7de1..c73ce1e54b3e 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -233,7 +233,7 @@ public: APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses of the specified type. 
@@ -278,7 +278,8 @@ public: bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; @@ -323,7 +324,7 @@ public: /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// \brief Return the cost of the scaling factor used in the addressing @@ -331,7 +332,7 @@ public: /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty, + int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster @@ -471,9 +472,9 @@ private: std::vector<SDNode *> *Created) const override; bool combineRepeatedFPDivisors(unsigned NumUsers) const override; - ConstraintType - getConstraintType(const std::string &Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -483,14 +484,12 @@ private: std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index b73e0958df90..fa1a46acba84 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -894,6 +894,8 @@ def REVXr : OneXRegData<0b011, "rev", bswap>; def REV32Xr : OneXRegData<0b010, "rev32", UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; +def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>; + // The bswap commutes with the rotr so we want a pattern for both possible // orders. 
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; @@ -5283,18 +5285,23 @@ def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; @@ -5309,12 +5316,16 @@ def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 1836682e386e..841af55f7a65 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,7 +90,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, BitVector AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); // FIXME: avoid re-calculating this every time. 
BitVector Reserved(getNumRegs()); @@ -119,7 +119,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); switch (Reg) { default: @@ -198,11 +198,9 @@ bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = MF.getTarget() - .getSubtargetImpl(*MF.getFunction()) - ->getFrameLowering() - ->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, @@ -213,8 +211,7 @@ AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - + const AArch64FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; } @@ -280,7 +277,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); MachineFrameInfo *MFI = MF.getFrameInfo(); // Estimate an offset from the frame pointer. 
@@ -376,8 +373,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); const AArch64InstrInfo *TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); - const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>( - MF.getSubtarget().getFrameLowering()); + const AArch64FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned FrameReg; @@ -415,7 +411,7 @@ namespace llvm { unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); switch (RC->getID()) { default: diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index b9c53998752a..f40293021d74 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -16,11 +16,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" -AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - -AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} - SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, @@ -37,8 +32,8 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { const AArch64TargetLowering &TLI = *STI.getTargetLowering(); - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 11932d2b1c22..97421b45b122 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -20,8 +20,6 @@ namespace llvm { class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit AArch64SelectionDAGInfo(const DataLayout *DL); - ~AArch64SelectionDAGInfo(); SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 554826b1e08a..486efd6ce3a2 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -49,15 +49,15 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(FS)), - TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} + InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), + TLInfo(TM, *this) {} /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. 
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
- bool isDecl = GV->isDeclarationForLinker();
+ bool isDef = GV->isStrongDefinitionForLinker();

// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
@@ -66,8 +66,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,

// The small code mode's direct accesses use ADRP, which cannot necessarily
// produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small &&
- GV->isWeakForLinker() && isDecl) {
+ if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) {
// In PIC mode use the GOT, but in absolute mode use a constant pool load.
if (TM.getRelocationModel() == Reloc::Static)
return AArch64II::MO_CONSTPOOL;
@@ -85,8 +84,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
// defined could end up in unexpected places. Use a GOT.
if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
if (isTargetMachO())
- return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
- : AArch64II::MO_NO_FLAG;
+ return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
else
// No need to go through the GOT for local symbols on ELF.
return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index fc91c94351cc..e085cca35f1c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -181,8 +181,8 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
@@ -265,7 +265,7 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -289,7 +289,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -364,8 +364,8 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
};

- EVT SelCondTy = TLI->getValueType(CondTy);
- EVT SelValTy = TLI->getValueType(ValTy);
+ EVT SelCondTy = TLI->getValueType(DL, CondTy);
+ EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
int Idx = ConvertCostTableLookup(VectorSelectTbl, ISD,
SelCondTy.getSimpleVT(),
@@ -380,7 +380,7 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isIntegerTy(64)) {
@@ -416,7 +416,7 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);

// ldN/stN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 4dabdadd8eeb..444d3ccc15e1 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -31,7 +31,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
typedef TargetTransformInfo TTI;
friend BaseT;

- const AArch64TargetMachine *TM;
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
@@ -50,30 +49,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
- : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)),
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}

// Provide value semantics. MSVC requires that we spell all of these out.
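Aside: the operator= bodies deleted in the TTI hunks below were pure memberwise assignments. Once the class holds only non-owning pointers (ST, TLI), the compiler-generated assignment operators do exactly the same work; the copy/move constructors stay spelled out only for MSVC's sake, per the comment above. A small illustration with made-up types (not the real TTI classes):

#include <utility>

struct Subtarget {};
struct Lowering {};

// Non-owning pointers only: the implicitly-generated copy/move
// assignments already copy ST and TLI correctly, so hand-written
// operator= bodies like the ones removed here add nothing.
struct TTIImplSketch {
  const Subtarget *ST = nullptr;
  const Lowering *TLI = nullptr;
};

int main() {
  Subtarget S;
  Lowering L;
  TTIImplSketch A{&S, &L};
  TTIImplSketch B;
  B = A;                          // implicit copy assignment
  TTIImplSketch C = std::move(A); // implicit move construction
  return (B.ST == &S && C.TLI == &L) ? 0 : 1;
}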
AArch64TTIImpl(const AArch64TTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST),
- TLI(Arg.TLI) {}
+ : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
AArch64TTIImpl(AArch64TTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)),
- ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {}
- AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- TM = RHS.TM;
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- TM = std::move(RHS.TM);
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }
+ : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
+ TLI(std::move(Arg.TLI)) {}

/// \name Scalar TTI Implementations
/// @{
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 359c2e734e21..db9fb0e775df 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -228,7 +228,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
}

static MCSymbolizer *
-createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
void *DisInfo, MCContext *Ctx,
std::unique_ptr<MCRelocationInfo> &&RelInfo) {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index b5b1d1f9e19c..16d53569b231 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -199,7 +199,7 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
MCTargetStreamer *
createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
- if (TT.getObjectFormat() == Triple::ELF)
+ if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
return nullptr;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 099d1b01c339..9f7bed0d3b12 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -42,16 +42,13 @@ static MCInstrInfo *createAArch64MCInstrInfo() {
static MCSubtargetInfo *
createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo *X = new MCSubtargetInfo();
-
if (CPU.empty())
CPU = "generic";

- InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAArch64MCSubtargetInfoImpl(TT, CPU, FS);
}

-static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
+static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAArch64MCRegisterInfo(X, AArch64::LR);
return X;
@@ -75,11 +72,11 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}

-static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OL) {
- Triple TheTriple(TT);
- assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
"Only expect Darwin and ELF targets");

if (CM == CodeModel::Default)
@@ -94,7 +91,7 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
"Only small and large code models are allowed on AArch64");

// AArch64 Darwin is always PIC.
- if (TheTriple.isOSDarwin())
+ if (TT.isOSDarwin())
RM = Reloc::PIC_;
// On ELF platforms the default static relocation model has a smart enough
// linker to cope with referencing external symbols defined in a shared
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 569ad3844b25..ef8ef6268548 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -98,6 +98,16 @@ def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
"true",
"Enable SI load/store optimizer pass">;

+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
+ "EnableUnsafeDSOffsetFolding",
+ "true",
+ "Force using DS instruction immediate offsets on SI">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 0b426bc63dd5..ad267d350850 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
namespace {

class AMDGPUAlwaysInline : public ModulePass {
-
static char ID;

public:
@@ -36,10 +35,9 @@ public:
char AMDGPUAlwaysInline::ID = 0;

bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+ std::vector<Function *> FuncsToClone;
- std::vector<Function*> FuncsToClone;
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
!F.hasFnAttribute(Attribute::NoInline))
FuncsToClone.push_back(&F);
@@ -49,12 +47,11 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
ValueToValueMapTy VMap;
Function *NewFunc = CloneFunction(F, VMap, false);
NewFunc->setLinkage(GlobalValue::InternalLinkage);
- F->getParent()->getFunctionList().push_back(NewFunc);
+ M.getFunctionList().push_back(NewFunc);
F->replaceAllUsesWith(NewFunc);
}

- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
+ for (Function &F : M) {
if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
F.addFnAttr(Attribute::AlwaysInline);
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index df4461eac4db..37b77d778d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -110,8 +110,11 @@ private:
SDValue &Offset, SDValue &GLC) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3Mods0Clamp(SDValue In,
SDValue &Src, SDValue &SrcMods,
SDValue &Omod) const;
@@ -859,7 +862,8 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;

- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+ Subtarget->unsafeDSOffsetFoldingEnabled())
return true;

// On Southern Islands instruction with a negative base value and an offset
@@ -1316,6 +1320,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
return true;
}

+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ bool Res = SelectVOP3Mods(In, Src, SrcMods);
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
@@ -1327,6 +1337,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}

+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods, SDValue &Clamp,
+ SDValue &Omod) const {
+ bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
+
+ return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
+ cast<ConstantSDNode>(Clamp)->isNullValue() &&
+ cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In,
SDValue &Src,
SDValue &SrcMods,
SDValue &Omod) const {
@@ -1351,18 +1371,14 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
do {
IsModified = false;
// Go over all selected nodes and try to fold them a bit more
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
-
- SDNode *Node = I;
-
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ for (SDNode &Node : CurDAG->allnodes()) {
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
if (!MachineNode)
continue;

SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != Node) {
- ReplaceUses(Node, ResNode);
+ if (ResNode != &Node) {
+ ReplaceUses(&Node, ResNode);
IsModified = true;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d56838ec2019..3a65f3b56146 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

+ setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -444,7 +445,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// Target Information
//===----------------------------------------------------------------------===//

-MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}

@@ -545,9 +546,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
- const DataLayout *DL = getDataLayout();
- unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
- unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+ unsigned SrcSize = Src->getScalarSizeInBits();
+ unsigned DestSize = Dest->getScalarSizeInBits();
return SrcSize == 32 && DestSize == 64;
}

@@ -697,7 +697,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getDataLayout();
+ const DataLayout &TD = DAG.getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -705,20 +705,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}

if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(CFP->getType()));
}

if (StructType *ST = dyn_cast<StructType>(InitTy)) {
- const StructLayout *SL = TD->getStructLayout(ST);
+ const StructLayout *SL = TD.getStructLayout(ST);

EVT PtrVT = InitPtr.getValueType();
SmallVector<SDValue, 8> Chains;
@@ -745,7 +745,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
else
llvm_unreachable("Unexpected type");

- unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+ unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
SmallVector<SDValue, 8> Chains;
for (unsigned i = 0; i < NumElements; ++i) {
SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
@@ -762,8 +762,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)), false,
+ false, TD.getPrefTypeAlignment(InitTy));
}

Init->dump();
@@ -785,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {

- const DataLayout *TD = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
@@ -801,7 +801,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
Offset = MFI->LDSSize;
MFI->LocalMemoryObjects[GV] = Offset;
// XXX: Account for alignment?
@@ -811,16 +811,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}

return DAG.getConstant(Offset, SDLoc(Op),
- getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
+ getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
case AMDGPUAS::CONSTANT_ADDRESS: {
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
Type *EltType = GV->getType()->getElementType();
- unsigned Size = TD->getTypeAllocSize(EltType);
- unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+ unsigned Size = DL.getTypeAllocSize(EltType);
+ unsigned Alignment = DL.getPrefTypeAlignment(EltType);

- MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
@@ -1653,7 +1653,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// fb = fabs(fb);
fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

// int cv = fr >= fb;
SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
@@ -1960,7 +1960,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2020,7 +2021,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

@@ -2051,7 +2053,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
@@ -2081,7 +2084,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const

SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);

@@ -2100,8 +2104,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
@@ -2172,7 +2176,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2411,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}

+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ // i64 (shl x, 32) -> (build_pair 0, x)
+
+ // Doing this with moves theoretically helps MI optimizations that understand
+ // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+ // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+ // doubles the code size and I'm unsure about cycle count.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS || RHS->getZExtValue() != 32)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Extract low 32-bits.
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -2448,17 +2480,24 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc DL(N);

switch(N->getOpcode()) {
- default: break;
- case ISD::MUL:
- return performMulCombine(N, DCI);
- case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24: {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- simplifyI24(N0, DCI);
- simplifyI24(N1, DCI);
- return SDValue();
- }
+ default:
+ break;
+ case ISD::SHL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performShlCombine(N, DCI);
+ }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
+ case AMDGPUISD::MUL_I24:
+ case AMDGPUISD::MUL_U24: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ simplifyI24(N0, DCI);
+ simplifyI24(N1, DCI);
+ return SDValue();
+ }
case ISD::SELECT: {
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
@@ -2644,6 +2683,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getRegister(VirtualRegister, VT);
}

+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+ const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+ uint64_t ArgOffset = MFI->ABIArgOffset;
+ switch (Param) {
+ case GRID_DIM:
+ return ArgOffset;
+ case GRID_OFFSET:
+ return ArgOffset + 4;
+ }
+ llvm_unreachable("unexpected implicit parameter type");
+}
+
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
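Aside: the performShlCombine added above rests on a simple bit-level identity: shifting an i64 left by exactly 32 zeroes the low word and moves the low word of the source into the high word, which is what (build_pair 0, lo32(x)) expresses as two 32-bit moves. A standalone sketch verifying the identity (plain C++, not DAG code; helper name is made up):

#include <cassert>
#include <cstdint>

// (x << 32) on a 64-bit value always has a zero low word, and its high
// word is the low word of x -- the two halves of the build_pair.
static uint64_t shl32AsPair(uint64_t X) {
  const uint32_t Lo = 0;                        // low half of the result
  const uint32_t Hi = static_cast<uint32_t>(X); // truncate x to 32 bits
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  for (uint64_t X : {0ull, 1ull, 0x89abcdefull, 0xffffffffffffffffull})
    assert(shl32AsPair(X) == (X << 32));
}

As the in-diff comment notes, the win is that plain copies are transparent to later machine-level optimizations, whereas a 64-bit shift is opaque.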
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fbb7d3c88437..478b2035fd75 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -65,6 +65,7 @@ private:
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;

protected:
@@ -123,7 +124,7 @@ public:
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

- MVT getVectorIdxTy() const override;
+ MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;

bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -207,6 +208,16 @@ public:
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const;
+
+ enum ImplicitParameter {
+ GRID_DIM,
+ GRID_OFFSET
+ };
+
+ /// \brief Helper function that returns the byte offset of the given
+ /// type of implicit parameter.
+ unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ const ImplicitParameter Param) const;
};

namespace AMDGPUISD {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0779d1d786b2..bd5abc4f546e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -69,6 +69,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+ EnableUnsafeDSOffsetFolding(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 30f50eb1d2f3..90831bfb4458 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ private:
bool EnablePromoteAlloca;
bool EnableIfCvt;
bool EnableLoadStoreOpt;
+ bool EnableUnsafeDSOffsetFolding;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
@@ -222,6 +223,10 @@ public:
return EnableLoadStoreOpt;
}

+ bool unsafeDSOffsetFoldingEnabled() const {
+ return EnableUnsafeDSOffsetFolding;
+ }
+
unsigned getWavefrontSize() const {
return WavefrontSize;
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a911a8efed..2297b52b423c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -156,8 +156,10 @@ public:
} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis(
- [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
+ return TargetIRAnalysis([this](Function &F) {
+ return TargetTransformInfo(
+ AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ });
}

void AMDGPUPassConfig::addIRPasses() {
@@ -269,6 +271,7 @@ void GCNPassConfig::addPreRegAlloc() {
// also need extra copies to the address operand to be eliminated.
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+ insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
addPass(createSIFixSGPRLiveRangesPass(), false);
@@ -280,10 +283,10 @@ void GCNPassConfig::addPostRegAlloc() {
}

void GCNPassConfig::addPreSched2() {
- addPass(createSIInsertWaits(*TM), false);
}

void GCNPassConfig::addPreEmitPass() {
+ addPass(createSIInsertWaits(*TM), false);
addPass(createSILowerControlFlowPass(*TM), false);
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 791c84e6f28b..dee0a69d1e68 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -37,8 +37,9 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUTargetLowering *getTLI() const { return TLI; }

public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
- : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
+ : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
+ TLI(ST->getTargetLowering()) {}

// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
: BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
TLI(std::move(Arg.TLI)) {}
- AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
- BaseT::operator=(static_cast<const BaseT &>(RHS));
- ST = RHS.ST;
- TLI = RHS.TLI;
- return *this;
- }
- AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
- BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
- ST = std::move(RHS.ST);
- TLI = std::move(RHS.TLI);
- return *this;
- }

bool hasBranchDivergence() { return true; }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 7172e4bb9335..c709741f3777 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -44,7 +44,7 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
return X;
}

-static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitAMDGPUMCRegisterInfo(X, 0);
return X;
@@ -52,14 +52,13 @@ static MCSubtargetInfo *
createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- MCSubtargetInfo * X = new MCSubtargetInfo();
- InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
- return X;
+ return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
}

-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL) {
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
+ Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL) {
MCCodeGenInfo *X = new MCCodeGenInfo();
X->initMCCodeGenInfo(RM, CM, OL);
return X;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8357b6d9d0ed..4e4d554f0ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -815,8 +815,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);

- case Intrinsic::AMDGPU_read_workdim:
- return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+ case Intrinsic::AMDGPU_read_workdim: {
+ uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
+ return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
+ }

case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
@@ -897,8 +899,9 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,

for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
- Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
- DAG.getConstant(i, DL, getVectorIdxTy())));
+ Args.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+ DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
}

return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1459,22 +1462,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;

- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- if (Ret.getNode()) {
- SDValue Ops[2] = {
- Ret,
- Chain
- };
- return DAG.getMergeValues(Ops, DL);
- }
+ if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
+ return Ret;

// Lower loads constant address space global variable loads
if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
isa<GlobalVariable>(GetUnderlyingObject(
- LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
+ LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {

- SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
- getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ SDValue Ptr = DAG.getZExtOrTrunc(
+ LoadNode->getBasePtr(), DL,
+ getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, DL, MVT::i32));
return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
@@ -1702,7 +1700,8 @@ SDValue R600TargetLowering::LowerFormalArguments(
return Chain;
}

-EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index c06d3c4fd309..4dbac97af2a1 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -38,7 +38,9 @@ public:
const SmallVectorImpl<ISD::InputArg> &Ins,
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const override;
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index d14e37a64612..c2887255cc11 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,11 +126,42 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}

+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+ const MachineInstr *MI) {
+ for (auto Candidate : FoldList) {
+ if (Candidate.UseMI == MI)
+ return true;
+ }
+ return false;
+}
+
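Aside: isUseMIInFoldList above is a plain linear membership test over the pending fold candidates; before commuting an instruction to legalize a new fold, the pass must verify no earlier fold already targets it, since commuting would silently invalidate that earlier fold. An equivalent formulation with std::any_of, using stand-in types reduced to the one field the check consults:

#include <algorithm>
#include <vector>

struct MachineInstr {};
struct FoldCandidate {
  const MachineInstr *UseMI;
  int OpNo;
};

// Same semantics as the loop in the diff: true if any recorded fold
// already points at MI.
static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
                              const MachineInstr *MI) {
  return std::any_of(FoldList.begin(), FoldList.end(),
                     [MI](const FoldCandidate &C) { return C.UseMI == MI; });
}

int main() {
  MachineInstr A, B;
  std::vector<FoldCandidate> Folds = {{&A, 0}};
  return (isUseMIInFoldList(Folds, &A) && !isUseMIInFoldList(Folds, &B)) ? 0
                                                                         : 1;
}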
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+
+ // Special case for v_mac_f32_e64 if we are trying to fold into src2
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_MAC_F32_e64 &&
+ (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ // Check if changing this to a v_mad_f32 instruction will allow us to
+ // fold the operand.
+ MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
+ bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+ if (FoldAsMAD) {
+ MI->untieRegOperand(OpNo);
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ }
+
+ // If we are already folding into another operand of MI, then
+ // we can't commute the instruction, otherwise we risk making the
+ // other fold illegal.
+ if (isUseMIInFoldList(FoldList, MI))
+ return false;
+
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteIdx0;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ead1a3743473..dd818a9ba746 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,8 +254,9 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
return false;
}

-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
@@ -416,7 +417,7 @@ static EVT toIntegerVT(EVT VT) {
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
- const DataLayout *DL = getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
@@ -425,16 +426,16 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
Type *Ty = VT.getTypeForEVT(*DAG.getContext());

MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
- SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

- unsigned Align = DL->getABITypeAlignment(Ty);
+ unsigned Align = DL.getABITypeAlignment(Ty);

if (VT != MemVT && VT.isFloatingPoint()) {
// Do an integer load and convert.
@@ -451,7 +452,12 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
true, // isNonTemporal
true, // isInvariant
Align); // Alignment
- return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+ SDValue Ops[] = {
+ DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
+ Load.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
@@ -569,6 +575,8 @@ SDValue SITargetLowering::LowerFormalArguments(

AnalyzeFormalArguments(CCInfo, Splits);

+ SmallVector<SDValue, 16> Chains;
+
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

const ISD::InputArg &Arg = Ins[i];
@@ -587,8 +595,9 @@ SDValue SITargetLowering::LowerFormalArguments(
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
Offset, Ins[i].Flags.isSExt());
+ Chains.push_back(Arg.getValue(1));

const PointerType *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
@@ -614,7 +623,8 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
&AMDGPU::SReg_64RegClass);
Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
- InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Copy);
continue;
}

@@ -634,7 +644,9 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned j = 1; j != NumElements; ++j) {
Reg = ArgLocs[ArgIdx++].getLocReg();
Reg = MF.addLiveIn(Reg, RC);
- Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
}

// Fill up the missing vector elements
@@ -653,7 +665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
- return Chain;
+
+ if (Chains.empty())
+ return Chain;
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -695,14 +711,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return true;
}

-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+ EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
return MVT::i32;
}

@@ -888,7 +905,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());

SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
@@ -926,6 +943,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto MFI = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
@@ -964,8 +982,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::AMDGPU_read_workdim:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
- false);
+ getImplicitParameterOffset(MFI, GRID_DIM), false);

case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
@@ -1213,7 +1230,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {

const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

- EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

@@ -1411,7 +1429,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
unsigned AS = Load->getAddressSpace();
unsigned Align = Load->getAlignment();
Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);

// Don't try to replace the load if we have to expand it due to alignment
// problems. Otherwise we will end up scalarizing the load, and trying to
@@ -2212,9 +2230,8 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- const std::string &Constraint_,
+ StringRef Constraint,
MVT VT) const {
- StringRef Constraint(Constraint_);
if (Constraint == "r") {
switch(VT.SimpleTy) {
default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a956b013bdb1..635b4edc89de 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -62,8 +62,8 @@ public:
bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
EVT /*VT*/) const override;

- bool isLegalAddressingMode(const AddrMode &AM,
- Type *Ty, unsigned AS) const override;
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;

bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
@@ -90,8 +90,9 @@ public:
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
- EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
- MVT getScalarShiftAmountTy(EVT VT) const override;
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -114,9 +115,9 @@ public:
SDLoc DL,
SDValue Ptr) const;

- std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI,
- const std::string &Constraint, MVT VT) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
};
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb96bd0227b2..18910615bebe 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -227,9 +227,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
uint8_t Offset0 = Offset0Imm->getImm();
uint8_t Offset1 = Offset1Imm->getImm();
- assert(Offset1 > Offset0);

- if (Offset1 - Offset0 == 1) {
+ if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.

@@ -924,7 +923,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
return false;

unsigned Opc = UseMI->getOpcode();
- if (Opc == AMDGPU::V_MAD_F32) {
+ if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
// Don't fold if we are using source modifiers. The new VOP2 instructions
// don't have them.
if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -963,9 +962,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.

// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));

unsigned Src1Reg = Src1->getReg();
@@ -980,6 +979,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
Src1->setSubReg(Src2SubReg);
Src1->setIsKill(Src2->isKill());

+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
+ AMDGPU::OpName::src2));
+
+ // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);

removeModOperands(*UseMI);
@@ -1010,11 +1017,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
// instead of having to modify in place.

// Remove these first since they are at the end.
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::omod));
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+ UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::clamp));

+ if (Opc == AMDGPU::V_MAC_F32_e64) {
+ UseMI->untieRegOperand(
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+ }
+
+ // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);

// These come before src2.
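Aside: the V_MAC_F32/V_MAD_F32 interplay in this file and the convertToThreeAddress hunk that follows comes down to the algebra of multiply-add: mac is mad with the addend tied to the destination register. A minimal scalar sketch of the equivalence (illustrative helpers only; this ignores the GPU's actual rounding/fusion behavior):

#include <cassert>

// v_mad_f32 dst, a, b, c computes dst = a * b + c with a free addend;
// v_mac_f32 dst, a, b computes dst = a * b + dst (dst doubles as src2).
static float mad(float A, float B, float C) { return A * B + C; }
static float mac(float A, float B, float &Dst) { return Dst = mad(A, B, Dst); }

int main() {
  float Acc = 4.0f;
  float ViaMad = mad(2.0f, 3.0f, Acc); // untied three-address form
  mac(2.0f, 3.0f, Acc);                // tied two-address form updates Acc
  assert(Acc == ViaMad && Acc == 10.0f);
}

This is why untying src2 (or re-tying it) is the only bookkeeping needed when the passes above flip between the two opcodes.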
@@ -1126,6 +1139,38 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
return false;
}

+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const {
+
+ switch (MI->getOpcode()) {
+ default: return nullptr;
+ case AMDGPU::V_MAC_F32_e64: break;
+ case AMDGPU::V_MAC_F32_e32: {
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+ return nullptr;
+ break;
+ }
+ }
+
+ const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
+ const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+
+ return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
+ .addOperand(*Dst)
+ .addImm(0) // Src0 mods
+ .addOperand(*Src0)
+ .addImm(0) // Src1 mods
+ .addOperand(*Src1)
+ .addImm(0) // Src mods
+ .addOperand(*Src2)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+}
+
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
@@ -1625,7 +1670,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,

if (MO->isReg()) {
assert(DefinedRC);
- const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
+ MRI.getRegClass(MO->getReg()) :
+ RI.getPhysRegClass(MO->getReg());

// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0382272068d2..015ea12d4598 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,10 @@ public:
unsigned getMachineCSELookAheadLimit() const override { return 500; }

+ MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+ MachineBasicBlock::iterator &MI,
+ LiveVariables *LV) const override;
+
bool isSALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fcb58d5da3b0..b39a78714640 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -529,9 +529,11 @@ def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;

def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;

//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -1113,6 +1115,13 @@ def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
field string Asm = "$dst, $src0, $vsrc1, $src2";
}
+def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+ let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+ HasModifiers>.ret;
+ let Asm32 = getAsm32<2>.ret;
+ let Asm64 = getAsm64<2, HasModifiers>.ret;
+}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 8c8d836776db..1ee63c675822 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1488,7 +1488,10 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;

-defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
+let Constraints = "$dst = $src2", DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
+}
} // End isCommutable = 1

defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
@@ -2206,6 +2209,15 @@ def : Pat <
(V_CNDMASK_B32_e64 $src2, $src1, $src0)
>;

+// Pattern for V_MAC_F32
+def : Pat <
+ (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3NoMods f32:$src1, i32:$src1_modifiers),
+ (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
+ (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ $src2_modifiers, $src2, $clamp, $omod)
+>;
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9b1d256dc5a8..1bdb1f0ee9f9 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -214,12 +214,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
// cases, like vectors of pointers.
const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

- unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
- unsigned DestReg1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+ const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+ const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
unsigned Offset1
= TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

@@ -258,20 +257,43 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

- updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
- updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);

- LIS->RemoveMachineInstrFromMaps(I);
- // Replacing Paired in the maps with Read2 allows us to avoid updating the
- // live range for the m0 register.
- LIS->ReplaceMachineInstrInMaps(Paired, Read2);
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+ // Copy to the old destination registers.
+ MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ .addOperand(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ LIS->InsertMachineInstrInMaps(Read2);
+
+ // repairLiveintervalsInRange() doesn't handle physical register, so we have
+ // to update the M0 range manually.
+ SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+ LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+ LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+ bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
+
+ // The new write to the original destination register is now the copy. Steal
+ // the old SlotIndex.
+ LIS->ReplaceMachineInstrInMaps(I, Copy0);
+ LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
+
I->eraseFromParent();
Paired->eraseFromParent();

LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);

- LIS->getInterval(DestReg); // Create new LI
+ LIS->createAndComputeVirtRegInterval(DestReg);
+
+ if (UpdateM0Range) {
+ SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+ M0Segment->end = Read2Index.getRegSlot();
+ }

DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Read2.getInstr();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63d6796..d23b92edef33 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,7 +53,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
- MRI.setPhysRegUsed(LaneVGPR);

// Add this register as live-in to all blocks to avoid machine verifer
// complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index 0a7f684552f0..b086d2ed6652 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,7 +91,6 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {

if (ScratchOffsetReg != AMDGPU::NoRegister) {
// Found an SGPR to use
- MRI.setPhysRegUsed(ScratchOffsetReg);
BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
.addReg(ScratchOffsetPreloadReg);
} else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index db2ff0b1f952..ce4acafac9fa 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -499,7 +499,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,

for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
+ if (MRI.reg_nodbg_empty(*I))
return *I;
}
return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 51e72cdb5f9e..5d00bdd6a9bb 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -94,8 +94,20 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// is vcc. We should handle this the same way we handle vopc, by addding
// a register allocation hint pre-regalloc and then do the shrining
// post-regalloc.
- if (Src2) - return false; + if (Src2) { + switch (MI.getOpcode()) { + default: return false; + + case AMDGPU::V_MAC_F32_e64: + if (!isVGPR(Src2, TRI, MRI) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) + return false; + break; + + case AMDGPU::V_CNDMASK_B32_e64: + break; + } + } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src1Mod = @@ -149,7 +161,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return; // Try to fold Src0 - if (Src0.isReg()) { + if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { unsigned Reg = Src0.getReg(); MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { @@ -243,6 +255,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { + // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC + // instructions. + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (!Src2->isReg()) + continue; + unsigned SReg = Src2->getReg(); + if (TargetRegisterInfo::isVirtualRegister(SReg)) { + MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); + continue; + } + if (SReg != AMDGPU::VCC) + continue; + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); @@ -259,6 +287,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) + Inst32.addOperand(*Src2); + ++NumInstructionsShrunk; MI.eraseFromParent(); diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 96b4742da2bb..ef609a66d032 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -150,6 +150,10 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + // ARM ISAs. def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index b1a11d626bda..9f43e732bd73 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1230,8 +1230,7 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { Reloc::Model RM = MF.getTarget().getRelocationModel(); if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { - assert(getSubtarget().getTargetTriple().getObjectFormat() == - Triple::MachO && + assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() && "LOAD_STACK_GUARD currently supported only for MachO."); expandLoadStackGuard(MI, RM); MI->getParent()->erase(MI); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3f79a9b53d70..e7d5be7753e4 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -127,7 +127,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // FIXME: avoid re-calculating this every time. 
BitVector Reserved(getNumRegs()); @@ -194,7 +194,7 @@ unsigned ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); switch (RC->getID()) { default: @@ -302,7 +302,7 @@ ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // When outgoing call frames are so large that we adjust the stack pointer // around the call, we can no longer use the stack pointer to reach the @@ -333,6 +333,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineRegisterInfo *MRI = &MF.getRegInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); // We can't realign the stack if: // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or @@ -347,7 +348,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { return false; // We may also need a base pointer if there are dynamic allocas or stack // pointer adjustments around calls. - if (MF.getSubtarget().getFrameLowering()->hasReservedCallFrame(MF)) + if (TFI->hasReservedCallFrame(MF)) return true; // A base pointer is required and allowed. Check that it isn't too late to // reserve it. @@ -357,9 +358,9 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { bool ARMBaseRegisterInfo:: needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const ARMFrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = - MF.getSubtarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -378,7 +379,7 @@ cannotEliminateFrame(const MachineFunction &MF) const { unsigned ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); - const TargetFrameLowering *TFI = STI.getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) return getFramePointerReg(STI); @@ -517,7 +518,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. 
MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const ARMFrameLowering *TFI = getFrameLowering(MF); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -694,8 +695,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MBB.getParent(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const ARMFrameLowering *TFI = static_cast<const ARMFrameLowering *>( - MF.getSubtarget().getFrameLowering()); + const ARMFrameLowering *TFI = getFrameLowering(MF); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 7dd21ecbe91b..27cf06b995a0 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -142,6 +142,9 @@ def CC_ARM_AAPCS : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<4, 4>>, + // The 'nest' parameter, if any, is passed in R12. + CCIfNest<CCAssignToReg<[R12]>>, + // Handle all vector types as either f64 or v2f64. CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>, diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 4175b4af86e6..fdd0763ea608 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -49,8 +49,6 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -extern cl::opt<bool> EnableARMLongCalls; - namespace { // All possible address modes, plus some. @@ -685,7 +683,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { } unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) return 0; @@ -732,7 +730,7 @@ unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) { } bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) return false; @@ -786,12 +784,13 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { return ARMComputeAddress(U->getOperand(0), Addr); case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return ARMComputeAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. 
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return ARMComputeAddress(U->getOperand(0), Addr); break; case Instruction::GetElementPtr: { @@ -1365,7 +1364,7 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt) { Type *Ty = Src1Value->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); + EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -1557,7 +1556,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { return false; Value *Src = I->getOperand(0); - EVT SrcEVT = TLI.getValueType(Src->getType(), true); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -1750,7 +1749,7 @@ bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) { } bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); // We can get here in the case when we have a binary operation on a non-legal // type and the target independent selector doesn't know how to handle it. @@ -1790,7 +1789,7 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { } bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { - EVT FPVT = TLI.getValueType(I->getType(), true); + EVT FPVT = TLI.getValueType(DL, I->getType(), true); if (!FPVT.isSimple()) return false; MVT VT = FPVT.getSimpleVT(); @@ -2095,7 +2094,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -2122,7 +2121,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; MVT RVVT = RVEVT.getSimpleVT(); MVT DestVT = VA.getValVT(); @@ -2173,7 +2172,7 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { unsigned ARMFastISel::getLibcallReg(const Twine &Name) { // Manually compute the global's type to avoid building it when unnecessary. Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0); - EVT LCREVT = TLI.getValueType(GVTy); + EVT LCREVT = TLI.getValueType(DL, GVTy); if (!LCREVT.isSimple()) return 0; GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false, @@ -2246,19 +2245,19 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { return false; unsigned CalleeReg = 0; - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { CalleeReg = getLibcallReg(TLI.getLibcallName(Call)); if (CalleeReg == 0) return false; } // Issue the call. - unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls); + unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls()); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); // BL / BLX don't take a predicate, but tBL / tBLX do. 
if (isThumb2)
AddDefaultPred(MIB);
- if (EnableARMLongCalls)
+ if (Subtarget->genLongCalls())
MIB.addReg(CalleeReg);
else
MIB.addExternalSymbol(TLI.getLibcallName(Call));
@@ -2380,7 +2379,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
bool UseReg = false;
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV || EnableARMLongCalls) UseReg = true;
+ if (!GV || Subtarget->genLongCalls()) UseReg = true;
unsigned CalleeReg = 0;
if (UseReg) {
@@ -2576,8 +2575,8 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) {
Value *Op = I->getOperand(0);
EVT SrcVT, DestVT;
- SrcVT = TLI.getValueType(Op->getType(), true);
- DestVT = TLI.getValueType(I->getType(), true);
+ SrcVT = TLI.getValueType(DL, Op->getType(), true);
+ DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
return false;
@@ -2742,8 +2741,8 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) {
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
- SrcEVT = TLI.getValueType(SrcTy, true);
- DestEVT = TLI.getValueType(DestTy, true);
+ SrcEVT = TLI.getValueType(DL, SrcTy, true);
+ DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple()) return false;
if (!DestEVT.isSimple()) return false;
@@ -2763,7 +2762,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
return false;
// Only handle i32 now.
- EVT DestVT = TLI.getValueType(I->getType(), true);
+ EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (DestVT != MVT::i32)
return false;
@@ -3026,7 +3025,7 @@ bool ARMFastISel::fastLowerArguments() {
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
- EVT ArgVT = TLI.getValueType(ArgTy);
+ EVT ArgVT = TLI.getValueType(DL, ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
case MVT::i8:
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a52e49780e27..6744000afe2b 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -800,7 +800,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
// This is bad: if an interrupt is taken after the mov, sp is in an
// inconsistent state.
// Use the first callee-saved register as a scratch register.
- assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+ assert(!MFI->getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
ARMCC::AL, 0, TII);
@@ -1470,7 +1470,8 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
// callee-saved vector registers after realigning the stack. The vst1 and vld1
// instructions take alignment hints that can improve performance.
//
-static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
if (!SpillAlignedNEONRegs)
return;
@@ -1497,10 +1498,9 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
// callee-saved registers in order, but it can happen that there are holes in
// the range. Registers above the hole will be spilled to the standard DPRCS
// area.
- MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumSpills = 0;
for (; NumSpills < 8; ++NumSpills)
- if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
+ if (!SavedRegs.test(ARM::D8 + NumSpills))
break;
// Don't do this for just one d-register. It's not worth it.
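The hunk that follows is the heart of the ARM side of this patch: processFunctionBeforeCalleeSavedScan() is replaced by the determineCalleeSaves() hook, and every MRI.setPhysRegUsed(Reg) becomes SavedRegs.set(Reg) on the BitVector that PEI now passes in. A minimal sketch of the new override shape, using a hypothetical MyTarget purely for illustration:

void MyTargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 RegScavenger *RS) const {
  // The base implementation marks the callee-saved registers that are
  // actually clobbered in MF; the target then forces any extra saves.
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  if (needsEmergencyScratchReg(MF)) // hypothetical target-specific check
    SavedRegs.set(MyTarget::R4);    // e.g. keep a scratch register spilled
}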
@@ -1511,12 +1511,13 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
// A scratch register is required for the vst1 / vld1 instructions.
- MF.getRegInfo().setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
-void
-ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
// This tells PEI to spill the FP as if it is any other callee-save register
// to take advantage of the eliminateFrameIndex machinery. This also ensures it
// is spilled in the order specified by getCalleeSavedRegs() to make it easier
@@ -1543,12 +1544,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
if (AFI->isThumb2Function() &&
(MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if the Thumb1 function uses variable-length argument lists.
if (AFI->getArgRegsSaveSize() > 0)
- MRI.setPhysRegUsed(ARM::LR);
+ SavedRegs.set(ARM::LR);
// Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
// for sure what the stack size will be, but for this, an estimate is good
@@ -1558,23 +1559,23 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
unsigned StackSize = MFI->estimateStackSize(MF);
if (MFI->hasVarSizedObjects() || StackSize > 508)
- MRI.setPhysRegUsed(ARM::R4);
+ SavedRegs.set(ARM::R4);
}
// See if we can spill vector registers to aligned stack.
- checkNumAlignedDPRCS2Regs(MF);
+ checkNumAlignedDPRCS2Regs(MF, SavedRegs);
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
- MRI.setPhysRegUsed(RegInfo->getBaseRegister());
+ SavedRegs.set(RegInfo->getBaseRegister());
// Don't spill FP if the frame can be eliminated. This is determined
- // by scanning the callee-save registers to see if any is used.
+ // by scanning the callee-save registers to see if any is modified.
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
bool Spilled = false;
- if (MRI.isPhysRegUsed(Reg)) {
+ if (SavedRegs.test(Reg)) {
Spilled = true;
CanEliminateFrame = false;
}
@@ -1668,7 +1669,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled,
// spill LR as well so we can fold BX_RET into the registers restore (LDM).
if (!LRSpilled && CS1Spilled) { - MRI.setPhysRegUsed(ARM::LR); + SavedRegs.set(ARM::LR); NumGPRSpills++; SmallVectorImpl<unsigned>::iterator LRPos; LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), @@ -1681,7 +1682,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (hasFP(MF)) { - MRI.setPhysRegUsed(FramePtr); + SavedRegs.set(FramePtr); auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), FramePtr); if (FPPos != UnspilledCS1GPRs.end()) @@ -1700,7 +1701,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill high register if the function is thumb if (!AFI->isThumbFunction() || isARMLowRegister(Reg) || Reg == ARM::LR) { - MRI.setPhysRegUsed(Reg); + SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) ExtraCSSpill = true; break; @@ -1708,7 +1709,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) { unsigned Reg = UnspilledCS2GPRs.front(); - MRI.setPhysRegUsed(Reg); + SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) ExtraCSSpill = true; } @@ -1747,7 +1748,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (Extras.size() && NumExtras == 0) { for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MRI.setPhysRegUsed(Extras[i]); + SavedRegs.set(Extras[i]); } } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot @@ -1761,7 +1762,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (ForceLRSpill) { - MRI.setPhysRegUsed(ARM::LR); + SavedRegs.set(ARM::LR); AFI->setLRIsSpilledForFarJump(true); } } diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index d763d17a506f..6fdc5eff5e47 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -54,8 +54,8 @@ public: unsigned &FrameReg, int SPAdj) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; void adjustForSegmentedStacks(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 50afb192b331..b110628a0a86 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -533,7 +533,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. 
int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -556,7 +557,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -702,7 +704,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { Base = N.getOperand(0); @@ -722,7 +725,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -900,7 +904,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N), @@ -915,7 +920,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -964,7 +970,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N; if (N.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { Base = N.getOperand(0); @@ -981,7 +988,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } ARM_AM::AddrOpc AddSub = ARM_AM::add; @@ -1215,7 +1223,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, MachineFrameInfo *MFI = MF->getFrameInfo(); if (MFI->getObjectAlignment(FI) < 4) MFI->setObjectAlignment(FI, 4); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); 
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -1237,7 +1246,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, MachineFrameInfo *MFI = MF->getFrameInfo(); if (MFI->getObjectAlignment(FI) < 4) MFI->setObjectAlignment(FI, 4); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1285,7 +1295,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, if (N.getOpcode() == ISD::FrameIndex) { // Match frame index. int FI = cast<FrameIndexSDNode>(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); return true; } @@ -1314,7 +1325,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1343,7 +1355,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); return true; @@ -1438,7 +1451,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base, Base = N.getOperand(0); if (Base.getOpcode() == ISD::FrameIndex) { int FI = cast<FrameIndexSDNode>(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); } OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32); @@ -2510,7 +2524,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (UseCP) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), - TLI->getPointerTy()); + TLI->getPointerTy(CurDAG->getDataLayout())); SDNode *ResNode; if (Subtarget->isThumb()) { @@ -2540,7 +2554,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case ISD::FrameIndex: { // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. 
int FI = cast<FrameIndexSDNode>(N)->getIndex(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); if (Subtarget->isThumb1Only()) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 4b2105b7442f..e335784f6d87 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -60,11 +60,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); -cl::opt<bool> -EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions"), - cl::init(false)); - static cl::opt<bool> ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), @@ -548,6 +543,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // NEON does not have single instruction CTTZ for vectors. + setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + // NEON only has FMA instructions as of VFP4. 
if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); @@ -1149,8 +1165,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return getPointerTy(); +EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } @@ -1429,7 +1447,8 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1453,7 +1472,8 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, + getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, @@ -1526,7 +1546,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector<SDValue, 8> MemOpChains; @@ -1607,7 +1628,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); @@ -1628,12 +1650,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (Flags.getByValSize() > 4*offset) { + auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, @@ -1693,8 +1715,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = false; bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + auto PtrVt = getPointerTy(DAG.getDataLayout()); - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { assert((Subtarget->isTargetWindows() || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); @@ -1709,12 +1732,11 @@ 
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); @@ -1724,29 +1746,28 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); isDirect = true; - bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); - bool isStub = (isExt && Subtarget->isTargetMachO()) && + bool isDef = GV->isStrongDefinitionForLinker(); + bool isStub = (!isDef && Subtarget->isTargetMachO()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); + isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); - Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(), - DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), - 0, ARMII::MO_NONLAZY)); - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && @@ -1754,20 +1775,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned TargetFlags = GV->hasDLLImportStorageClass() ? 
ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0, - TargetFlags); + Callee = + DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(), - Callee), MachinePointerInfo::getGOT(), - false, false, false, 0); + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(), false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { isDirect = true; @@ -1781,22 +1802,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - Callee = DAG.getNode(ARMISD::PIC_ADD, dl, - getPointerTy(), Callee, PICLabel); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { unsigned OpFlags = 0; // On ELF targets for PIC code, direct calls should go through the PLT if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); } } @@ -2433,7 +2452,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; @@ -2462,7 +2481,7 @@ SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -2508,7 +2527,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -2574,7 +2593,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { @@ -2617,7 +2636,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2648,7 +2667,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); @@ -2672,7 +2691,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = @@ -2716,14 +2735,14 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; unsigned PCAdj = (RelocM != Reloc::PIC_) @@ -2820,7 +2839,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2850,7 +2869,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. 
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -2904,8 +2923,9 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, if (REnd != RBegin) ArgOffset = -4 * (ARM::R4 - RBegin); + auto PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); - SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); SmallVector<SDValue, 4> MemOps; const TargetRegisterClass *RC = @@ -2918,8 +2938,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i), false, false, 0); MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, dl, getPointerTy())); + FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } if (!MemOps.empty()) @@ -3013,6 +3032,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); + auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3035,7 +3055,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -3122,7 +3142,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, CurByValIndex, VA.getLocMemOffset(), Flags.getByValSize()); - InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); + InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); @@ -3130,7 +3150,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, FIOffset, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); @@ -3855,7 +3875,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Index = Op.getOperand(2); SDLoc dl(Op); - EVT PTy = getPointerTy(); + EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); @@ -4102,8 +4122,8 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
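The vector CTTZ lowering introduced in LowerCTTZ() below rests on the identity its comments cite: x & -x isolates the lowest set bit, and popcounting (x & -x) - 1 counts the zeros beneath it. A scalar model of the trick (my own illustration, not code from this patch):

#include <cassert>
#include <cstdint>

// cttz(x) == popcount((x & -x) - 1) for x != 0: x & -x keeps only the
// lowest set bit of x, and subtracting 1 turns that bit into a mask of
// exactly cttz(x) one-bits.
static unsigned cttzViaPopcount(uint32_t X) {
  uint32_t LSB = X & (0u - X); // the same X & -X the DAG nodes compute
  return __builtin_popcount(LSB - 1);
}

int main() {
  for (uint32_t X = 1; X < (1u << 20); ++X)
    assert(cttzViaPopcount(X) == unsigned(__builtin_ctz(X)));
  return 0;
}

The i8 and i64 element cases take exactly this ctpop route (vcnt.8 plus vpaddl chains), while i16/i32 under CTTZ_ZERO_UNDEF can instead use (width - 1) - ctlz(lsb), since NEON has a native vector CLZ.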
-unsigned ARMTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", ARM::SP) .Default(0); @@ -4163,7 +4183,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; - if (TLI.isBigEndian() && SrcVT.isVector() && + if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), @@ -4283,8 +4303,82 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. + SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } if (!ST->hasV6T2Ops()) return SDValue(); @@ -4730,7 +4824,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getTargetLoweringInfo().isBigEndian()) + if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); @@ -5868,7 +5962,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; - unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); @@ -6013,7 +6107,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); - unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } @@ -6342,18 +6436,19 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); // Create stack object for sret. - const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); - const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); + auto &DL = DAG.getDataLayout(); + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; ArgListEntry Entry; @@ -6373,7 +6468,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) @@ -6387,7 +6482,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(), false, false, false, 0); // Address of cos field. 
- SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo(), false, false, false, 0); @@ -6487,7 +6582,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); @@ -6845,9 +6941,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6935,9 +7031,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -7313,9 +7409,9 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb1) @@ -8001,7 +8097,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, // Build operand list. SmallVector<SDValue, 8> Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, - TLI.getPointerTy())); + TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); @@ -8681,7 +8777,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); - if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; @@ -9312,7 +9408,9 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = TLI.isBigEndian() ? 
(i+1) * SizeRatio - 1 : i * SizeRatio; + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() + ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); @@ -9339,8 +9437,8 @@ static SDValue PerformSTORECombine(SDNode *N, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, DL, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); SDValue BasePtr = St->getBasePtr(); // Perform one or more big stores into memory. @@ -9367,7 +9465,7 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; - bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); + bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, @@ -10078,7 +10176,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses - if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; @@ -10317,10 +10415,10 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { - EVT VT = getValueType(Ty, true); + EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; @@ -10664,7 +10762,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
 ARMTargetLowering::ConstraintType
-ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+ARMTargetLowering::getConstraintType(StringRef Constraint) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default:  break;
@@ -10723,10 +10821,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
 }

 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
-RCPair
-ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                                const std::string &Constraint,
-                                                MVT VT) const {
+RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC ARM Constraint Letters
     switch (Constraint[0]) {
@@ -10974,7 +11070,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   }

   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
-                                         getPointerTy());
+                                         getPointerTy(DAG.getDataLayout()));

   Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
@@ -11083,7 +11179,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vld4lane: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
@@ -11103,12 +11200,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::arm_neon_vst4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     unsigned NumElts = 0;
     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+      NumElts += DL.getTypeAllocSize(ArgTy) / 8;
     }
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(0);
@@ -11122,12 +11220,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
   case Intrinsic::arm_ldaex:
   case Intrinsic::arm_ldrex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.vol = true;
     Info.readMem = true;
     Info.writeMem = false;
@@ -11135,12 +11234,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   }
   case Intrinsic::arm_stlex:
   case Intrinsic::arm_strex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.vol = true;
     Info.readMem = false;
     Info.writeMem = true;
@@ -11427,9 +11527,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   VectorType *VecTy = Shuffles[0]->getType();
   Type *EltTy = VecTy->getVectorElementType();

-  const DataLayout *DL = getDataLayout();
-  unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
-  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;

   // Skip illegal vector types and vector types of i64/f64 element (vldN
   // doesn't support i64/f64 element).
@@ -11439,8 +11539,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
-    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
-                            VecTy->getVectorNumElements());
+    VecTy =
+        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                             Intrinsic::arm_neon_vld3,
@@ -11517,9 +11617,9 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   Type *EltTy = VecTy->getVectorElementType();
   VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);

-  const DataLayout *DL = getDataLayout();
-  unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
-  bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;

   // Skip illegal sub vector types and vector types of i64/f64 element (vstN
   // doesn't support i64/f64 element).
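The hunks above all make the same substitution: ARMTargetLowering no longer reads a cached pointer through TargetLowering::getDataLayout(), it fetches the DataLayout from whatever IR or codegen object is at hand. A minimal sketch of the three lookup routes this patch uses, assuming an LLVM tree of roughly this vintage (the dlFrom* helper names are illustrative, not part of the patch):

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // All three routes end at the same module-level DataLayout; pick the one
    // reachable from the context you are in.
    static const DataLayout &dlFromIR(const LoadInst &LI) {
      return LI.getModule()->getDataLayout();  // IR level, e.g. lowerInterleavedLoad
    }
    static const DataLayout &dlFromMF(const MachineFunction &MF) {
      return MF.getDataLayout();               // MachineFunction passes
    }
    static const DataLayout &dlFromDAG(const SelectionDAG &DAG) {
      return DAG.getDataLayout();              // DAG combines and lowering
    }

Calls such as TLI.getPointerTy(DAG.getDataLayout()) and getValueType(DL, Ty, true) in the hunks above are the same change at the API surface: the DataLayout becomes an explicit argument instead of state owned by the target.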
@@ -11533,7 +11633,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
   // vectors to integer vectors.
   if (EltTy->isPointerTy()) {
-    Type *IntTy = DL->getIntPtrType(EltTy);
+    Type *IntTy = DL.getIntPtrType(EltTy);

     // Convert to the corresponding integer vector.
     Type *IntVecTy =
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 74396392f8e3..efc9020c193a 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -249,7 +249,8 @@ namespace llvm {
     }

     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
-    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;

     MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -286,8 +287,8 @@ namespace llvm {

     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
-                               unsigned AS) const override;
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
     bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;

     /// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -324,8 +325,7 @@ namespace llvm {

     bool ExpandInlineAsm(CallInst *CI) const override;

-    ConstraintType
-      getConstraintType(const std::string &Constraint) const override;
+    ConstraintType getConstraintType(StringRef Constraint) const override;

     /// Examine constraint string and operand type and determine a weight value.
     /// The operand object must already have been set up with the operand type.
@@ -334,8 +334,7 @@ namespace llvm {

     std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                 const std::string &Constraint,
-                                 MVT VT) const override;
+                                 StringRef Constraint, MVT VT) const override;

     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -345,8 +344,8 @@ namespace llvm {
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const override;

-    unsigned getInlineAsmMemConstraint(
-        const std::string &ConstraintCode) const override {
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
       if (ConstraintCode == "Q")
         return InlineAsm::Constraint_Q;
       else if (ConstraintCode.size() == 2) {
@@ -533,7 +532,8 @@ namespace llvm {
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

-    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;

     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index b8cac135baf6..61c45af26fe1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -306,8 +306,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
 def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;

-def IsLE : Predicate<"getTargetLowering()->isLittleEndian()">;
-def IsBE : Predicate<"getTargetLowering()->isBigEndian()">;
+def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;

 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 245c9e869bf6..37352810c99f 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -31,11 +31,13 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -65,12 +67,18 @@ namespace {
     static char ID;
     ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

+    const MachineFunction *MF;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
     const ARMSubtarget *STI;
     const TargetLowering *TL;
     ARMFunctionInfo *AFI;
-    RegScavenger *RS;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
     bool isThumb1, isThumb2;

     bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -80,64 +88,60 @@ namespace {
     }

   private:
+    /// A set of load/store MachineInstrs with same base register sorted by
+    /// offset.
     struct MemOpQueueEntry {
-      int Offset;
-      unsigned Reg;
-      bool isKill;
-      unsigned Position;
-      MachineBasicBlock::iterator MBBI;
-      bool Merged;
-      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
-                      MachineBasicBlock::iterator i)
-        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Position; ///< Position as counted from end of basic block.
+      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
+        : MI(MI), Offset(Offset), Position(Position) {}
     };
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
-    typedef MemOpQueue::iterator MemOpQueueIter;

-    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
-                          const MemOpQueue &MemOps, unsigned DefReg,
-                          unsigned RangeBegin, unsigned RangeEnd);
+    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+    /// merged into a LDM/STM.
+    struct MergeCandidate {
+      /// List of instructions ordered by load/store offset.
+      SmallVector<MachineInstr*, 4> Instrs;
+      /// Index in Instrs of the instruction being latest in the schedule.
+      unsigned LatestMIIdx;
+      /// Index in Instrs of the instruction being earliest in the schedule.
+      unsigned EarliestMIIdx;
+      /// Index into the basic block where the merged instruction will be
+      /// inserted. (See MemOpQueueEntry.Position)
+      unsigned InsertPos;
+      /// Whether the instructions can be merged into a ldm/stm instruction.
+      bool CanMergeToLSMulti;
+      /// Whether the instructions can be merged into a ldrd/strd instruction.
+      bool CanMergeToLSDouble;
+    };
+    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
+    SmallVector<const MergeCandidate*,4> Candidates;
+    SmallVector<MachineInstr*,4> MergeBaseCandidates;
+
+    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                            MachineBasicBlock::const_iterator Before);
+    unsigned findFreeReg(const TargetRegisterClass &RegClass);
     void UpdateBaseRegUses(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI,
-                           DebugLoc dl, unsigned Base, unsigned WordOffset,
+                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                            ARMCC::CondCodes Pred, unsigned PredReg);
-    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                  int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
-                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
-                  DebugLoc dl,
-                  ArrayRef<std::pair<unsigned, bool> > Regs,
-                  ArrayRef<unsigned> ImpDefs);
-    void MergeOpsUpdate(MachineBasicBlock &MBB,
-                        MemOpQueue &MemOps,
-                        unsigned memOpsBegin,
-                        unsigned memOpsEnd,
-                        unsigned insertAfter,
-                        int Offset,
-                        unsigned Base,
-                        bool BaseKill,
-                        unsigned Opcode,
-                        ARMCC::CondCodes Pred,
-                        unsigned PredReg,
-                        unsigned Scratch,
-                        DebugLoc dl,
-                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
-                      unsigned Opcode, unsigned Size,
-                      ARMCC::CondCodes Pred, unsigned PredReg,
-                      unsigned Scratch, MemOpQueue &MemOps,
-                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
+    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
+    void FormCandidates(const MemOpQueue &MemOps);
+    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI);
-    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator MBBI,
-                                  const TargetInstrInfo *TII,
-                                  bool &Advance,
-                                  MachineBasicBlock::iterator &I);
-    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   bool &Advance,
-                                   MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
   };
@@ -185,6 +189,14 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
   return Offset;
 }

+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+  return MI.getOperand(1);
+}
+
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+  return MI.getOperand(0);
+}
+
 static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
   switch (Opcode) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -348,6 +360,10 @@ static bool isi32Store(unsigned Opc) {
   return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
 }

+static bool isLoadSingle(unsigned Opc) {
+  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+}
+
 static unsigned getImmScale(unsigned Opc) {
   switch (Opc) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -365,12 +381,55 @@ static unsigned getImmScale(unsigned Opc) {
   }
 }

+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
 /// Update future uses of the base register with the offset introduced
 /// due to writeback. This function only works on Thumb1.
 void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
-                                        DebugLoc dl, unsigned Base,
+                                        DebugLoc DL, unsigned Base,
                                         unsigned WordOffset,
                                         ARMCC::CondCodes Pred, unsigned PredReg) {
   assert(isThumb1 && "Can only update base register uses for Thumb1!");
@@ -398,7 +457,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
       Offset = MO.getImm() - WordOffset * getImmScale(Opc);

       // If storing the base register, it needs to be reset first.
-      unsigned InstrSrcReg = MBBI->getOperand(0).getReg();
+      unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

       if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
         MO.setImm(Offset);
@@ -439,7 +498,7 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
     if (InsertSub) {
       // An instruction above couldn't be updated, so insert a sub.
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
         .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
       return;
     }
@@ -457,31 +516,65 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
     // See PR21029.
     if (MBBI != MBB.end()) --MBBI;
     AddDefaultT1CC(
-      BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
       .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
   }
 }

+/// Return the first register of class \p RegClass that is not in \p Regs.
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+  if (!RegClassInfoValid) {
+    RegClassInfo.runOnMachineFunction(*MF);
+    RegClassInfoValid = true;
+  }
+
+  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+    if (!LiveRegs.contains(Reg))
+      return Reg;
+  return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal
+/// schedule direction). Computes backwards so multiple queries in the same
+/// block must come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                                         MachineBasicBlock::const_iterator Before) {
+  // Initialize if we never queried in this block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(TRI);
+    LiveRegs.addLiveOuts(&MBB, true);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+  // Move backward just before the "Before" position.
+  while (LiveRegPos != Before) {
+    --LiveRegPos;
+    LiveRegs.stepBackward(*LiveRegPos);
+  }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (const std::pair<unsigned, bool> &R : Regs)
+    if (R.first == Reg)
+      return true;
+  return false;
+}
+
 /// Create and insert a LDM or STM with Base as base register and registers in
 /// Regs as the register operands that would be loaded / stored.  It returns
 /// true if the transformation is done.
-bool
-ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          int Offset, unsigned Base, bool BaseKill,
-                          unsigned Opcode, ARMCC::CondCodes Pred,
-                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
-                          ArrayRef<std::pair<unsigned, bool> > Regs,
-                          ArrayRef<unsigned> ImpDefs) {
-  // Only a single register to load / store. Don't bother.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
   unsigned NumRegs = Regs.size();
-  if (NumRegs <= 1)
-    return false;
+  assert(NumRegs > 1);

   // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
   // Compute liveness information for that register to make the decision.
   bool SafeToClobberCPSR = !isThumb1 ||
-    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) ==
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
      MachineBasicBlock::LQR_Dead);

   bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
@@ -489,17 +582,14 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   // Exception: If the base register is in the input reglist, Thumb1 LDM is
   // non-writeback.
   // It's also not possible to merge an STR of the base register in Thumb1.
-  if (isThumb1)
-    for (const std::pair<unsigned, bool> &R : Regs)
-      if (Base == R.first) {
-        assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
-        if (Opcode == ARM::tLDRi) {
-          Writeback = false;
-          break;
-        } else if (Opcode == ARM::tSTRi) {
-          return false;
-        }
-      }
+  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+    if (Opcode == ARM::tLDRi) {
+      Writeback = false;
+    } else if (Opcode == ARM::tSTRi) {
+      return nullptr;
+    }
+  }

   ARM_AM::AMSubMode Mode = ARM_AM::ia;
   // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
@@ -516,18 +606,18 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
     // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
-    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

     // If starting offset isn't zero, insert a MI to materialize a new base.
     // But only do so if it is cost effective, i.e. merging more than two
     // loads / stores.
     if (NumRegs <= 2)
-      return false;
+      return nullptr;

     // On Thumb1, it's not worth materializing a new base register without
     // clobbering the CPSR (i.e. not using ADDS/SUBS).
     if (!SafeToClobberCPSR)
-      return false;
+      return nullptr;

     unsigned NewBase;
     if (isi32Load(Opcode)) {
@@ -535,10 +625,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       // use as the new base.
       NewBase = Regs[NumRegs-1].first;
     } else {
-      // Use the scratch register to use as a new base.
-      NewBase = Scratch;
+      // Find a free register that we can use as scratch register.
+      moveLiveRegsBefore(MBB, InsertBefore);
+      // The merged instruction does not exist yet but will use several Regs if
+      // it is a Store.
+      if (!isLoadSingle(Opcode))
+        for (const std::pair<unsigned, bool> &R : Regs)
+          LiveRegs.addReg(R.first);
+
+      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
       if (NewBase == 0)
-        return false;
+        return nullptr;
     }

     int BaseOpc =
@@ -557,7 +654,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,

     if (!TL->isLegalAddImmediate(Offset))
       // FIXME:  Try add with register operand?
-      return false; // Probably not worth it then.
+      return nullptr; // Probably not worth it then.
+
+    // We can only append a kill flag to the add/sub input if the value is not
+    // used in the register list of the stm as well.
+    bool KillOldBase = BaseKill &&
+      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

     if (isThumb1) {
       // Thumb1: depending on immediate size, use either
@@ -572,43 +674,44 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
           !STI->hasV6Ops()) {
         // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
         if (Pred != ARMCC::AL)
-          return false;
-        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVSr), NewBase)
-          .addReg(Base, getKillRegState(BaseKill));
+          return nullptr;
+        BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase));
       } else
-        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
-          .addReg(Base, getKillRegState(BaseKill))
+        BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase))
           .addImm(Pred).addReg(PredReg);

-      // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
+      // The following ADDS/SUBS becomes an update.
       Base = NewBase;
-      BaseKill = false;
+      KillOldBase = true;
     }

     if (BaseOpc == ARM::tADDrSPi) {
       assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
-      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset/4)
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
         .addImm(Pred).addReg(PredReg);
     } else
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      AddDefaultT1CC(
+        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
         .addImm(Pred).addReg(PredReg);
     } else {
-      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
         .addImm(Pred).addReg(PredReg).addReg(0);
     }
     Base = NewBase;
     BaseKill = true; // New base is always killed straight away.
   }

-  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
-                Opcode == ARM::VLDRD);
+  bool isDef = isLoadSingle(Opcode);

   // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
   // base register writeback.
   Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
-  if (!Opcode) return false;
+  if (!Opcode)
+    return nullptr;

   // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
   // - There is no writeback (LDM of base register),
@@ -619,7 +722,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   // It's safe to return here since the code to materialize a new base register
   // above is also conditional on SafeToClobberCPSR.
   if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
-    return false;
+    return nullptr;

   MachineInstrBuilder MIB;

@@ -628,7 +731,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       // Update tLDMIA with writeback if necessary.
       Opcode = ARM::tLDMIA_UPD;

-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

     // Thumb1: we might need to set base writeback when building the MI.
     MIB.addReg(Base, getDefRegState(true))
@@ -637,381 +740,257 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
     // The base isn't dead after a merged instruction with writeback.
     // Insert a sub instruction after the newly formed instruction to reset.
     if (!BaseKill)
-      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

   } else {
     // No writeback, simply build the MachineInstr.
-    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
     MIB.addReg(Base, getKillRegState(BaseKill));
   }

   MIB.addImm(Pred).addReg(PredReg);

   for (const std::pair<unsigned, bool> &R : Regs)
-    MIB = MIB.addReg(R.first, getDefRegState(isDef)
-                     | getKillRegState(R.second));
+    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

-  // Add implicit defs for super-registers.
-  for (unsigned ImpDef : ImpDefs)
-    MIB.addReg(ImpDef, RegState::ImplicitDefine);
-
-  return true;
+  return MIB.getInstr();
 }

-/// Find all instructions using a given imp-def within a range.
-///
-/// We are trying to combine a range of instructions, one of which (located at
-/// position RangeBegin) implicitly defines a register. The final LDM/STM will
-/// be placed at RangeEnd, and so any uses of this definition between RangeStart
-/// and RangeEnd must be modified to use an undefined value.
-///
-/// The live range continues until we find a second definition or one of the
-/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
-/// we must consider all uses and decide which are relevant in a second pass.
-void ARMLoadStoreOpt::findUsesOfImpDef(
-    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
-    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
-  std::map<unsigned, MachineOperand *> Uses;
-  unsigned LastLivePos = RangeEnd;
-
-  // First we find all uses of this register with Position between RangeBegin
-  // and RangeEnd, any or all of these could be uses of a definition at
-  // RangeBegin. We also record the latest position a definition at RangeBegin
-  // would be considered live.
-  for (unsigned i = 0; i < MemOps.size(); ++i) {
-    MachineInstr &MI = *MemOps[i].MBBI;
-    unsigned MIPosition = MemOps[i].Position;
-    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
-      continue;
-
-    // If this instruction defines the register, then any later use will be of
-    // that definition rather than ours.
-    if (MI.definesRegister(DefReg))
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
-    if (!UseOp)
-      continue;
-
-    // If this instruction kills the register then (assuming liveness is
-    // correct when we start) we don't need to think about anything after here.
-    if (UseOp->isKill())
-      LastLivePos = std::min(LastLivePos, MIPosition);
-
-    Uses[MIPosition] = UseOp;
-  }
-
-  // Now we traverse the list of all uses, and append the ones that actually use
-  // our definition to the requested list.
-  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
-                                                      E = Uses.end();
-       I != E; ++I) {
-    // List is sorted by position so once we've found one out of range there
-    // will be no more to consider.
-    if (I->first > LastLivePos)
-      break;
-    UsesOfImpDefs.push_back(I->second);
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+  bool IsLoad = isi32Load(Opcode);
+  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+  assert(Regs.size() == 2);
+  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+                                    TII->get(LoadStoreOpcode));
+  if (IsLoad) {
+    MIB.addReg(Regs[0].first, RegState::Define)
+       .addReg(Regs[1].first, RegState::Define);
+  } else {
+    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
   }
+  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  return MIB.getInstr();
 }

 /// Call MergeOps and update MemOps and merges accordingly on success.
-void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
-                                     MemOpQueue &memOps,
-                                     unsigned memOpsBegin, unsigned memOpsEnd,
-                                     unsigned insertAfter, int Offset,
-                                     unsigned Base, bool BaseKill,
-                                     unsigned Opcode,
-                                     ARMCC::CondCodes Pred, unsigned PredReg,
-                                     unsigned Scratch,
-                                     DebugLoc dl,
-                                     SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
-  // First calculate which of the registers should be killed by the merged
-  // instruction.
-  const unsigned insertPos = memOps[insertAfter].Position;
-  SmallSet<unsigned, 4> KilledRegs;
-  DenseMap<unsigned, unsigned> Killer;
-  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
-    if (i == memOpsBegin) {
-      i = memOpsEnd;
-      if (i == e)
-        break;
-    }
-    if (memOps[i].Position < insertPos && memOps[i].isKill) {
-      unsigned Reg = memOps[i].Reg;
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    unsigned Reg = MO.getReg();
+    bool IsKill = MO.isKill();
+    if (IsKill)
       KilledRegs.insert(Reg);
-      Killer[Reg] = i;
+    Regs.push_back(std::make_pair(Reg, IsKill));
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers, after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // remove these implicit operands.
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef() || MO.isDead())
+          continue;
+        assert(MO.isImplicit());
+        unsigned DefReg = MO.getReg();
+
+        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
+          continue;
+        // We can ignore cases where the super-reg is read and written.
+        if (MI->readsRegister(DefReg))
+          continue;
+        ImpDefs.push_back(DefReg);
+      }
     }
   }

-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    MachineOperand &TransferOp = memOps[i].MBBI->getOperand(0);
-    if (TransferOp.isUse() && TransferOp.getReg() == Base)
-      BaseKill = false;
+  // Attempt the merge.
+  typedef MachineBasicBlock::iterator iterator;
+  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+  iterator InsertBefore = std::next(iterator(LatestMI));
+  MachineBasicBlock &MBB = *LatestMI->getParent();
+  unsigned Offset = getMemoryOpOffset(First);
+  unsigned Base = getLoadStoreBaseOp(*First).getReg();
+  bool BaseKill = LatestMI->killsRegister(Base);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
+  DebugLoc DL = First->getDebugLoc();
+  MachineInstr *Merged = nullptr;
+  if (Cand.CanMergeToLSDouble)
+    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+                                   Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged && Cand.CanMergeToLSMulti)
+    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+                                  Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged)
+    return nullptr;
+
+  // Determine earliest instruction that will get removed. We then keep an
+  // iterator just above it so the following erases don't invalidate it.
+  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+  bool EarliestAtBegin = false;
+  if (EarliestI == MBB.begin()) {
+    EarliestAtBegin = true;
+  } else {
+    EarliestI = std::prev(EarliestI);
   }

-  SmallVector<std::pair<unsigned, bool>, 8> Regs;
-  SmallVector<unsigned, 8> ImpDefs;
-  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    unsigned Reg = memOps[i].Reg;
-    // If we are inserting the merged operation after an operation that
-    // uses the same register, make sure to transfer any kill flag.
-    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
-    Regs.push_back(std::make_pair(Reg, isKill));
-
-    // Collect any implicit defs of super-registers. They must be preserved.
-    for (const MachineOperand &MO : memOps[i].MBBI->operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead())
-        continue;
-      unsigned DefReg = MO.getReg();
-      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
-        ImpDefs.push_back(DefReg);
-
-      // There may be other uses of the definition between this instruction and
-      // the eventual LDM/STM position. These should be marked undef if the
-      // merge takes place.
-      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
-                       insertPos);
+  // Remove instructions which have been merged.
+  for (MachineInstr *MI : Cand.Instrs)
+    MBB.erase(MI);
+
+  // Determine range between the earliest removed instruction and the new one.
+  if (EarliestAtBegin)
+    EarliestI = MBB.begin();
+  else
+    EarliestI = std::next(EarliestI);
+  auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+  if (isLoadSingle(Opcode)) {
+    // If the previous loads defined a super-reg, then we have to mark earlier
+    // operands undef; replicate the super-reg def on the merged instruction.
+    for (MachineInstr &MI : FixupRange) {
+      for (unsigned &ImpDefReg : ImpDefs) {
+        for (MachineOperand &MO : MI.implicit_operands()) {
+          if (!MO.isReg() || MO.getReg() != ImpDefReg)
+            continue;
+          if (MO.readsReg())
+            MO.setIsUndef();
+          else if (MO.isDef())
+            ImpDefReg = 0;
+        }
+      }
     }
-  }

-  // Try to do the merge.
-  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
-  ++Loc;
-  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
-                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
-    return;
-
-  // Merge succeeded, update records.
-  Merges.push_back(std::prev(Loc));
-
-  // In gathering loads together, we may have moved the imp-def of a register
-  // past one of its uses. This is OK, since we know better than the rest of
-  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
-  // affected uses.
-  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
-                                                   E = UsesOfImpDefs.end();
-       I != E; ++I)
-    (*I)->setIsUndef();
-
-  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
-    // Remove kill flags from any memops that come before insertPos.
-    if (Regs[i-memOpsBegin].second) {
-      unsigned Reg = Regs[i-memOpsBegin].first;
-      if (KilledRegs.count(Reg)) {
-        unsigned j = Killer[Reg];
-        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
-        assert(Idx >= 0 && "Cannot find killing operand");
-        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
-        memOps[j].isKill = false;
+    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+    for (unsigned ImpDef : ImpDefs)
+      MIB.addReg(ImpDef, RegState::ImplicitDefine);
+  } else {
+    // Remove kill flags: We are possibly storing the values later now.
+    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+    for (MachineInstr &MI : FixupRange) {
+      for (MachineOperand &MO : MI.uses()) {
+        if (!MO.isReg() || !MO.isKill())
+          continue;
+        if (KilledRegs.count(MO.getReg()))
+          MO.setIsKill(false);
       }
-      memOps[i].isKill = true;
     }
-    MBB.erase(memOps[i].MBBI);
-    // Update this memop to refer to the merged instruction.
-    // We may need to move kill flags again.
-    memOps[i].Merged = true;
-    memOps[i].MBBI = Merges.back();
-    memOps[i].Position = insertPos;
+    assert(ImpDefs.empty());
   }

-  // Update memOps offsets, since they may have been modified by MergeOps.
-  for (auto &MemOp : memOps) {
-    MemOp.Offset = getMemoryOpOffset(MemOp.MBBI);
-  }
+  return Merged;
 }

-/// Merge a number of load / store instructions into one or more load / store
-/// multiple instructions.
-void
-ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
-                              unsigned Base, unsigned Opcode, unsigned Size,
-                              ARMCC::CondCodes Pred, unsigned PredReg,
-                              unsigned Scratch, MemOpQueue &MemOps,
-                              SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
-  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
-  int Offset = MemOps[SIndex].Offset;
-  int SOffset = Offset;
-  unsigned insertAfter = SIndex;
-  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
-  DebugLoc dl = Loc->getDebugLoc();
-  const MachineOperand &PMO = Loc->getOperand(0);
-  unsigned PReg = PMO.getReg();
-  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
-  unsigned Count = 1;
-  unsigned Limit = ~0U;
-  bool BaseKill = false;
-  // vldm / vstm limit are 32 for S variants, 16 for D variants.
+static bool isValidLSDoubleOffset(int Offset) {
+  unsigned Value = abs(Offset);
+  // t2LDRDi8/t2STRDi8 support an 8-bit immediate which is internally
+  // multiplied by 4.
+  return (Value % 4) == 0 && Value < 1024;
+}
-  switch (Opcode) {
-  default: break;
-  case ARM::VSTRS:
-    Limit = 32;
-    break;
-  case ARM::VSTRD:
-    Limit = 16;
-    break;
-  case ARM::VLDRD:
-    Limit = 16;
-    break;
-  case ARM::VLDRS:
-    Limit = 32;
-    break;
-  }
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+  const MachineInstr *FirstMI = MemOps[0].MI;
+  unsigned Opcode = FirstMI->getOpcode();
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+  unsigned SIndex = 0;
+  unsigned EIndex = MemOps.size();
+  do {
+    // Look at the first instruction.
+    const MachineInstr *MI = MemOps[SIndex].MI;
+    int Offset = MemOps[SIndex].Offset;
+    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+    unsigned PReg = PMO.getReg();
+    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+    unsigned Latest = SIndex;
+    unsigned Earliest = SIndex;
+    unsigned Count = 1;
+    bool CanMergeToLSDouble =
+      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    if (STI->isCortexM3() && isi32Load(Opcode) &&
+        PReg == getLoadStoreBaseOp(*MI).getReg())
+      CanMergeToLSDouble = false;
+
+    bool CanMergeToLSMulti = true;
+    // On Swift, avoid vldm/vstm starting with an odd register number, as that
+    // needs more uops than single vldrs.
+    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
+      CanMergeToLSMulti = false;
+
+    // Merge following instructions where possible.
+    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+      int NewOffset = MemOps[I].Offset;
+      if (NewOffset != Offset + (int)Size)
+        break;
+      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+      unsigned Reg = MO.getReg();
+      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+
+      // See if the current load/store may be part of a multi load/store.
+      bool PartOfLSMulti = CanMergeToLSMulti;
+      if (PartOfLSMulti) {
+        // Cannot load from SP
+        if (Reg == ARM::SP)
+          PartOfLSMulti = false;
+        // Register numbers must be in ascending order.
+        else if (RegNum <= PRegNum)
+          PartOfLSMulti = false;
+        // For VFP / NEON load/store multiples, the registers must be
+        // consecutive and within the limit on the number of registers per
+        // instruction.
+        else if (!isNotVFP && RegNum != PRegNum+1)
+          PartOfLSMulti = false;
+      }
+      // See if the current load/store may be part of a double load/store.
+      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;
-  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
-    int NewOffset = MemOps[i].Offset;
-    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
-    unsigned Reg = MO.getReg();
-    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
-    // Register numbers must be in ascending order. For VFP / NEON load and
-    // store multiples, the registers must also be consecutive and within the
-    // limit on the number of registers per instruction.
-    if (Reg != ARM::SP &&
-        NewOffset == Offset + (int)Size &&
-        ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1)) &&
-        // On Swift we don't want vldm/vstm to start with a odd register num
-        // because Q register unaligned vldm/vstm need more uops.
-        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+      if (!PartOfLSMulti && !PartOfLSDouble)
+        break;
+      CanMergeToLSMulti &= PartOfLSMulti;
+      CanMergeToLSDouble &= PartOfLSDouble;
+      // Track MemOp with latest and earliest position (Positions are
+      // counted in reverse).
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
       Offset += Size;
       PRegNum = RegNum;
-      ++Count;
-    } else {
-      // Can't merge this in. Try merge the earlier ones first.
-      // We need to compute BaseKill here because the MemOps may have been
-      // reordered.
-      BaseKill = Loc->killsRegister(Base);
-
-      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
-                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
-                   MemOps, Merges);
-      return;
     }
-    if (MemOps[i].Position > MemOps[insertAfter].Position) {
-      insertAfter = i;
-      Loc = MemOps[i].MBBI;
-    }
-  }
-
-  BaseKill = Loc->killsRegister(Base);
-  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
-                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-}
-
-static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
-                                unsigned Bytes, unsigned Limit,
-                                ARMCC::CondCodes Pred, unsigned PredReg) {
-  unsigned MyPredReg = 0;
-  if (!MI)
-    return false;
-
-  bool CheckCPSRDef = false;
-  switch (MI->getOpcode()) {
-  default: return false;
-  case ARM::tSUBi8:
-  case ARM::t2SUBri:
-  case ARM::SUBri:
-    CheckCPSRDef = true;
-    break;
-  case ARM::tSUBspi:
-    break;
-  }
-
-  // Make sure the offset fits in 8 bits.
-  if (Bytes == 0 || (Limit && Bytes >= Limit))
-    return false;
-
-  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
-                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
-  if (!(MI->getOperand(0).getReg() == Base &&
-        MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm() * Scale) == Bytes &&
-        getInstrPredicate(MI, MyPredReg) == Pred &&
-        MyPredReg == PredReg))
-    return false;
-
-  return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
-                                unsigned Bytes, unsigned Limit,
-                                ARMCC::CondCodes Pred, unsigned PredReg) {
-  unsigned MyPredReg = 0;
-  if (!MI)
-    return false;
-
-  bool CheckCPSRDef = false;
-  switch (MI->getOpcode()) {
-  default: return false;
-  case ARM::tADDi8:
-  case ARM::t2ADDri:
-  case ARM::ADDri:
-    CheckCPSRDef = true;
-    break;
-  case ARM::tADDspi:
-    break;
-  }
-
-  if (Bytes == 0 || (Limit && Bytes >= Limit))
-    // Make sure the offset fits in 8 bits.
-    return false;
-
-  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
-                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
-  if (!(MI->getOperand(0).getReg() == Base &&
-        MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm() * Scale) == Bytes &&
-        getInstrPredicate(MI, MyPredReg) == Pred &&
-        MyPredReg == PredReg))
-    return false;
-
-  return CheckCPSRDef ? !definesCPSR(MI) : true;
-}
-
-static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
-  switch (MI->getOpcode()) {
-  default: return 0;
-  case ARM::LDRi12:
-  case ARM::STRi12:
-  case ARM::tLDRi:
-  case ARM::tSTRi:
-  case ARM::tLDRspi:
-  case ARM::tSTRspi:
-  case ARM::t2LDRi8:
-  case ARM::t2LDRi12:
-  case ARM::t2STRi8:
-  case ARM::t2STRi12:
-  case ARM::VLDRS:
-  case ARM::VSTRS:
-    return 4;
-  case ARM::VLDRD:
-  case ARM::VSTRD:
-    return 8;
-  case ARM::LDMIA:
-  case ARM::LDMDA:
-  case ARM::LDMDB:
-  case ARM::LDMIB:
-  case ARM::STMIA:
-  case ARM::STMDA:
-  case ARM::STMDB:
-  case ARM::STMIB:
-  case ARM::tLDMIA:
-  case ARM::tLDMIA_UPD:
-  case ARM::tSTMIA_UPD:
-  case ARM::t2LDMIA:
-  case ARM::t2LDMDB:
-  case ARM::t2STMIA:
-  case ARM::t2STMDB:
-  case ARM::VLDMSIA:
-  case ARM::VSTMSIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
-  case ARM::VLDMDIA:
-  case ARM::VSTMDIA:
-    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
-  }
+    // Form a candidate from the Ops collected so far.
+    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+      Candidate->Instrs.push_back(MemOps[C].MI);
+    Candidate->LatestMIIdx = Latest - SIndex;
+    Candidate->EarliestMIIdx = Earliest - SIndex;
+    Candidate->InsertPos = MemOps[Latest].Position;
+    if (Count == 1)
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
+    Candidates.push_back(Candidate);
+    // Continue after the chain.
+    SIndex += Count;
+  } while (SIndex < EIndex);
 }

 static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
@@ -1081,6 +1060,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
   }
 }

+/// Check if the given instruction increments or decrements a register and
+/// returns the amount it is incremented/decremented. Returns 0 if the CPSR
+/// flags generated by the instruction are possibly read as well.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+                                  ARMCC::CondCodes Pred, unsigned PredReg) {
+  bool CheckCPSRDef;
+  int Scale;
+  switch (MI.getOpcode()) {
+  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
+  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
+  case ARM::t2SUBri:
+  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
+  case ARM::t2ADDri:
+  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
+  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
+  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+  default: return 0;
+  }
+
+  unsigned MIPredReg;
+  if (MI.getOperand(0).getReg() != Reg ||
+      MI.getOperand(1).getReg() != Reg ||
+      getInstrPredicate(&MI, MIPredReg) != Pred ||
+      MIPredReg != PredReg)
+    return 0;
+
+  if (CheckCPSRDef && definesCPSR(&MI))
+    return 0;
+  return MI.getOperand(2).getImm() * Scale;
+}
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (MBBI == BeginMBBI)
+    return EndMBBI;
+
+  // Skip debug values.
+  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+    --PrevMBBI;
+
+  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for an increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+  // Skip debug values.
+  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+    ++NextMBBI;
+  if (NextMBBI == EndMBBI)
+    return EndMBBI;
+
+  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
 /// Fold preceding/trailing inc/dec of base register into the
 /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
 ///
@@ -1093,21 +1141,17 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
 /// ldmia rn, <ra, rb, rc>
 /// =>
 /// ldmdb rn!, <ra, rb, rc>
-bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
   // Thumb1 is already using updating loads/stores.
   if (isThumb1) return false;

-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(0).getReg();
-  bool BaseKill = MI->getOperand(0).isKill();
-  unsigned Bytes = getLSMultipleTransferSize(MI);
+  const MachineOperand &BaseOP = MI->getOperand(0);
+  unsigned Base = BaseOP.getReg();
+  bool BaseKill = BaseOP.isKill();
   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();

   // Can't use an updating ld/st if the base register is also a dest
   // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
@@ -1115,55 +1159,27 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     if (MI->getOperand(i).getReg() == Base)
       return false;

-  bool DoMerge = false;
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
   ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
-
-  // Try merging with the previous instruction.
-  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
-  if (MBBI != BeginMBBI) {
-    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
-    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
-      --PrevMBBI;
-    if (Mode == ARM_AM::ia &&
-        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      Mode = ARM_AM::db;
-      DoMerge = true;
-    } else if (Mode == ARM_AM::ib &&
-               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      Mode = ARM_AM::da;
-      DoMerge = true;
-    }
-    if (DoMerge)
-      MBB.erase(PrevMBBI);
-  }
-
-  // Try merging with the next instruction.
-  MachineBasicBlock::iterator EndMBBI = MBB.end();
-  if (!DoMerge && MBBI != EndMBBI) {
-    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
-    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
-      ++NextMBBI;
-    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
-        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      DoMerge = true;
-    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
-               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
-      MBB.erase(NextMBBI);
-    }
+  if (Mode == ARM_AM::ia && Offset == -Bytes) {
+    Mode = ARM_AM::db;
+  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+    Mode = ARM_AM::da;
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
+      return false;
   }
-
-  if (!DoMerge)
-    return false;
+  MBB.erase(MergeInstr);

   unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
     .addReg(Base, getDefRegState(true)) // WB base register
     .addReg(Base, getKillRegState(BaseKill))
     .addImm(Pred).addReg(PredReg);
@@ -1231,21 +1247,15 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,

 /// Fold preceding/trailing inc/dec of base register into the
 /// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
-bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MBBI,
-                                               const TargetInstrInfo *TII,
-                                               bool &Advance,
-                                               MachineBasicBlock::iterator &I) {
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
   // Thumb1 doesn't have updating LDR/STR.
   // FIXME: Use LDM/STM with single register instead.
   if (isThumb1) return false;

-  MachineInstr *MI = MBBI;
-  unsigned Base = MI->getOperand(1).getReg();
-  bool BaseKill = MI->getOperand(1).isKill();
-  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
   unsigned Opcode = MI->getOpcode();
-  DebugLoc dl = MI->getDebugLoc();
+  DebugLoc DL = MI->getDebugLoc();
   bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
   bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
@@ -1255,7 +1265,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
     return false;

-  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
   // Can't do the merge if the destination register is the same as the would-be
   // writeback register.
   if (MI->getOperand(0).getReg() == Base)
@@ -1263,64 +1272,38 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,

   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
-  bool DoMerge = false;
-  ARM_AM::AddrOpc AddSub = ARM_AM::add;
-  unsigned NewOpc = 0;
-  // AM2 - 12 bits, thumb2 - 8 bits.
-  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
-
-  // Try merging with the previous instruction.
-  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
-  if (MBBI != BeginMBBI) {
-    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
-    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
-      --PrevMBBI;
-    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
-      DoMerge = true;
-      AddSub = ARM_AM::sub;
-    } else if (!isAM5 &&
-               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
-      MBB.erase(PrevMBBI);
-    }
-  }
-
-  // Try merging with the next instruction.
-  MachineBasicBlock::iterator EndMBBI = MBB.end();
-  if (!DoMerge && MBBI != EndMBBI) {
-    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
-    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
-      ++NextMBBI;
-    if (!isAM5 &&
-        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
-      DoMerge = true;
-      AddSub = ARM_AM::sub;
-    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
-      DoMerge = true;
-    }
-    if (DoMerge) {
-      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
-      if (NextMBBI == I) {
-        Advance = true;
-        ++I;
-      }
-      MBB.erase(NextMBBI);
-    }
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  unsigned NewOpc;
+  if (!isAM5 && Offset == Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+  } else if (Offset == -Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (Offset == Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+    } else if (!isAM5 && Offset == -Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+    } else
+      return false;
   }
+  MBB.erase(MergeInstr);

-  if (!DoMerge)
-    return false;
+  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+  bool isLd = isLoadSingle(Opcode);

   if (isAM5) {
     // VLDM[SD]_UPD, VSTM[SD]_UPD
     // (There are no base-updating versions of VLDR/VSTR instructions, but the
     // updating load/store-multiple instructions can be used with only one
     // register.)
     MachineOperand &MO = MI->getOperand(0);
-    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
       .addReg(Base, getDefRegState(true)) // WB base register
       .addReg(Base, getKillRegState(isLd ? BaseKill : false))
       .addImm(Pred).addReg(PredReg)
@@ -1330,20 +1313,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     if (isAM2) {
       // LDR_PRE, LDR_POST
       if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
-        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
-Bytes : Bytes; - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1353,15 +1334,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -1371,6 +1351,66 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return true; } +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) + return false; + + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) + return false; + + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? 
ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); + + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); + + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MBB.erase(MBBI); + return true; +} + /// Returns true if instruction is a memory operation that this pass is capable /// of operating on. static bool isMemoryOp(const MachineInstr *MI) { @@ -1426,26 +1466,10 @@ static bool isMemoryOp(const MachineInstr *MI) { return false; } -/// Advance register scavenger to just before the earliest memory op that is -/// being merged. -void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { - MachineBasicBlock::iterator Loc = MemOps[0].MBBI; - unsigned Position = MemOps[0].Position; - for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { - if (MemOps[i].Position < Position) { - Position = MemOps[i].Position; - Loc = MemOps[i].MBBI; - } - } - - if (Loc != MBB.begin()) - RS->forward(std::prev(Loc)); -} - static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int Offset, bool isDef, - DebugLoc dl, unsigned NewOpc, + DebugLoc DL, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, bool OffKill, bool OffUndef, @@ -1491,7 +1515,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if (!Errata602117 && !NonConsecutiveRegs) return false; - MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? @@ -1531,7 +1554,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } - NewBBI = std::prev(MBBI); } else { // Split into two instructions. 
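// For illustration, the shape of this split (registers and offsets are made
// up; the real values come from the OffImm and OffImm+4 calls below):
//   ldrd r1, r2, [r0, #8]
// becomes
//   ldr  r1, [r0, #8]
//   ldr  r2, [r0, #12]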
unsigned NewOpc = (isLd) @@ -1553,7 +1575,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1573,7 +1594,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); - NewBBI = std::prev(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1585,191 +1605,160 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, ++NumSTRD2STR; } - MBB.erase(MI); - MBBI = NewBBI; + MBBI = MBB.erase(MBBI); return true; } /// An optimization pass to turn multiple LDR / STR ops of the same base and /// incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { - unsigned NumMerges = 0; - unsigned NumMemOps = 0; MemOpQueue MemOps; unsigned CurrBase = 0; unsigned CurrOpc = ~0u; - unsigned CurrSize = 0; ARMCC::CondCodes CurrPred = ARMCC::AL; - unsigned CurrPredReg = 0; unsigned Position = 0; - SmallVector<MachineBasicBlock::iterator,4> Merges; - - RS->enterBasicBlock(&MBB); - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { + assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); + LiveRegsValid = false; + + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); if (FixInvalidRegPairOp(MBB, MBBI)) continue; + ++Position; - bool Advance = false; - bool TryMerge = false; - - bool isMemOp = isMemoryOp(MBBI); - if (isMemOp) { + if (isMemoryOp(MBBI)) { unsigned Opcode = MBBI->getOpcode(); - unsigned Size = getLSMultipleTransferSize(MBBI); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); - bool isKill = MO.isDef() ? false : MO.isKill(); - unsigned Base = MBBI->getOperand(1).getReg(); + unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); int Offset = getMemoryOpOffset(MBBI); - // Watch out for: - // r4 := ldr [r5] - // r5 := ldr [r5, #4] - // r6 := ldr [r5, #8] - // - // The second ldr has effectively broken the chain even though it - // looks like the later ldr(s) use the same base register. Try to - // merge the ldr's so far, including this one. But don't try to - // combine the following ldr(s). - bool Clobber = isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg(); - - // Watch out for: - // r4 := ldr [r0, #8] - // r4 := ldr [r0, #4] - // - // The optimization may reorder the second ldr in front of the first - // ldr, which violates write after write(WAW) dependence. The same as - // str. Try to merge inst(s) already in MemOps. - bool Overlap = false; - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { - if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) { - Overlap = true; - break; - } - } - - if (CurrBase == 0 && !Clobber) { + if (CurrBase == 0) { // Start of a new chain. 
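// A chain, for illustration (same base register, opcode and predicate,
// offsets strictly increasing; registers are made up):
//   ldr r1, [r4]
//   ldr r2, [r4, #4]
//   ldr r3, [r4, #8]
// which a later merge can turn into:
//   ldmia r4, {r1, r2, r3}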
CurrBase = Base; CurrOpc = Opcode; - CurrSize = Size; CurrPred = Pred; - CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); - ++NumMemOps; - Advance = true; - } else if (!Overlap) { - if (Clobber) { - TryMerge = true; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } + // Note: No need to match PredReg in the next if. + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // or + // r0 := ldr [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoadSingle(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } } - if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { - // No need to match PredReg. - // Continue adding to the queue. + if (!Overlap) { + // Check offset and sort memory operation into the current chain. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; } else { - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); - I != E; ++I) { - if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. break; - } else if (Offset == I->Offset) { - // Collision! This can't be merged! + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; break; } } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } } } } - } - if (MBBI->isDebugValue()) { - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else if (Advance) { - ++Position; - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else { - TryMerge = true; + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) { + continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); } - if (TryMerge) { - if (NumMemOps > 1) { - // Try to find a free register to use as a new base in case it's needed. - // First advance to the instruction just before the start of the chain. - AdvanceRS(MBB, MemOps); - - // Find a scratch register. - unsigned Scratch = - RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); - - // Process the load / store instructions. - RS->forward(std::prev(MBBI)); - - // Merge ops. - Merges.clear(); - MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, - CurrPred, CurrPredReg, Scratch, MemOps, Merges); - - // Try folding preceding/trailing base inc/dec into the generated - // LDM/STM ops. 
- for (unsigned i = 0, e = Merges.size(); i < e; ++i) - if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) - ++NumMerges; - NumMerges += Merges.size(); - - // Try folding preceding/trailing base inc/dec into those load/store - // that were not merged to form LDM/STM ops. - for (unsigned i = 0; i != NumMemOps; ++i) - if (!MemOps[i].Merged) - if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) - ++NumMerges; - - // RS may be pointing to an instruction that's deleted. - RS->skipTo(std::prev(MBBI)); - } else if (NumMemOps == 1) { - // Try folding preceding/trailing base inc/dec into the single - // load/store. - if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { - ++NumMerges; - RS->forward(std::prev(MBBI)); - } - } + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. CurrBase = 0; CurrOpc = ~0u; - CurrSize = 0; CurrPred = ARMCC::AL; - CurrPredReg = 0; - if (NumMemOps) { - MemOps.clear(); - NumMemOps = 0; - } + MemOps.clear(); + } + } + if (MemOps.size() > 0) + FormCandidates(MemOps); - // If iterator hasn't been advanced and this is not a memory op, skip it. - // It can't start a new chain anyway. - if (!Advance && !isMemOp && MBBI != E) { - ++Position; - ++MBBI; + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. + bool Changed = false; + for (const MergeCandidate *Candidate : Candidates) { + if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) { + MachineInstr *Merged = MergeOpsUpdate(*Candidate); + // Merge preceding/trailing base inc/dec into the merged op. + if (Merged) { + Changed = true; + unsigned Opcode = Merged->getOpcode(); + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else + MergeBaseUpdateLSMultiple(Merged); + } else { + for (MachineInstr *MI : Candidate->Instrs) { + if (MergeBaseUpdateLoadStore(MI)) + Changed = true; + } } + } else { + assert(Candidate->Instrs.size() == 1); + if (MergeBaseUpdateLoadStore(Candidate->Instrs.front())) + Changed = true; } } - return NumMerges > 0; + Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. 
+ for (MachineInstr *MI : MergeBaseCandidates)
+ MergeBaseUpdateLSDouble(*MI);
+ MergeBaseCandidates.clear();
+
+ return Changed;
}

/// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
@@ -1814,12 +1803,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ MF = &Fn;
STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
TL = STI->getTargetLowering();
AFI = Fn.getInfo<ARMFunctionInfo>();
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
- RS = new RegScavenger();
+ MRI = &Fn.getRegInfo();
+ RegClassInfoValid = false;
isThumb2 = AFI->isThumb2Function();
isThumb1 = AFI->isThumbFunction() && !isThumb2;

@@ -1832,7 +1823,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Modified |= MergeReturnIntoLDM(MBB);
}

- delete RS;
+ Allocator.DestroyAll();
return Modified;
}

@@ -2219,7 +2210,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
continue;

int Opc = MI->getOpcode();
- bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+ bool isLd = isLoadSingle(Opc);
unsigned Base = MI->getOperand(1).getReg();
int Offset = getMemoryOpOffset(MI);

diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index a59cf9851108..6cafbbb9f8eb 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -18,12 +18,6 @@ using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"

-ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
- : TargetSelectionDAGInfo(&DL) {}
-
-ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
-}
-
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
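A minimal sketch of that memset-of-zero conversion; AEABI_MEMCLR is the
enumerator used in the hunk below, while AEABI_MEMSET, the Src operand and
the abbreviated control flow are assumptions made for illustration only:

  if (AEABILibcall == AEABI_MEMSET)
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src))
      if (C->getZExtValue() == 0)
        AEABILibcall = AEABI_MEMCLR; // memset(p, 0, n) -> __aeabi_memclr*(p, n)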
@@ -83,7 +77,7 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - Entry.Ty = TLI->getDataLayout()->getIntPtrType(*DAG.getContext()); + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); Entry.Node = Dst; Args.push_back(Entry); if (AEABILibcall == AEABI_MEMCLR) { @@ -121,12 +115,14 @@ EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl, { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" } }; TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(TLI->getLibcallCallingConv(LC), - Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], - TLI->getPointerTy()), std::move(Args), 0) - .setDiscardResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee( + TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], + TLI->getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); return CallResult.second; diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h index 1db190f41e1a..289879ee1d7e 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -37,8 +37,6 @@ namespace ARM_AM { class ARMSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit ARMSelectionDAGInfo(const DataLayout &DL); - ~ARMSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 55808dfb9efe..002c3e9b6291 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -112,7 +112,6 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), - TSInfo(*TM.getDataLayout()), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. @@ -172,6 +171,7 @@ void ARMSubtarget::initializeEnvironment() { AllowsUnalignedMem = false; Thumb2DSP = false; UseNaClTrap = false; + GenLongCalls = false; UnsafeFPMath = false; } @@ -286,7 +286,7 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, if (RelocM == Reloc::Static) return false; - bool isDecl = GV->isDeclarationForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); if (!isTargetMachO()) { // Extra load is needed for all externally visible. @@ -294,34 +294,22 @@ ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, return false; return true; } else { - if (RelocM == Reloc::PIC_) { - // If this is a strong reference to a definition, it is definitely not - // through a stub. - if (!isDecl && !GV->isWeakForLinker()) - return false; - - // Unless we have a symbol with hidden visibility, we have to go through a - // normal $non_lazy_ptr stub because this symbol might be resolved late. - if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. - return true; + // If this is a strong reference to a definition, it is definitely not + // through a stub. 
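// ("Strong" here follows isStrongDefinitionForLinker(): a definition that is
//  neither a declaration nor weak for the linker, so the linker cannot
//  replace it with another copy at link time.)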
+ if (isDef) + return false; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return true; + if (RelocM == Reloc::PIC_) { // If symbol visibility is hidden, we have a stub for common symbol // references and external declarations. - if (isDecl || GV->hasCommonLinkage()) + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) // Hidden $non_lazy_ptr reference. return true; - - return false; - } else { - // If this is a strong reference to a definition, it is definitely not - // through a stub. - if (!isDecl && !GV->isWeakForLinker()) - return false; - - // Unless we have a symbol with hidden visibility, we have to go through a - // normal $non_lazy_ptr stub because this symbol might be resolved late. - if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. - return true; } } diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 9909a6a6d198..dd101df9b63d 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -206,6 +206,9 @@ protected: /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; + /// Generate calls via indirect call instructions. + bool GenLongCalls; + /// Target machine allowed unsafe FP math (such as use of NEON fp) bool UnsafeFPMath; @@ -342,6 +345,7 @@ public: bool hasMPExtension() const { return HasMPExtension; } bool hasThumb2DSP() const { return Thumb2DSP; } bool useNaClTrap() const { return UseNaClTrap; } + bool genLongCalls() const { return GenLongCalls; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 6e81bd2d349d..93495d66ae70 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -80,8 +80,7 @@ computeTargetABI(const Triple &TT, StringRef CPU, // FIXME: This is duplicated code from the front end and should be unified. 
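// (Behavioural sketch of the MachO checks below: an EABI environment, an
//  unknown OS with MachO, or a CPU starting with "cortex-m" selects AAPCS;
//  the elided else branch presumably keeps the APCS-derived Darwin default.)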
if (TT.isOSBinFormatMachO()) { if (TT.getEnvironment() == llvm::Triple::EABI || - (TT.getOS() == llvm::Triple::UnknownOS && - TT.getObjectFormat() == llvm::Triple::MachO) || + (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || CPU.startswith("cortex-m")) { TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; } else { @@ -104,8 +103,8 @@ computeTargetABI(const Triple &TT, StringRef CPU, TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; break; default: - if (TT.getOS() == llvm::Triple::NetBSD) - TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; + if (TT.isOSNetBSD()) + TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; else TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; break; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index f4901fc24e44..2f194cf7ae06 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -61,14 +61,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); if (Idx != -1) return LT.first * NEONFltDblTbl[Idx].Cost; } - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); @@ -282,8 +282,8 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } }; - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); + EVT SelCondTy = TLI->getValueType(DL, CondTy); + EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, SelCondTy.getSimpleVT(), @@ -292,7 +292,7 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return NEONVectorSelectTbl[Idx].Cost; } - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); return LT.first; } @@ -353,7 +353,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx == -1) @@ -379,7 +379,7 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); int Idx = CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx == -1) @@ -395,7 +395,7 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd2PropInfo) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); const unsigned FunctionCallDivCost = 20; const unsigned ReciprocalDivCost = 10; @@ -468,7 +468,7 @@ 
unsigned ARMTTIImpl::getArithmeticInstrCost( unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { @@ -488,12 +488,12 @@ unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index f2e5db655ccf..84f256f73722 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -42,7 +42,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { public: explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. ARMTTIImpl(const ARMTTIImpl &Arg) @@ -50,18 +51,6 @@ public: ARMTTIImpl(ARMTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - ARMTTIImpl &operator=(const ARMTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - ARMTTIImpl &operator=(ARMTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c2db74619871..f8f0eb2d4baa 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -189,9 +189,9 @@ class ARMAsmParser : public MCTargetAsmParser { return getParser().Error(L, Msg, Ranges); } - bool validatetLDMRegList(MCInst Inst, const OperandVector &Operands, + bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo, bool IsARPop = false); - bool validatetSTMRegList(MCInst Inst, const OperandVector &Operands, + bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo); int tryParseRegister(); @@ -242,6 +242,8 @@ class ARMAsmParser : public MCTargetAsmParser { bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode); + void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting, + OperandVector &Operands); bool isThumb() const { // FIXME: Can tablegen auto-generate this? 
return STI.getFeatureBits()[ARM::ModeThumb];
@@ -5465,6 +5467,92 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
CanAcceptPredicationCode = true;
}

+// \brief Some Thumb instructions have two operand forms that are not
+// available as three operand; convert to the two operand form if possible.
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+ bool CarrySetting,
+ OperandVector &Operands) {
+ if (Operands.size() != 6)
+ return;
+
+ const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+ auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+ if (!Op3.isReg() || !Op4.isReg())
+ return;
+
+ auto Op3Reg = Op3.getReg();
+ auto Op4Reg = Op4.getReg();
+
+ // For most Thumb2 cases we just generate the 3 operand form and reduce
+ // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+ // won't accept SP or PC so we do the transformation here taking care
+ // with immediate range in the 'add sp, sp, #imm' case.
+ auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+ if (isThumbTwo()) {
+ if (Mnemonic != "add")
+ return;
+ bool TryTransform = Op3Reg == ARM::PC || Op4Reg == ARM::PC ||
+ (Op5.isReg() && Op5.getReg() == ARM::PC);
+ if (!TryTransform) {
+ TryTransform = (Op3Reg == ARM::SP || Op4Reg == ARM::SP ||
+ (Op5.isReg() && Op5.getReg() == ARM::SP)) &&
+ !(Op3Reg == ARM::SP && Op4Reg == ARM::SP &&
+ Op5.isImm() && !Op5.isImm0_508s4());
+ }
+ if (!TryTransform)
+ return;
+ } else if (!isThumbOne())
+ return;
+
+ if (!(Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+ Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+ Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+ Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic"))
+ return;
+
+ // If first 2 operands of a 3 operand instruction are the same
+ // then transform to 2 operand version of the same instruction
+ // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+ bool Transform = Op3Reg == Op4Reg;
+
+ // For commutative operations, we might be able to transform if we swap
+ // Op4 and Op5. The 'ADD Rdm, SP, Rdm' form is already handled specially
+ // as tADDrsp.
+ const ARMOperand *LastOp = &Op5;
+ bool Swap = false;
+ if (!Transform && Op5.isReg() && Op3Reg == Op5.getReg() &&
+ ((Mnemonic == "add" && Op4Reg != ARM::SP) ||
+ Mnemonic == "and" || Mnemonic == "eor" ||
+ Mnemonic == "adc" || Mnemonic == "orr")) {
+ Swap = true;
+ LastOp = &Op4;
+ Transform = true;
+ }
+
+ // If both registers are the same then remove one of them from
+ // the operand list, with certain exceptions.
+ if (Transform) {
+ // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+ // 2 operand forms don't exist.
+ if (((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub") &&
+ LastOp->isReg())
+ Transform = false;
+
+ // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+ // 3-bits because the ARMARM says not to.
+ if ((Mnemonic == "add" || Mnemonic == "sub") && LastOp->isImm0_7())
+ Transform = false;
+ }
+
+ if (Transform) {
+ if (Swap)
+ std::swap(Op4, Op5);
+ Operands.erase(Operands.begin() + 3);
+ }
+}
+
bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
OperandVector &Operands) {
// FIXME: This is all horribly hacky.
We really need a better way to deal
@@ -5838,6 +5926,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
"VFP/Neon double precision register expected");
}

+ tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -5901,48 +5991,6 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}

- // If first 2 operands of a 3 operand instruction are the same
- // then transform to 2 operand version of the same instruction
- // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 6 &&
- (Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
- Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
- Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
- Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic")) {
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- ARMOperand &Op4 = static_cast<ARMOperand &>(*Operands[4]);
- ARMOperand &Op5 = static_cast<ARMOperand &>(*Operands[5]);
-
- // If both registers are the same then remove one of them from
- // the operand list.
- if (Op3.isReg() && Op4.isReg() && Op3.getReg() == Op4.getReg()) {
- // If 3rd operand (variable Op5) is a register and the instruction is adds/sub
- // then do not transform as the backend already handles this instruction
- // correctly.
- if (!Op5.isReg() || !((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub")) {
- Operands.erase(Operands.begin() + 3);
- if (Mnemonic == "add" && !CarrySetting) {
- // Special case for 'add' (not 'adds') instruction must
- // remove the CCOut operand as well.
- Operands.erase(Operands.begin() + 1);
- }
- }
- }
- }
-
- // If instruction is 'add' and first two register operands
- // use SP register, then remove one of the SP registers from
- // the instruction.
- // FIXME: We would really like to be able to tablegen'erate this.
- if (isThumbOne() && Operands.size() == 5 && Mnemonic == "add" && !CarrySetting) {
- ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
- ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
- if (Op2.isReg() && Op3.isReg() && Op2.getReg() == ARM::SP && Op3.getReg() == ARM::SP) {
- Operands.erase(Operands.begin() + 2);
- }
- }
-
// GNU Assembler extension (compatibility)
if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
@@ -5985,8 +6033,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// return 'true' if register list contains non-low GPR registers,
// 'false' otherwise. If Reg is in the register list or is HiReg, set
// 'containsReg' to true.
-static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
- unsigned HiReg, bool &containsReg) {
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+ unsigned Reg, unsigned HiReg,
+ bool &containsReg) {
containsReg = false;
for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
unsigned OpReg = Inst.getOperand(i).getReg();
@@ -6001,8 +6050,8 @@ static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
// Check if the specified register is in the register list of the inst,
// starting at the indicated operand number.
-static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) { - for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) { +static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) { + for (unsigned i = OpNo, e = Inst.getNumOperands(); i < e; ++i) { unsigned OpReg = Inst.getOperand(i).getReg(); if (OpReg == Reg) return true; @@ -6020,7 +6069,7 @@ static bool instIsBreakpoint(const MCInst &Inst) { } -bool ARMAsmParser::validatetLDMRegList(MCInst Inst, +bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo, bool IsARPop) { const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); @@ -6043,7 +6092,7 @@ bool ARMAsmParser::validatetLDMRegList(MCInst Inst, return false; } -bool ARMAsmParser::validatetSTMRegList(MCInst Inst, +bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned ListNo) { const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]); @@ -8167,8 +8216,16 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // If the destination and first source operand are the same, and // there's no setting of the flags, use encoding T2 instead of T3. // Note that this is only for ADD, not SUB. This mirrors the system - // 'as' behaviour. Make sure the wide encoding wasn't explicit. - if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || + // 'as' behaviour. Also take advantage of ADD being commutative. + // Make sure the wide encoding wasn't explicit. + bool Swap = false; + auto DestReg = Inst.getOperand(0).getReg(); + bool Transform = DestReg == Inst.getOperand(1).getReg(); + if (!Transform && DestReg == Inst.getOperand(2).getReg()) { + Transform = true; + Swap = true; + } + if (!Transform || Inst.getOperand(5).getReg() != 0 || (static_cast<ARMOperand &>(*Operands[3]).isToken() && static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) @@ -8177,7 +8234,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.setOpcode(ARM::tADDhirr); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); - TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(Swap ? 
1 : 2)); TmpInst.addOperand(Inst.getOperand(3)); TmpInst.addOperand(Inst.getOperand(4)); Inst = TmpInst; @@ -9176,8 +9233,7 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { return false; } - STI.InitMCProcessorInfo(CPU, ""); - STI.InitCPUSchedModel(CPU); + STI.setDefaultFeatures(CPU); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); return false; diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 4d12bfb5d60f..d17fdb95dbdf 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1362,7 +1362,7 @@ MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) { MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); - if (TT.getObjectFormat() == Triple::ELF) + if (TT.isOSBinFormatELF()) return new ARMTargetELFStreamer(S); return new ARMTargetStreamer(S); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index fafe25ae5be5..21c9fc1e58b2 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -31,7 +31,7 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "ARMGenRegisterInfo.inc" -static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] && (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) && @@ -63,7 +63,7 @@ static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) { @@ -75,7 +75,7 @@ static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && "cannot predicate thumb instructions"); @@ -92,7 +92,7 @@ static bool getARMStoreDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, return false; } -static bool getARMLoadDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI, +static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, std::string &Info) { assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] && "cannot predicate thumb instructions"); @@ -257,9 +257,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, ArchFS = FS; } - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitARMMCSubtargetInfo(X, TT, CPU, ArchFS); - return X; + return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS); } static MCInstrInfo *createARMMCInstrInfo() { @@ -268,7 +266,7 @@ static MCInstrInfo *createARMMCInstrInfo() { return X; } -static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) { +static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC); return X; @@ -279,10 +277,10 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO()) MAI = new 
ARMMCAsmInfoDarwin(TheTriple); - else if (TheTriple.isWindowsItaniumEnvironment()) - MAI = new ARMCOFFMCAsmInfoGNU(); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new ARMCOFFMCAsmInfoMicrosoft(); + else if (TheTriple.isOSWindows()) + MAI = new ARMCOFFMCAsmInfoGNU(); else MAI = new ARMELFMCAsmInfo(TheTriple); @@ -292,14 +290,13 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createARMMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { - Triple TheTriple(TT); // Default relocation model on Darwin is PIC, not DynamicNoPIC. - RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; + RM = TT.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; } X->initMCCodeGenInfo(RM, CM, OL); return X; diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 77cd890e4cad..3b4358b5d9bf 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -365,7 +365,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, // frame pointer stack slot, the target is ELF and the function has FP, or // the target uses var sized objects. if (NumBytes) { - assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) && + assert(!MFI->getPristineRegs(MF).test(ARM::R4) && "No scratch register to restore SP from FP!"); emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, TII, *RegInfo); diff --git a/lib/Target/BPF/BPFFrameLowering.cpp b/lib/Target/BPF/BPFFrameLowering.cpp index 54c5ececc7de..c2806c85f24f 100644 --- a/lib/Target/BPF/BPFFrameLowering.cpp +++ b/lib/Target/BPF/BPFFrameLowering.cpp @@ -29,12 +29,12 @@ void BPFFrameLowering::emitPrologue(MachineFunction &MF, void BPFFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {} -void BPFFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - - MRI.setPhysRegUnused(BPF::R6); - MRI.setPhysRegUnused(BPF::R7); - MRI.setPhysRegUnused(BPF::R8); - MRI.setPhysRegUnused(BPF::R9); +void BPFFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + SavedRegs.reset(BPF::R6); + SavedRegs.reset(BPF::R7); + SavedRegs.reset(BPF::R8); + SavedRegs.reset(BPF::R9); } diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 3b9fc443e053..251cda965ff5 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -28,8 +28,8 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index 38c56bbef81e..58498a1aec7d 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -302,8 +302,9 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 
DAG.getContext()->diagnose(Err); } + auto PtrVT = getPointerTy(MF.getDataLayout()); Chain = DAG.getCALLSEQ_START( - Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true), CLI.DL); + Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL); SmallVector<std::pair<unsigned, SDValue>, 5> RegsToPass; @@ -350,10 +351,10 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, getPointerTy(), + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT, G->getOffset(), 0); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) - Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), 0); + Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0); // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -374,8 +375,8 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Create the CALLSEQ_END node. Chain = DAG.getCALLSEQ_END( - Chain, DAG.getConstant(NumBytes, CLI.DL, getPointerTy(), true), - DAG.getConstant(0, CLI.DL, getPointerTy(), true), InFlag, CLI.DL); + Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), + DAG.getConstant(0, CLI.DL, PtrVT, true), InFlag, CLI.DL); InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we diff --git a/lib/Target/BPF/BPFSubtarget.cpp b/lib/Target/BPF/BPFSubtarget.cpp index 65acd585116d..c3a8b1caa63d 100644 --- a/lib/Target/BPF/BPFSubtarget.cpp +++ b/lib/Target/BPF/BPFSubtarget.cpp @@ -28,4 +28,4 @@ void BPFSubtarget::anchor() {} BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this), - TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {} + TLInfo(TM, *this) {} diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 3e928fc93a37..840570ebc392 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -40,7 +40,7 @@ static MCInstrInfo *createBPFMCInstrInfo() { return X; } -static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitBPFMCRegisterInfo(X, BPF::R11 /* RAReg doesn't exist */); return X; @@ -48,12 +48,10 @@ static MCRegisterInfo *createBPFMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitBPFMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createBPFMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createBPFMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createBPFMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index bc5d7f65b2f6..272688edb8a1 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -2148,8 +2148,8 @@ char CppWriter::ID = 0; bool 
CPPTargetMachine::addPassesToEmitFile( PassManagerBase &PM, raw_pwrite_stream &o, CodeGenFileType FileType, - bool DisableVerify, AnalysisID StartAfter, AnalysisID StopAfter, - MachineFunctionInitializer *MFInitializer) { + bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter, + AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; auto FOut = llvm::make_unique<formatted_raw_ostream>(o); diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index ebf0635b12e4..00e402feffbc 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -31,7 +31,8 @@ struct CPPTargetMachine : public TargetMachine { public: bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, bool DisableVerify, - AnalysisID StartAfter, AnalysisID StopAfter, + AnalysisID StartBefore, AnalysisID StartAfter, + AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) override; }; diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp new file mode 100644 index 000000000000..cb7e633fb82f --- /dev/null +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -0,0 +1,1127 @@ +//===--- BitTracker.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// SSA-based bit propagation. +// +// The purpose of this code is, for a given virtual register, to provide +// information about the value of each bit in the register. The values +// of bits are represented by the class BitValue, and take one of four +// cases: 0, 1, "ref" and "bottom". The 0 and 1 are rather clear, the +// "ref" value means that the bit is a copy of another bit (which itself +// cannot be a copy of yet another bit---such chains are not allowed). +// A "ref" value is associated with a BitRef structure, which indicates +// which virtual register, and which bit in that register is the origin +// of the value. For example, given an instruction +// vreg2 = ASL vreg1, 1 +// assuming that nothing is known about bits of vreg1, bit 1 of vreg2 +// will be a "ref" to (vreg1, 0). If there is a subsequent instruction +// vreg3 = ASL vreg2, 2 +// then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well. +// The "bottom" case means that the bit's value cannot be determined, +// and that this virtual register actually defines it. The "bottom" case +// is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref +// to self", so for the vreg1 above, the bit 0 of it will be a "ref" to +// (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc. +// +// The tracker implements the Wegman-Zadeck algorithm, originally developed +// for SSA-based constant propagation. Each register is represented as +// a sequence of bits, with the convention that bit 0 is the least signi- +// ficant bit. Each bit is propagated individually. The class RegisterCell +// implements the register's representation, and is also the subject of +// the lattice operations in the tracker. +// +// The intended usage of the bit tracker is to create a target-specific +// machine instruction evaluator, pass the evaluator to the BitTracker +// object, and run the tracker. 
The tracker will then collect the bit +// value information for a given machine function. After that, it can be +// queried for the cells for each virtual register. +// Sample code: +// const TargetSpecificEvaluator TSE(TRI, MRI); +// BitTracker BT(TSE, MF); +// BT.run(); +// ... +// unsigned Reg = interestingRegister(); +// RegisterCell RC = BT.get(Reg); +// if (RC[3].is(1)) +// Reg0bit3 = 1; +// +// The code below is intended to be fully target-independent. + +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include "BitTracker.h" + +using namespace llvm; + +typedef BitTracker BT; + +namespace { + // Local trickery to pretty print a register (without the whole "%vreg" + // business). + struct printv { + printv(unsigned r) : R(r) {} + unsigned R; + }; + raw_ostream &operator<< (raw_ostream &OS, const printv &PV) { + if (PV.R) + OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R); + else + OS << 's'; + return OS; + } +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::BitValue &BV) { + switch (BV.Type) { + case BT::BitValue::Top: + OS << 'T'; + break; + case BT::BitValue::Zero: + OS << '0'; + break; + case BT::BitValue::One: + OS << '1'; + break; + case BT::BitValue::Ref: + OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']'; + break; + } + return OS; +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const BT::RegisterCell &RC) { + unsigned n = RC.Bits.size(); + OS << "{ w:" << n; + // Instead of printing each bit value individually, try to group them + // into logical segments, such as sequences of 0 or 1 bits or references + // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz"). + // "Start" will be the index of the beginning of the most recent segment. + unsigned Start = 0; + bool SeqRef = false; // A sequence of refs to consecutive bits. + bool ConstRef = false; // A sequence of refs to the same bit. + + for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) { + const BT::BitValue &V = RC[i]; + const BT::BitValue &SV = RC[Start]; + bool IsRef = (V.Type == BT::BitValue::Ref); + // If the current value is the same as Start, skip to the next one. + if (!IsRef && V == SV) + continue; + if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) { + if (Start+1 == i) { + SeqRef = (V.RefI.Pos == SV.RefI.Pos+1); + ConstRef = (V.RefI.Pos == SV.RefI.Pos); + } + if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start)) + continue; + if (ConstRef && V.RefI.Pos == SV.RefI.Pos) + continue; + } + + // The current value is different. Print the previous one and reset + // the Start. 
+ OS << " [" << Start;
+ unsigned Count = i - Start;
+ if (Count == 1) {
+ OS << "]:" << SV;
+ } else {
+ OS << '-' << i-1 << "]:";
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ Start = i;
+ SeqRef = ConstRef = false;
+ }
+
+ OS << " [" << Start;
+ unsigned Count = n - Start;
+ if (n-Start == 1) {
+ OS << "]:" << RC[Start];
+ } else {
+ OS << '-' << n-1 << "]:";
+ const BT::BitValue &SV = RC[Start];
+ if (SV.Type == BT::BitValue::Ref && SeqRef)
+ OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+ << SV.RefI.Pos+(Count-1) << ']';
+ else
+ OS << SV;
+ }
+ OS << " }";
+
+ return OS;
+}
+
+BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
+ : Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
+
+BitTracker::~BitTracker() {
+ delete &Map;
+}
+
+
+// If we were allowed to update a cell for a part of a register, the meet
+// operation would need to be parametrized by the register number and the
+// exact part of the register, so that the computed BitRefs correspond to
+// the actual bits of the "self" register.
+// While this cannot happen in the current implementation, I'm not sure
+// if this should be ruled out in the future.
+bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+ // An example when "meet" can be invoked with SelfR == 0 is a phi node
+ // with a physical register as an operand.
+ assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+ bool Changed = false;
+ for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
+ const BitValue &RCV = RC[i];
+ Changed |= Bits[i].meet(RCV, BitRef(SelfR, i));
+ }
+ return Changed;
+}
+
+
+// Insert the entire cell RC into the current cell at position given by M.
+BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
+ const BitMask &M) {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ // Sanity: M must be a valid mask for *this.
+ assert(B < W && E < W);
+ // Sanity: the masked part of *this must have the same number of bits
+ // as the source.
+ assert(B > E || E-B+1 == RC.width()); // B <= E => E-B+1 = |RC|.
+ assert(B <= E || E+(W-B)+1 == RC.width()); // E < B => E+(W-B)+1 = |RC|.
+ if (B <= E) {
+ for (uint16_t i = 0; i <= E-B; ++i)
+ Bits[i+B] = RC[i];
+ } else {
+ for (uint16_t i = 0; i < W-B; ++i)
+ Bits[i+B] = RC[i];
+ for (uint16_t i = 0; i <= E; ++i)
+ Bits[i] = RC[i+(W-B)];
+ }
+ return *this;
+}
+
+
+BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
+ uint16_t B = M.first(), E = M.last(), W = width();
+ assert(B < W && E < W);
+ if (B <= E) {
+ RegisterCell RC(E-B+1);
+ for (uint16_t i = B; i <= E; ++i)
+ RC.Bits[i-B] = Bits[i];
+ return RC;
+ }
+
+ RegisterCell RC(E+(W-B)+1);
+ for (uint16_t i = 0; i < W-B; ++i)
+ RC.Bits[i] = Bits[i+B];
+ for (uint16_t i = 0; i <= E; ++i)
+ RC.Bits[i+(W-B)] = Bits[i];
+ return RC;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
+ // Rotate left (i.e. towards increasing bit indices).
+ // Swap the two parts: [0..W-Sh-1] [W-Sh..W-1]
+ uint16_t W = width();
+ Sh = Sh % W;
+ if (Sh == 0)
+ return *this;
+
+ RegisterCell Tmp(W-Sh);
+ // Tmp = [0..W-Sh-1].
+ for (uint16_t i = 0; i < W-Sh; ++i)
+ Tmp[i] = Bits[i];
+ // Shift [W-Sh..W-1] to [0..Sh-1].
+ for (uint16_t i = 0; i < Sh; ++i)
+ Bits[i] = Bits[W-Sh+i];
+ // Copy Tmp to [Sh..W-1].
+ for (uint16_t i = 0; i < W-Sh; ++i) + Bits[i+Sh] = Tmp.Bits[i]; + return *this; +} + + +BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E, + const BitValue &V) { + assert(B <= E); + while (B < E) + Bits[B++] = V; + return *this; +} + + +BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) { + // Append the cell given as the argument to the "this" cell. + // Bit 0 of RC becomes bit W of the result, where W is this->width(). + uint16_t W = width(), WRC = RC.width(); + Bits.resize(W+WRC); + for (uint16_t i = 0; i < WRC; ++i) + Bits[i+W] = RC.Bits[i]; + return *this; +} + + +uint16_t BT::RegisterCell::ct(bool B) const { + uint16_t W = width(); + uint16_t C = 0; + BitValue V = B; + while (C < W && Bits[C] == V) + C++; + return C; +} + + +uint16_t BT::RegisterCell::cl(bool B) const { + uint16_t W = width(); + uint16_t C = 0; + BitValue V = B; + while (C < W && Bits[W-(C+1)] == V) + C++; + return C; +} + + +bool BT::RegisterCell::operator== (const RegisterCell &RC) const { + uint16_t W = Bits.size(); + if (RC.Bits.size() != W) + return false; + for (uint16_t i = 0; i < W; ++i) + if (Bits[i] != RC[i]) + return false; + return true; +} + + +uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { + // The general problem is with finding a register class that corresponds + // to a given reference reg:sub. There can be several such classes, and + // since we only care about the register size, it does not matter which + // such class we would find. + // The easiest way to accomplish what we want is to + // 1. find a physical register PhysR from the same class as RR.Reg, + // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub, + // 3. find a register class that contains PhysS. + unsigned PhysR; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + const TargetRegisterClass *VC = MRI.getRegClass(RR.Reg); + assert(VC->begin() != VC->end() && "Empty register class"); + PhysR = *VC->begin(); + } else { + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + PhysR = RR.Reg; + } + + unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS); + uint16_t BW = RC->getSize()*8; + return BW; +} + + +BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR, + const CellMapType &M) const { + uint16_t BW = getRegBitWidth(RR); + + // Physical registers are assumed to be present in the map with an unknown + // value. Don't actually insert anything in the map, just return the cell. + if (TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return RegisterCell::self(0, BW); + + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // For virtual registers that belong to a class that is not tracked, + // generate an "unknown" value as well. + const TargetRegisterClass *C = MRI.getRegClass(RR.Reg); + if (!track(C)) + return RegisterCell::self(0, BW); + + CellMapType::const_iterator F = M.find(RR.Reg); + if (F != M.end()) { + if (!RR.Sub) + return F->second; + BitMask M = mask(RR.Reg, RR.Sub); + return F->second.extract(M); + } + // If not found, create a "top" entry, but do not insert it in the map. + return RegisterCell::top(BW); +} + + +void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC, + CellMapType &M) const { + // While updating the cell map can be done in a meaningful way for + // a part of a register, it makes little sense to implement it as the + // SSA representation would never contain such "partial definitions". 
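+  // (Sub-register reads are still handled: getCell() above extracts the
+  // masked slice of the full cell on lookup.)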
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return;
+  assert(RR.Sub == 0 && "Unexpected sub-register in definition");
+  // Eliminate all ref-to-reg-0 bit values: replace them with "self".
+  for (unsigned i = 0, n = RC.width(); i < n; ++i) {
+    const BitValue &V = RC[i];
+    if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+      RC[i].RefI = BitRef(RR.Reg, i);
+  }
+  M[RR.Reg] = RC;
+}
+
+
+// Check if the cell represents a compile-time integer value.
+bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i)
+    if (!A[i].is(0) && !A[i].is(1))
+      return false;
+  return true;
+}
+
+
+// Convert a cell to the integer value. The result must fit in uint64_t.
+uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
+  assert(isInt(A));
+  uint64_t Val = 0;
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    Val <<= 1;
+    Val |= A[i].is(1);
+  }
+  return Val;
+}
+
+
+// Evaluator helper functions. These implement some common operations on
+// register cells that can be used to implement target-specific instructions
+// in a target-specific evaluator.
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
+  RegisterCell Res(W);
+  // For bits beyond the 63rd, this will generate the sign bit of V.
+  for (uint16_t i = 0; i < W; ++i) {
+    Res[i] = BitValue(V & 1);
+    V >>= 1;
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
+  APInt A = CI->getValue();
+  uint16_t BW = A.getBitWidth();
+  assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
+  RegisterCell Res(BW);
+  for (uint16_t i = 0; i < BW; ++i)
+    Res[i] = A[i];
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Carry = false;
+  uint16_t I;
+  for (I = 0; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (!V1.num() || !V2.num())
+      break;
+    unsigned S = bool(V1) + bool(V2) + Carry;
+    Res[I] = BitValue(S & 1);
+    Carry = (S > 1);
+  }
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    // If the next bit is the same as Carry, the result will be 0 plus the
+    // other bit. The Carry bit will remain unchanged.
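+    // E.g. if Carry == 0 and V1 is a known 0 while V2 is a ref to v5[7],
+    // the sum bit is simply that ref and Carry stays 0. Once neither
+    // operand matches Carry, the remaining bits become "self" (bottom).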
+ if (V1.is(Carry)) + Res[I] = BitValue::ref(V2); + else if (V2.is(Carry)) + Res[I] = BitValue::ref(V1); + else + break; + } + for (; I < W; ++I) + Res[I] = BitValue::self(); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + bool Borrow = false; + uint16_t I; + for (I = 0; I < W; ++I) { + const BitValue &V1 = A1[I]; + const BitValue &V2 = A2[I]; + if (!V1.num() || !V2.num()) + break; + unsigned S = bool(V1) - bool(V2) - Borrow; + Res[I] = BitValue(S & 1); + Borrow = (S > 1); + } + for (; I < W; ++I) { + const BitValue &V1 = A1[I]; + const BitValue &V2 = A2[I]; + if (V1.is(Borrow)) { + Res[I] = BitValue::ref(V2); + break; + } + if (V2.is(Borrow)) + Res[I] = BitValue::ref(V1); + else + break; + } + for (; I < W; ++I) + Res[I] = BitValue::self(); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width() + A2.width(); + uint16_t Z = A1.ct(0) + A2.ct(0); + RegisterCell Res(W); + Res.fill(0, Z, BitValue::Zero); + Res.fill(Z, W, BitValue::self()); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width() + A2.width(); + uint16_t Z = A1.ct(0) + A2.ct(0); + RegisterCell Res(W); + Res.fill(0, Z, BitValue::Zero); + Res.fill(Z, W, BitValue::self()); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1, + uint16_t Sh) const { + assert(Sh <= A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res.rol(Sh); + Res.fill(0, Sh, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1, + uint16_t Sh) const { + uint16_t W = A1.width(); + assert(Sh <= W); + RegisterCell Res = RegisterCell::ref(A1); + Res.rol(W-Sh); + Res.fill(W-Sh, W, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1, + uint16_t Sh) const { + uint16_t W = A1.width(); + assert(Sh <= W); + RegisterCell Res = RegisterCell::ref(A1); + BitValue Sign = Res[W-1]; + Res.rol(W-Sh); + Res.fill(W-Sh, W, Sign); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(1)) + Res[i] = BitValue::ref(V2); + else if (V2.is(1)) + Res[i] = BitValue::ref(V1); + else if (V1.is(0) || V2.is(0)) + Res[i] = BitValue::Zero; + else if (V1 == V2) + Res[i] = V1; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(1) || V2.is(1)) + Res[i] = BitValue::One; + else if (V1.is(0)) + Res[i] = BitValue::ref(V2); + else if (V2.is(0)) + Res[i] = BitValue::ref(V1); + else if (V1 == V2) + Res[i] = V1; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1, + const RegisterCell &A2) const { + uint16_t W = A1.width(); + assert(W == A2.width()); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; 
++i) { + const BitValue &V1 = A1[i]; + const BitValue &V2 = A2[i]; + if (V1.is(0)) + Res[i] = BitValue::ref(V2); + else if (V2.is(0)) + Res[i] = BitValue::ref(V1); + else if (V1 == V2) + Res[i] = BitValue::Zero; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const { + uint16_t W = A1.width(); + RegisterCell Res(W); + for (uint16_t i = 0; i < W; ++i) { + const BitValue &V = A1[i]; + if (V.is(0)) + Res[i] = BitValue::One; + else if (V.is(1)) + Res[i] = BitValue::Zero; + else + Res[i] = BitValue::self(); + } + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1, + uint16_t BitN) const { + assert(BitN < A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res[BitN] = BitValue::One; + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1, + uint16_t BitN) const { + assert(BitN < A1.width()); + RegisterCell Res = RegisterCell::ref(A1); + Res[BitN] = BitValue::Zero; + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B, + uint16_t W) const { + uint16_t C = A1.cl(B), AW = A1.width(); + // If the last leading non-B bit is not a constant, then we don't know + // the real count. + if ((C < AW && A1[AW-1-C].num()) || C == AW) + return eIMM(C, W); + return RegisterCell::self(0, W); +} + + +BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B, + uint16_t W) const { + uint16_t C = A1.ct(B), AW = A1.width(); + // If the last trailing non-B bit is not a constant, then we don't know + // the real count. + if ((C < AW && A1[C].num()) || C == AW) + return eIMM(C, W); + return RegisterCell::self(0, W); +} + + +BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1, + uint16_t FromN) const { + uint16_t W = A1.width(); + assert(FromN <= W); + RegisterCell Res = RegisterCell::ref(A1); + BitValue Sign = Res[FromN-1]; + // Sign-extend "inreg". + Res.fill(FromN, W, Sign); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1, + uint16_t FromN) const { + uint16_t W = A1.width(); + assert(FromN <= W); + RegisterCell Res = RegisterCell::ref(A1); + Res.fill(FromN, W, BitValue::Zero); + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1, + uint16_t B, uint16_t E) const { + uint16_t W = A1.width(); + assert(B < W && E <= W); + if (B == E) + return RegisterCell(0); + uint16_t Last = (E > 0) ? E-1 : W-1; + RegisterCell Res = RegisterCell::ref(A1).extract(BT::BitMask(B, Last)); + // Return shorter cell. + return Res; +} + + +BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1, + const RegisterCell &A2, uint16_t AtN) const { + uint16_t W1 = A1.width(), W2 = A2.width(); + (void)W1; + assert(AtN < W1 && AtN+W2 <= W1); + // Copy bits from A1, insert A2 at position AtN. 
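+  // E.g. inserting a 2-bit A2 into an 8-bit A1 at AtN == 4 replaces
+  // bits [4-5]; all other result bits remain refs to A1.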
+  RegisterCell Res = RegisterCell::ref(A1);
+  if (W2 > 0)
+    Res.insert(RegisterCell::ref(A2), BT::BitMask(AtN, AtN+W2-1));
+  return Res;
+}
+
+
+BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+  assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
+  uint16_t W = getRegBitWidth(Reg);
+  assert(W > 0 && "Cannot generate mask for empty register");
+  return BitMask(0, W-1);
+}
+
+
+bool BT::MachineEvaluator::evaluate(const MachineInstr *MI,
+      const CellMapType &Inputs, CellMapType &Outputs) const {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::REG_SEQUENCE: {
+      RegisterRef RD = MI->getOperand(0);
+      assert(RD.Sub == 0);
+      RegisterRef RS = MI->getOperand(1);
+      unsigned SS = MI->getOperand(2).getImm();
+      RegisterRef RT = MI->getOperand(3);
+      unsigned ST = MI->getOperand(4).getImm();
+      assert(SS != ST);
+
+      uint16_t W = getRegBitWidth(RD);
+      RegisterCell Res(W);
+      Res.insert(RegisterCell::ref(getCell(RS, Inputs)), mask(RD.Reg, SS));
+      Res.insert(RegisterCell::ref(getCell(RT, Inputs)), mask(RD.Reg, ST));
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    case TargetOpcode::COPY: {
+      // COPY can transfer a smaller register into a wider one.
+      // If that is the case, fill the remaining high bits with 0.
+      RegisterRef RD = MI->getOperand(0);
+      RegisterRef RS = MI->getOperand(1);
+      assert(RD.Sub == 0);
+      uint16_t WD = getRegBitWidth(RD);
+      uint16_t WS = getRegBitWidth(RS);
+      assert(WD >= WS);
+      RegisterCell Src = getCell(RS, Inputs);
+      RegisterCell Res(WD);
+      Res.insert(Src, BitMask(0, WS-1));
+      Res.fill(WS, WD, BitValue::Zero);
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    default:
+      return false;
+  }
+
+  return true;
+}
+
+
+// Main W-Z (Wegman-Zadeck) implementation.
+
+void BT::visitPHI(const MachineInstr *PI) {
+  int ThisN = PI->getParent()->getNumber();
+  if (Trace)
+    dbgs() << "Visit PHI(BB#" << ThisN << "): " << *PI;
+
+  const MachineOperand &MD = PI->getOperand(0);
+  assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
+  RegisterRef DefRR(MD);
+  uint16_t DefBW = ME.getRegBitWidth(DefRR);
+
+  RegisterCell DefC = ME.getCell(DefRR, Map);
+  if (DefC == RegisterCell::self(DefRR.Reg, DefBW))   // XXX slow
+    return;
+
+  bool Changed = false;
+
+  for (unsigned i = 1, n = PI->getNumOperands(); i < n; i += 2) {
+    const MachineBasicBlock *PB = PI->getOperand(i+1).getMBB();
+    int PredN = PB->getNumber();
+    if (Trace)
+      dbgs() << "  edge BB#" << PredN << "->BB#" << ThisN;
+    if (!EdgeExec.count(CFGEdge(PredN, ThisN))) {
+      if (Trace)
+        dbgs() << " not executable\n";
+      continue;
+    }
+
+    RegisterRef RU = PI->getOperand(i);
+    RegisterCell ResC = ME.getCell(RU, Map);
+    if (Trace)
+      dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+             << " cell: " << ResC << "\n";
+    Changed |= DefC.meet(ResC, DefRR.Reg);
+  }
+
+  if (Changed) {
+    if (Trace)
+      dbgs() << "Output: " << PrintReg(DefRR.Reg, &ME.TRI, DefRR.Sub)
+             << " cell: " << DefC << "\n";
+    ME.putCell(DefRR, DefC, Map);
+    visitUsesOf(DefRR.Reg);
+  }
+}
+
+
+void BT::visitNonBranch(const MachineInstr *MI) {
+  if (Trace) {
+    int ThisN = MI->getParent()->getNumber();
+    dbgs() << "Visit MI(BB#" << ThisN << "): " << *MI;
+  }
+  if (MI->isDebugValue())
+    return;
+  assert(!MI->isBranch() && "Unexpected branch instruction");
+
+  CellMapType ResMap;
+  bool Eval = ME.evaluate(MI, Map, ResMap);
+
+  if (Trace && Eval) {
+    for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      RegisterRef RU(MO);
+      dbgs() <<
" input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub) + << " cell: " << ME.getCell(RU, Map) << "\n"; + } + dbgs() << "Outputs:\n"; + for (CellMapType::iterator I = ResMap.begin(), E = ResMap.end(); + I != E; ++I) { + RegisterRef RD(I->first); + dbgs() << " " << PrintReg(I->first, &ME.TRI) << " cell: " + << ME.getCell(RD, ResMap) << "\n"; + } + } + + // Iterate over all definitions of the instruction, and update the + // cells accordingly. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + // Visit register defs only. + if (!MO.isReg() || !MO.isDef()) + continue; + RegisterRef RD(MO); + assert(RD.Sub == 0 && "Unexpected sub-register in definition"); + if (!TargetRegisterInfo::isVirtualRegister(RD.Reg)) + continue; + + bool Changed = false; + if (!Eval || !ResMap.has(RD.Reg)) { + // Set to "ref" (aka "bottom"). + uint16_t DefBW = ME.getRegBitWidth(RD); + RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW); + if (RefC != ME.getCell(RD, Map)) { + ME.putCell(RD, RefC, Map); + Changed = true; + } + } else { + RegisterCell DefC = ME.getCell(RD, Map); + RegisterCell ResC = ME.getCell(RD, ResMap); + // This is a non-phi instruction, so the values of the inputs come + // from the same registers each time this instruction is evaluated. + // During the propagation, the values of the inputs can become lowered + // in the sense of the lattice operation, which may cause different + // results to be calculated in subsequent evaluations. This should + // not cause the bottoming of the result in the map, since the new + // result is already reflecting the lowered inputs. + for (uint16_t i = 0, w = DefC.width(); i < w; ++i) { + BitValue &V = DefC[i]; + // Bits that are already "bottom" should not be updated. + if (V.Type == BitValue::Ref && V.RefI.Reg == RD.Reg) + continue; + // Same for those that are identical in DefC and ResC. + if (V == ResC[i]) + continue; + V = ResC[i]; + Changed = true; + } + if (Changed) + ME.putCell(RD, DefC, Map); + } + if (Changed) + visitUsesOf(RD.Reg); + } +} + + +void BT::visitBranchesFrom(const MachineInstr *BI) { + const MachineBasicBlock &B = *BI->getParent(); + MachineBasicBlock::const_iterator It = BI, End = B.end(); + BranchTargetList Targets, BTs; + bool FallsThrough = true, DefaultToAll = false; + int ThisN = B.getNumber(); + + do { + BTs.clear(); + const MachineInstr *MI = &*It; + if (Trace) + dbgs() << "Visit BR(BB#" << ThisN << "): " << *MI; + assert(MI->isBranch() && "Expecting branch instruction"); + InstrExec.insert(MI); + bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough); + if (!Eval) { + // If the evaluation failed, we will add all targets. Keep going in + // the loop to mark all executable branches as such. + DefaultToAll = true; + FallsThrough = true; + if (Trace) + dbgs() << " failed to evaluate: will add all CFG successors\n"; + } else if (!DefaultToAll) { + // If evaluated successfully add the targets to the cumulative list. + if (Trace) { + dbgs() << " adding targets:"; + for (unsigned i = 0, n = BTs.size(); i < n; ++i) + dbgs() << " BB#" << BTs[i]->getNumber(); + if (FallsThrough) + dbgs() << "\n falls through\n"; + else + dbgs() << "\n does not fall through\n"; + } + Targets.insert(BTs.begin(), BTs.end()); + } + ++It; + } while (FallsThrough && It != End); + + typedef MachineBasicBlock::const_succ_iterator succ_iterator; + if (!DefaultToAll) { + // Need to add all CFG successors that lead to EH landing pads. 
+ // There won't be explicit branches to these blocks, but they must + // be processed. + for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) { + const MachineBasicBlock *SB = *I; + if (SB->isLandingPad()) + Targets.insert(SB); + } + if (FallsThrough) { + MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator Next = std::next(BIt); + if (Next != MF.end()) + Targets.insert(&*Next); + } + } else { + for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) + Targets.insert(*I); + } + + for (unsigned i = 0, n = Targets.size(); i < n; ++i) { + int TargetN = Targets[i]->getNumber(); + FlowQ.push(CFGEdge(ThisN, TargetN)); + } +} + + +void BT::visitUsesOf(unsigned Reg) { + if (Trace) + dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n"; + + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + use_iterator End = MRI.use_nodbg_end(); + for (use_iterator I = MRI.use_nodbg_begin(Reg); I != End; ++I) { + MachineInstr *UseI = I->getParent(); + if (!InstrExec.count(UseI)) + continue; + if (UseI->isPHI()) + visitPHI(UseI); + else if (!UseI->isBranch()) + visitNonBranch(UseI); + else + visitBranchesFrom(UseI); + } +} + + +BT::RegisterCell BT::get(RegisterRef RR) const { + return ME.getCell(RR, Map); +} + + +void BT::put(RegisterRef RR, const RegisterCell &RC) { + ME.putCell(RR, RC, Map); +} + + +// Replace all references to bits from OldRR with the corresponding bits +// in NewRR. +void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { + assert(Map.has(OldRR.Reg) && "OldRR not present in map"); + BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub); + BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub); + uint16_t OMB = OM.first(), OME = OM.last(); + uint16_t NMB = NM.first(), NME = NM.last(); + (void)NME; + assert((OME-OMB == NME-NMB) && + "Substituting registers of different lengths"); + for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { + RegisterCell &RC = I->second; + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + BitValue &V = RC[i]; + if (V.Type != BitValue::Ref || V.RefI.Reg != OldRR.Reg) + continue; + if (V.RefI.Pos < OMB || V.RefI.Pos > OME) + continue; + V.RefI.Reg = NewRR.Reg; + V.RefI.Pos += NMB-OMB; + } + } +} + + +// Check if the block has been "executed" during propagation. (If not, the +// block is dead, but it may still appear to be reachable.) +bool BT::reached(const MachineBasicBlock *B) const { + int BN = B->getNumber(); + assert(BN >= 0); + for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end(); + I != E; ++I) { + if (I->second == BN) + return true; + } + return false; +} + + +void BT::reset() { + EdgeExec.clear(); + InstrExec.clear(); + Map.clear(); +} + + +void BT::run() { + reset(); + assert(FlowQ.empty()); + + typedef GraphTraits<const MachineFunction*> MachineFlowGraphTraits; + const MachineBasicBlock *Entry = MachineFlowGraphTraits::getEntryNode(&MF); + + unsigned MaxBN = 0; + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + assert(I->getNumber() >= 0 && "Disconnected block"); + unsigned BN = I->getNumber(); + if (BN > MaxBN) + MaxBN = BN; + } + + // Keep track of visited blocks. + BitVector BlockScanned(MaxBN+1); + + int EntryN = Entry->getNumber(); + // Generate a fake edge to get something to start with. 
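+  // (Block numbers were asserted to be non-negative above, so the pseudo
+  // predecessor -1 cannot collide with a real CFG edge.)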
+  FlowQ.push(CFGEdge(-1, EntryN));
+
+  while (!FlowQ.empty()) {
+    CFGEdge Edge = FlowQ.front();
+    FlowQ.pop();
+
+    if (EdgeExec.count(Edge))
+      continue;
+    EdgeExec.insert(Edge);
+
+    const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
+    MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
+    // Visit PHI nodes first.
+    while (It != End && It->isPHI()) {
+      const MachineInstr *PI = &*It++;
+      InstrExec.insert(PI);
+      visitPHI(PI);
+    }
+
+    // If this block has already been visited through a flow graph edge,
+    // then the instructions have already been processed. Any updates to
+    // the cells would now only happen through visitUsesOf...
+    if (BlockScanned[Edge.second])
+      continue;
+    BlockScanned[Edge.second] = true;
+
+    // Visit non-branch instructions.
+    while (It != End && !It->isBranch()) {
+      const MachineInstr *MI = &*It++;
+      InstrExec.insert(MI);
+      visitNonBranch(MI);
+    }
+    // If block end has been reached, add the fall-through edge to the queue.
+    if (It == End) {
+      MachineFunction::const_iterator BIt = &B;
+      MachineFunction::const_iterator Next = std::next(BIt);
+      if (Next != MF.end()) {
+        int ThisN = B.getNumber();
+        int NextN = Next->getNumber();
+        FlowQ.push(CFGEdge(ThisN, NextN));
+      }
+    } else {
+      // Handle the remaining sequence of branches. This function will update
+      // the work queue.
+      visitBranchesFrom(It);
+    }
+  } // while (!FlowQ.empty())
+
+  if (Trace) {
+    dbgs() << "Cells after propagation:\n";
+    for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
+      dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
+  }
+}
+
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
new file mode 100644
index 000000000000..ed002a794d66
--- /dev/null
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -0,0 +1,449 @@
+//===--- BitTracker.h -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITTRACKER_H
+#define BITTRACKER_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+  class ConstantInt;
+  class MachineRegisterInfo;
+  class MachineBasicBlock;
+  class MachineInstr;
+  class MachineOperand;
+  class raw_ostream;
+
+struct BitTracker {
+  struct BitRef;
+  struct RegisterRef;
+  struct BitValue;
+  struct BitMask;
+  struct RegisterCell;
+  struct MachineEvaluator;
+
+  typedef SetVector<const MachineBasicBlock *> BranchTargetList;
+
+  struct CellMapType : public std::map<unsigned,RegisterCell> {
+    bool has(unsigned Reg) const;
+  };
+
+  BitTracker(const MachineEvaluator &E, MachineFunction &F);
+  ~BitTracker();
+
+  void run();
+  void trace(bool On = false) { Trace = On; }
+  bool has(unsigned Reg) const;
+  const RegisterCell &lookup(unsigned Reg) const;
+  RegisterCell get(RegisterRef RR) const;
+  void put(RegisterRef RR, const RegisterCell &RC);
+  void subst(RegisterRef OldRR, RegisterRef NewRR);
+  bool reached(const MachineBasicBlock *B) const;
+
+private:
+  void visitPHI(const MachineInstr *PI);
+  void visitNonBranch(const MachineInstr *MI);
+  void visitBranchesFrom(const MachineInstr *BI);
+  void visitUsesOf(unsigned Reg);
+  void reset();
+
+  typedef std::pair<int,int> CFGEdge;
+  typedef std::set<CFGEdge> EdgeSetType;
+  typedef std::set<const MachineInstr *> InstrSetType;
+  typedef std::queue<CFGEdge> EdgeQueueType;
+
+  EdgeSetType EdgeExec;    // Executable flow graph edges.
+  InstrSetType InstrExec;  // Executable instructions.
+  EdgeQueueType FlowQ;     // Work queue of CFG edges.
+  bool Trace;              // Enable tracing for debugging.
+
+  const MachineEvaluator &ME;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CellMapType &Map;
+};
+
+
+// Abstraction of a reference to a bit at position Pos from a register Reg.
+struct BitTracker::BitRef {
+  BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+  BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {}
+  bool operator== (const BitRef &BR) const {
+    // If Reg is 0, disregard Pos.
+    return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
+  }
+  unsigned Reg;
+  uint16_t Pos;
+};
+
+
+// Abstraction of a register reference in MachineOperand. It contains the
+// register number and the subregister index.
+struct BitTracker::RegisterRef {
+  RegisterRef(unsigned R = 0, unsigned S = 0)
+    : Reg(R), Sub(S) {}
+  RegisterRef(const MachineOperand &MO)
+    : Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+  unsigned Reg, Sub;
+};
+
+
+// Value that a single bit can take. This is outside of the context of
+// any register, it is more of an abstraction of the two-element set of
+// possible bit values. One extension here is the "Ref" type, which
+// indicates that this bit takes the same value as the bit described by
+// RefInfo.
+struct BitTracker::BitValue {
+  enum ValueType {
+    Top,    // Bit not yet defined.
+    Zero,   // Bit = 0.
+    One,    // Bit = 1.
+    Ref     // Bit value same as the one described in RefI.
+    // Conceptually, there is no explicit "bottom" value: the lattice's
+    // bottom will be expressed as a "ref to itself", which, in the context
+    // of registers, could be read as "this value of this bit is defined by
+    // this bit".
+    // The ordering is:
+    //   x <= Top,
+    //   Self <= x, where "Self" is "ref to itself".
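+    // Under this ordering, meet(Zero, One) drops to Self (bottom), while
+    // meet(x, Top) leaves x unchanged; see the meet() member below.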
+ // This makes the value lattice different for each virtual register + // (even for each bit in the same virtual register), since the "bottom" + // for one register will be a simple "ref" for another register. + // Since we do not store the "Self" bit and register number, the meet + // operation will need to take it as a parameter. + // + // In practice there is a special case for values that are not associa- + // ted with any specific virtual register. An example would be a value + // corresponding to a bit of a physical register, or an intermediate + // value obtained in some computation (such as instruction evaluation). + // Such cases are identical to the usual Ref type, but the register + // number is 0. In such case the Pos field of the reference is ignored. + // + // What is worthy of notice is that in value V (that is a "ref"), as long + // as the RefI.Reg is not 0, it may actually be the same register as the + // one in which V will be contained. If the RefI.Pos refers to the posi- + // tion of V, then V is assumed to be "bottom" (as a "ref to itself"), + // otherwise V is taken to be identical to the referenced bit of the + // same register. + // If RefI.Reg is 0, however, such a reference to the same register is + // not possible. Any value V that is a "ref", and whose RefI.Reg is 0 + // is treated as "bottom". + }; + ValueType Type; + BitRef RefI; + + BitValue(ValueType T = Top) : Type(T) {} + BitValue(bool B) : Type(B ? One : Zero) {} + BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {} + BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {} + + bool operator== (const BitValue &V) const { + if (Type != V.Type) + return false; + if (Type == Ref && !(RefI == V.RefI)) + return false; + return true; + } + bool operator!= (const BitValue &V) const { + return !operator==(V); + } + bool is(unsigned T) const { + assert(T == 0 || T == 1); + return T == 0 ? Type == Zero + : (T == 1 ? Type == One : false); + } + + // The "meet" operation is the "." operation in a semilattice (L, ., T, B): + // (1) x.x = x + // (2) x.y = y.x + // (3) x.(y.z) = (x.y).z + // (4) x.T = x (i.e. T = "top") + // (5) x.B = B (i.e. B = "bottom") + // + // This "meet" function will update the value of the "*this" object with + // the newly calculated one, and return "true" if the value of *this has + // changed, and "false" otherwise. + // To prove that it satisfies the conditions (1)-(5), it is sufficient + // to show that a relation + // x <= y <=> x.y = x + // defines a partial order (i.e. that "meet" is same as "infimum"). + bool meet(const BitValue &V, const BitRef &Self) { + // First, check the cases where there is nothing to be done. + if (Type == Ref && RefI == Self) // Bottom.meet(V) = Bottom (i.e. This) + return false; + if (V.Type == Top) // This.meet(Top) = This + return false; + if (*this == V) // This.meet(This) = This + return false; + + // At this point, we know that the value of "this" will change. + // If it is Top, it will become the same as V, otherwise it will + // become "bottom" (i.e. Self). + if (Type == Top) { + Type = V.Type; + RefI = V.RefI; // This may be irrelevant, but copy anyway. + return true; + } + // Become "bottom". + Type = Ref; + RefI = Self; + return true; + } + + // Create a reference to the bit value V. + static BitValue ref(const BitValue &V); + // Create a "self". 
+ static BitValue self(const BitRef &Self = BitRef()); + + bool num() const { + return Type == Zero || Type == One; + } + operator bool() const { + assert(Type == Zero || Type == One); + return Type == One; + } + + friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV); +}; + + +// This operation must be idempotent, i.e. ref(ref(V)) == ref(V). +inline BitTracker::BitValue +BitTracker::BitValue::ref(const BitValue &V) { + if (V.Type != Ref) + return BitValue(V.Type); + if (V.RefI.Reg != 0) + return BitValue(V.RefI.Reg, V.RefI.Pos); + return self(); +} + + +inline BitTracker::BitValue +BitTracker::BitValue::self(const BitRef &Self) { + return BitValue(Self.Reg, Self.Pos); +} + + +// A sequence of bits starting from index B up to and including index E. +// If E < B, the mask represents two sections: [0..E] and [B..W) where +// W is the width of the register. +struct BitTracker::BitMask { + BitMask() : B(0), E(0) {} + BitMask(uint16_t b, uint16_t e) : B(b), E(e) {} + uint16_t first() const { return B; } + uint16_t last() const { return E; } +private: + uint16_t B, E; +}; + + +// Representation of a register: a list of BitValues. +struct BitTracker::RegisterCell { + RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {} + + uint16_t width() const { + return Bits.size(); + } + const BitValue &operator[](uint16_t BitN) const { + assert(BitN < Bits.size()); + return Bits[BitN]; + } + BitValue &operator[](uint16_t BitN) { + assert(BitN < Bits.size()); + return Bits[BitN]; + } + + bool meet(const RegisterCell &RC, unsigned SelfR); + RegisterCell &insert(const RegisterCell &RC, const BitMask &M); + RegisterCell extract(const BitMask &M) const; // Returns a new cell. + RegisterCell &rol(uint16_t Sh); // Rotate left. + RegisterCell &fill(uint16_t B, uint16_t E, const BitValue &V); + RegisterCell &cat(const RegisterCell &RC); // Concatenate. + uint16_t cl(bool B) const; + uint16_t ct(bool B) const; + + bool operator== (const RegisterCell &RC) const; + bool operator!= (const RegisterCell &RC) const { + return !operator==(RC); + } + + const RegisterCell &operator=(const RegisterCell &RC) { + Bits = RC.Bits; + return *this; + } + + // Generate a "ref" cell for the corresponding register. In the resulting + // cell each bit will be described as being the same as the corresponding + // bit in register Reg (i.e. the cell is "defined" by register Reg). + static RegisterCell self(unsigned Reg, uint16_t Width); + // Generate a "top" cell of given size. + static RegisterCell top(uint16_t Width); + // Generate a cell that is a "ref" to another cell. + static RegisterCell ref(const RegisterCell &C); + +private: + // The DefaultBitN is here only to avoid frequent reallocation of the + // memory in the vector. 
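+  // (Cells wider than DefaultBitN still work: the SmallVector simply
+  // spills to the heap beyond its inline capacity.)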
+ static const unsigned DefaultBitN = 32; + typedef SmallVector<BitValue, DefaultBitN> BitValueList; + BitValueList Bits; + + friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC); +}; + + +inline bool BitTracker::has(unsigned Reg) const { + return Map.find(Reg) != Map.end(); +} + + +inline const BitTracker::RegisterCell& +BitTracker::lookup(unsigned Reg) const { + CellMapType::const_iterator F = Map.find(Reg); + assert(F != Map.end()); + return F->second; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) { + RegisterCell RC(Width); + for (uint16_t i = 0; i < Width; ++i) + RC.Bits[i] = BitValue::self(BitRef(Reg, i)); + return RC; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::top(uint16_t Width) { + RegisterCell RC(Width); + for (uint16_t i = 0; i < Width; ++i) + RC.Bits[i] = BitValue(BitValue::Top); + return RC; +} + + +inline BitTracker::RegisterCell +BitTracker::RegisterCell::ref(const RegisterCell &C) { + uint16_t W = C.width(); + RegisterCell RC(W); + for (unsigned i = 0; i < W; ++i) + RC[i] = BitValue::ref(C[i]); + return RC; +} + + +inline bool BitTracker::CellMapType::has(unsigned Reg) const { + return find(Reg) != end(); +} + +// A class to evaluate target's instructions and update the cell maps. +// This is used internally by the bit tracker. A target that wants to +// utilize this should implement the evaluation functions (noted below) +// in a subclass of this class. +struct BitTracker::MachineEvaluator { + MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M) + : TRI(T), MRI(M) {} + virtual ~MachineEvaluator() {} + + uint16_t getRegBitWidth(const RegisterRef &RR) const; + + RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const; + void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const; + // A result of any operation should use refs to the source cells, not + // the cells directly. This function is a convenience wrapper to quickly + // generate a ref for a cell corresponding to a register reference. + RegisterCell getRef(const RegisterRef &RR, const CellMapType &M) const { + RegisterCell RC = getCell(RR, M); + return RegisterCell::ref(RC); + } + + // Helper functions. + // Check if a cell is an immediate value (i.e. all bits are either 0 or 1). + bool isInt(const RegisterCell &A) const; + // Convert cell to an immediate value. + uint64_t toInt(const RegisterCell &A) const; + + // Generate cell from an immediate value. + RegisterCell eIMM(int64_t V, uint16_t W) const; + RegisterCell eIMM(const ConstantInt *CI) const; + + // Arithmetic. + RegisterCell eADD(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eSUB(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eMLS(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eMLU(const RegisterCell &A1, const RegisterCell &A2) const; + + // Shifts. + RegisterCell eASL(const RegisterCell &A1, uint16_t Sh) const; + RegisterCell eLSR(const RegisterCell &A1, uint16_t Sh) const; + RegisterCell eASR(const RegisterCell &A1, uint16_t Sh) const; + + // Logical. + RegisterCell eAND(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eORL(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eXOR(const RegisterCell &A1, const RegisterCell &A2) const; + RegisterCell eNOT(const RegisterCell &A1) const; + + // Set bit, clear bit. 
+ RegisterCell eSET(const RegisterCell &A1, uint16_t BitN) const; + RegisterCell eCLR(const RegisterCell &A1, uint16_t BitN) const; + + // Count leading/trailing bits (zeros/ones). + RegisterCell eCLB(const RegisterCell &A1, bool B, uint16_t W) const; + RegisterCell eCTB(const RegisterCell &A1, bool B, uint16_t W) const; + + // Sign/zero extension. + RegisterCell eSXT(const RegisterCell &A1, uint16_t FromN) const; + RegisterCell eZXT(const RegisterCell &A1, uint16_t FromN) const; + + // Extract/insert + // XTR R,b,e: extract bits from A1 starting at bit b, ending at e-1. + // INS R,S,b: take R and replace bits starting from b with S. + RegisterCell eXTR(const RegisterCell &A1, uint16_t B, uint16_t E) const; + RegisterCell eINS(const RegisterCell &A1, const RegisterCell &A2, + uint16_t AtN) const; + + // User-provided functions for individual targets: + + // Return a sub-register mask that indicates which bits in Reg belong + // to the subregister Sub. These bits are assumed to be contiguous in + // the super-register, and have the same ordering in the sub-register + // as in the super-register. It is valid to call this function with + // Sub == 0, in this case, the function should return a mask that spans + // the entire register Reg (which is what the default implementation + // does). + virtual BitMask mask(unsigned Reg, unsigned Sub) const; + // Indicate whether a given register class should be tracked. + virtual bool track(const TargetRegisterClass *RC) const { return true; } + // Evaluate a non-branching machine instruction, given the cell map with + // the input values. Place the results in the Outputs map. Return "true" + // if evaluation succeeded, "false" otherwise. + virtual bool evaluate(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + // Evaluate a branch, given the cell map with the input values. Fill out + // a list of all possible branch targets and indicate (through a flag) + // whether the branch could fall-through. Return "true" if this information + // has been successfully computed, "false" otherwise. + virtual bool evaluate(const MachineInstr *BI, const CellMapType &Inputs, + BranchTargetList &Targets, bool &FallsThru) const = 0; + + const TargetRegisterInfo &TRI; + MachineRegisterInfo &MRI; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index 758ccc741007..7ab2f0ba01df 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -12,13 +12,19 @@ tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(HexagonCommonTableGen) add_llvm_target(HexagonCodeGen + BitTracker.cpp HexagonAsmPrinter.cpp + HexagonBitTracker.cpp HexagonCFGOptimizer.cpp + HexagonCommonGEP.cpp HexagonCopyToCombine.cpp HexagonExpandCondsets.cpp HexagonExpandPredSpillCode.cpp HexagonFixupHwLoops.cpp HexagonFrameLowering.cpp + HexagonGenExtract.cpp + HexagonGenInsert.cpp + HexagonGenPredicate.cpp HexagonHardwareLoops.cpp HexagonInstrInfo.cpp HexagonISelDAGToDAG.cpp diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp new file mode 100644 index 000000000000..021e58a1d08a --- /dev/null +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -0,0 +1,1174 @@ +//===--- HexagonBitTracker.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include "Hexagon.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +using namespace llvm; + +typedef BitTracker BT; + +HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri, + MachineRegisterInfo &mri, + const HexagonInstrInfo &tii, + MachineFunction &mf) + : MachineEvaluator(tri, mri), MF(mf), MFI(*mf.getFrameInfo()), TII(tii) { + // Populate the VRX map (VR to extension-type). + // Go over all the formal parameters of the function. If a given parameter + // P is sign- or zero-extended, locate the virtual register holding that + // parameter and create an entry in the VRX map indicating the type of ex- + // tension (and the source type). + // This is a bit complicated to do accurately, since the memory layout in- + // formation is necessary to precisely determine whether an aggregate para- + // meter will be passed in a register or in memory. What is given in MRI + // is the association between the physical register that is live-in (i.e. + // holds an argument), and the virtual register that this value will be + // copied into. This, by itself, is not sufficient to map back the virtual + // register to a formal parameter from Function (since consecutive live-ins + // from MRI may not correspond to consecutive formal parameters from Func- + // tion). To avoid the complications with in-memory arguments, only consi- + // der the initial sequence of formal parameters that are known to be + // passed via registers. + unsigned AttrIdx = 0; + unsigned InVirtReg, InPhysReg = 0; + const Function &F = *MF.getFunction(); + typedef Function::const_arg_iterator arg_iterator; + for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { + AttrIdx++; + const Argument &Arg = *I; + Type *ATy = Arg.getType(); + unsigned Width = 0; + if (ATy->isIntegerTy()) + Width = ATy->getIntegerBitWidth(); + else if (ATy->isPointerTy()) + Width = 32; + // If pointer size is not set through target data, it will default to + // Module::AnyPointerSize. + if (Width == 0 || Width > 64) + break; + InPhysReg = getNextPhysReg(InPhysReg, Width); + if (!InPhysReg) + break; + InVirtReg = getVirtRegFor(InPhysReg); + if (!InVirtReg) + continue; + AttributeSet Attrs = F.getAttributes(); + if (Attrs.hasAttribute(AttrIdx, Attribute::SExt)) + VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width))); + else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt)) + VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width))); + } +} + + +BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { + if (Sub == 0) + return MachineEvaluator::mask(Reg, 0); + using namespace Hexagon; + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + unsigned ID = RC->getID(); + uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); + switch (ID) { + case DoubleRegsRegClassID: + return (Sub == subreg_loreg) ? 
BT::BitMask(0, RW-1) + : BT::BitMask(RW, 2*RW-1); + default: + break; + } +#ifndef NDEBUG + dbgs() << PrintReg(Reg, &TRI, Sub) << '\n'; +#endif + llvm_unreachable("Unexpected register/subregister"); +} + + +namespace { + struct RegisterRefs : public std::vector<BT::RegisterRef> { + typedef std::vector<BT::RegisterRef> Base; + RegisterRefs(const MachineInstr *MI); + const BT::RegisterRef &operator[](unsigned n) const { + // The main purpose of this operator is to assert with bad argument. + assert(n < size()); + return Base::operator[](n); + } + }; + + RegisterRefs::RegisterRefs(const MachineInstr *MI) + : Base(MI->getNumOperands()) { + for (unsigned i = 0, n = size(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) + at(i) = BT::RegisterRef(MO); + // For indices that don't correspond to registers, the entry will + // remain constructed via the default constructor. + } + } +} + + +bool HexagonEvaluator::evaluate(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + unsigned NumDefs = 0; + + // Sanity verification: there should not be any defs with subregisters. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + NumDefs++; + assert(MO.getSubReg() == 0); + } + + if (NumDefs == 0) + return false; + + if (MI->mayLoad()) + return evaluateLoad(MI, Inputs, Outputs); + + // Check COPY instructions that copy formal parameters into virtual + // registers. Such parameters can be sign- or zero-extended at the + // call site, and we should take advantage of this knowledge. The MRI + // keeps a list of pairs of live-in physical and virtual registers, + // which provides information about which virtual registers will hold + // the argument values. The function will still contain instructions + // defining those virtual registers, and in practice those are COPY + // instructions from a physical to a virtual register. In such cases, + // applying the argument extension to the virtual register can be seen + // as simply mirroring the extension that had already been applied to + // the physical register at the call site. If the defining instruction + // was not a COPY, it would not be clear how to mirror that extension + // on the callee's side. For that reason, only check COPY instructions + // for potential extensions. + if (MI->isCopy()) { + if (evaluateFormalCopy(MI, Inputs, Outputs)) + return true; + } + + // Beyond this point, if any operand is a global, skip that instruction. + // The reason is that certain instructions that can take an immediate + // operand can also have a global symbol in that operand. To avoid + // checking what kind of operand a given instruction has individually + // for each instruction, do it here. Global symbols as operands gene- + // rally do not provide any useful information. + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() || + MO.isCPI()) + return false; + } + + RegisterRefs Reg(MI); + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + #define op(i) MI->getOperand(i) + #define rc(i) RegisterCell::ref(getCell(Reg[i],Inputs)) + #define im(i) MI->getOperand(i).getImm() + + // If the instruction has no register operands, skip it. + if (Reg.size() == 0) + return false; + + // Record result for register in operand 0. 
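+  // (Nearly every case in the switch below returns through this helper.)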
+  auto rr0 = [this,Reg] (const BT::RegisterCell &Val, CellMapType &Outputs)
+        -> bool {
+    putCell(Reg[0], Val, Outputs);
+    return true;
+  };
+  // Get the cell corresponding to the N-th operand.
+  auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W)
+        -> BT::RegisterCell {
+    const MachineOperand &Op = MI->getOperand(N);
+    if (Op.isImm())
+      return eIMM(Op.getImm(), W);
+    if (!Op.isReg())
+      return RegisterCell::self(0, W);
+    assert(getRegBitWidth(Reg[N]) == W && "Register width mismatch");
+    return rc(N);
+  };
+  // Extract RW low bits of the cell.
+  auto lo = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    assert(RW <= RC.width());
+    return eXTR(RC, 0, RW);
+  };
+  // Extract RW high bits of the cell.
+  auto hi = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    uint16_t W = RC.width();
+    assert(RW <= W);
+    return eXTR(RC, W-RW, W);
+  };
+  // Extract N-th halfword (counting from the least significant position).
+  auto half = [this] (const BT::RegisterCell &RC, unsigned N)
+        -> BT::RegisterCell {
+    assert(N*16+16 <= RC.width());
+    return eXTR(RC, N*16, N*16+16);
+  };
+  // Shuffle bits (pick even/odd from cells and merge into result).
+  auto shuffle = [this] (const BT::RegisterCell &Rs, const BT::RegisterCell &Rt,
+                         uint16_t BW, bool Odd) -> BT::RegisterCell {
+    uint16_t I = Odd, Ws = Rs.width();
+    assert(Ws == Rt.width());
+    RegisterCell RC = eXTR(Rt, I*BW, I*BW+BW).cat(eXTR(Rs, I*BW, I*BW+BW));
+    I += 2;
+    while (I*BW < Ws) {
+      RC.cat(eXTR(Rt, I*BW, I*BW+BW)).cat(eXTR(Rs, I*BW, I*BW+BW));
+      I += 2;
+    }
+    return RC;
+  };
+
+  // The bitwidth of the 0th operand. In most (if not all) of the
+  // instructions below, the 0th operand is the defined register.
+  // Pre-compute the bitwidth here, because it is needed in many
+  // cases below.
+  uint16_t W0 = (Reg[0].Reg != 0) ?
getRegBitWidth(Reg[0]) : 0; + + switch (Opc) { + // Transfer immediate: + + case A2_tfrsi: + case A2_tfrpi: + case CONST32: + case CONST32_Float_Real: + case CONST32_Int_Real: + case CONST64_Float_Real: + case CONST64_Int_Real: + return rr0(eIMM(im(1), W0), Outputs); + case TFR_PdFalse: + return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::Zero), Outputs); + case TFR_PdTrue: + return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::One), Outputs); + case TFR_FI: { + int FI = op(1).getIndex(); + int Off = op(2).getImm(); + unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off); + unsigned L = Log2_32(A); + RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); + RC.fill(0, L, BT::BitValue::Zero); + return rr0(RC, Outputs); + } + + // Transfer register: + + case A2_tfr: + case A2_tfrp: + case C2_pxfer_map: + return rr0(rc(1), Outputs); + case C2_tfrpr: { + uint16_t RW = W0; + uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + assert(PW <= RW); + RegisterCell PC = eXTR(rc(1), 0, PW); + RegisterCell RC = RegisterCell(RW).insert(PC, BT::BitMask(0, PW-1)); + RC.fill(PW, RW, BT::BitValue::Zero); + return rr0(RC, Outputs); + } + case C2_tfrrp: { + RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); + W0 = 8; // XXX Pred size + return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs); + } + + // Arithmetic: + + case A2_abs: + case A2_absp: + // TODO + break; + + case A2_addsp: { + uint16_t W1 = getRegBitWidth(Reg[1]); + assert(W0 == 64 && W1 == 32); + RegisterCell CW = RegisterCell(W0).insert(rc(1), BT::BitMask(0, W1-1)); + RegisterCell RC = eADD(eSXT(CW, W1), rc(2)); + return rr0(RC, Outputs); + } + case A2_add: + case A2_addp: + return rr0(eADD(rc(1), rc(2)), Outputs); + case A2_addi: + return rr0(eADD(rc(1), eIMM(im(2), W0)), Outputs); + case S4_addi_asl_ri: { + RegisterCell RC = eADD(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_addi_lsr_ri: { + RegisterCell RC = eADD(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_addaddi: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M4_mpyri_addi: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyrr_addi: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyri_addr_u2: { + RegisterCell M = eMLS(eIMM(im(2), W0), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyri_addr: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M4_mpyrr_addr: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case S4_subaddi: { + RegisterCell RC = eADD(rc(1), eSUB(eIMM(im(2), W0), rc(3))); + return rr0(RC, Outputs); + } + case M2_accii: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M2_acci: { + RegisterCell RC = eADD(rc(1), eADD(rc(2), rc(3))); + return rr0(RC, Outputs); + } + case M2_subacc: { + RegisterCell RC = eADD(rc(1), eSUB(rc(2), rc(3))); + return rr0(RC, Outputs); + } + case S2_addasl_rrri: { + RegisterCell RC = eADD(rc(1), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case C4_addipc: { + RegisterCell RPC = RegisterCell::self(Reg[0].Reg, W0); + RPC.fill(0, 2, BT::BitValue::Zero); + return rr0(eADD(RPC, 
eIMM(im(2), W0)), Outputs); + } + case A2_sub: + case A2_subp: + return rr0(eSUB(rc(1), rc(2)), Outputs); + case A2_subri: + return rr0(eSUB(eIMM(im(1), W0), rc(2)), Outputs); + case S4_subi_asl_ri: { + RegisterCell RC = eSUB(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_subi_lsr_ri: { + RegisterCell RC = eSUB(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M2_naccii: { + RegisterCell RC = eSUB(rc(1), eADD(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M2_nacci: { + RegisterCell RC = eSUB(rc(1), eADD(rc(2), rc(3))); + return rr0(RC, Outputs); + } + // 32-bit negation is done by "Rd = A2_subri 0, Rs" + case A2_negp: + return rr0(eSUB(eIMM(0, W0), rc(1)), Outputs); + + case M2_mpy_up: { + RegisterCell M = eMLS(rc(1), rc(2)); + return rr0(hi(M, W0), Outputs); + } + case M2_dpmpyss_s0: + return rr0(eMLS(rc(1), rc(2)), Outputs); + case M2_dpmpyss_acc_s0: + return rr0(eADD(rc(1), eMLS(rc(2), rc(3))), Outputs); + case M2_dpmpyss_nac_s0: + return rr0(eSUB(rc(1), eMLS(rc(2), rc(3))), Outputs); + case M2_mpyi: { + RegisterCell M = eMLS(rc(1), rc(2)); + return rr0(lo(M, W0), Outputs); + } + case M2_macsip: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_macsin: { + RegisterCell M = eMLS(rc(2), eIMM(im(3), W0)); + RegisterCell RC = eSUB(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_maci: { + RegisterCell M = eMLS(rc(2), rc(3)); + RegisterCell RC = eADD(rc(1), lo(M, W0)); + return rr0(RC, Outputs); + } + case M2_mpysmi: { + RegisterCell M = eMLS(rc(1), eIMM(im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpysin: { + RegisterCell M = eMLS(rc(1), eIMM(-im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpysip: { + RegisterCell M = eMLS(rc(1), eIMM(im(2), W0)); + return rr0(lo(M, 32), Outputs); + } + case M2_mpyu_up: { + RegisterCell M = eMLU(rc(1), rc(2)); + return rr0(hi(M, W0), Outputs); + } + case M2_dpmpyuu_s0: + return rr0(eMLU(rc(1), rc(2)), Outputs); + case M2_dpmpyuu_acc_s0: + return rr0(eADD(rc(1), eMLU(rc(2), rc(3))), Outputs); + case M2_dpmpyuu_nac_s0: + return rr0(eSUB(rc(1), eMLU(rc(2), rc(3))), Outputs); + //case M2_mpysu_up: + + // Logical/bitwise: + + case A2_andir: + return rr0(eAND(rc(1), eIMM(im(2), W0)), Outputs); + case A2_and: + case A2_andp: + return rr0(eAND(rc(1), rc(2)), Outputs); + case A4_andn: + case A4_andnp: + return rr0(eAND(rc(1), eNOT(rc(2))), Outputs); + case S4_andi_asl_ri: { + RegisterCell RC = eAND(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_andi_lsr_ri: { + RegisterCell RC = eAND(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M4_and_and: + return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_and_andn: + return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case M4_and_or: + return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_and_xor: + return rr0(eAND(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_orir: + return rr0(eORL(rc(1), eIMM(im(2), W0)), Outputs); + case A2_or: + case A2_orp: + return rr0(eORL(rc(1), rc(2)), Outputs); + case A4_orn: + case A4_ornp: + return rr0(eORL(rc(1), eNOT(rc(2))), Outputs); + case S4_ori_asl_ri: { + RegisterCell RC = eORL(eIMM(im(1), W0), eASL(rc(2), im(3))); + return rr0(RC, Outputs); + } + case S4_ori_lsr_ri: { + RegisterCell RC = eORL(eIMM(im(1), W0), eLSR(rc(2), im(3))); + return rr0(RC, Outputs); + } + case M4_or_and: + return 
rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_or_andn: + return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case S4_or_andi: + case S4_or_andix: { + RegisterCell RC = eORL(rc(1), eAND(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case S4_or_ori: { + RegisterCell RC = eORL(rc(1), eORL(rc(2), eIMM(im(3), W0))); + return rr0(RC, Outputs); + } + case M4_or_or: + return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_or_xor: + return rr0(eORL(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_xor: + case A2_xorp: + return rr0(eXOR(rc(1), rc(2)), Outputs); + case M4_xor_and: + return rr0(eXOR(rc(1), eAND(rc(2), rc(3))), Outputs); + case M4_xor_andn: + return rr0(eXOR(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case M4_xor_or: + return rr0(eXOR(rc(1), eORL(rc(2), rc(3))), Outputs); + case M4_xor_xacc: + return rr0(eXOR(rc(1), eXOR(rc(2), rc(3))), Outputs); + case A2_not: + case A2_notp: + return rr0(eNOT(rc(1)), Outputs); + + case S2_asl_i_r: + case S2_asl_i_p: + return rr0(eASL(rc(1), im(2)), Outputs); + case A2_aslh: + return rr0(eASL(rc(1), 16), Outputs); + case S2_asl_i_r_acc: + case S2_asl_i_p_acc: + return rr0(eADD(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_nac: + case S2_asl_i_p_nac: + return rr0(eSUB(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_and: + case S2_asl_i_p_and: + return rr0(eAND(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_or: + case S2_asl_i_p_or: + return rr0(eORL(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_r_xacc: + case S2_asl_i_p_xacc: + return rr0(eXOR(rc(1), eASL(rc(2), im(3))), Outputs); + case S2_asl_i_vh: + case S2_asl_i_vw: + // TODO + break; + + case S2_asr_i_r: + case S2_asr_i_p: + return rr0(eASR(rc(1), im(2)), Outputs); + case A2_asrh: + return rr0(eASR(rc(1), 16), Outputs); + case S2_asr_i_r_acc: + case S2_asr_i_p_acc: + return rr0(eADD(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_nac: + case S2_asr_i_p_nac: + return rr0(eSUB(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_and: + case S2_asr_i_p_and: + return rr0(eAND(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_or: + case S2_asr_i_p_or: + return rr0(eORL(rc(1), eASR(rc(2), im(3))), Outputs); + case S2_asr_i_r_rnd: { + // The input is first sign-extended to 64 bits, then the output + // is truncated back to 32 bits. 
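+      // A worked example of the expression below (not in the original
+      // comment): it computes ((Rs asr #u5) + 1) asr 1 on the sign-extended
+      // input, e.g. for Rs = 6, #u5 = 1: (3 + 1) asr 1 = 2, i.e. 6/4
+      // rounded to nearest.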
+ assert(W0 == 32); + RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0); + RegisterCell RC = eASR(eADD(eASR(XC, im(2)), eIMM(1, 2*W0)), 1); + return rr0(eXTR(RC, 0, W0), Outputs); + } + case S2_asr_i_r_rnd_goodsyntax: { + int64_t S = im(2); + if (S == 0) + return rr0(rc(1), Outputs); + // Result: S2_asr_i_r_rnd Rs, u5-1 + RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0); + RegisterCell RC = eLSR(eADD(eASR(XC, S-1), eIMM(1, 2*W0)), 1); + return rr0(eXTR(RC, 0, W0), Outputs); + } + case S2_asr_r_vh: + case S2_asr_i_vw: + case S2_asr_i_svw_trun: + // TODO + break; + + case S2_lsr_i_r: + case S2_lsr_i_p: + return rr0(eLSR(rc(1), im(2)), Outputs); + case S2_lsr_i_r_acc: + case S2_lsr_i_p_acc: + return rr0(eADD(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_nac: + case S2_lsr_i_p_nac: + return rr0(eSUB(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_and: + case S2_lsr_i_p_and: + return rr0(eAND(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_or: + case S2_lsr_i_p_or: + return rr0(eORL(rc(1), eLSR(rc(2), im(3))), Outputs); + case S2_lsr_i_r_xacc: + case S2_lsr_i_p_xacc: + return rr0(eXOR(rc(1), eLSR(rc(2), im(3))), Outputs); + + case S2_clrbit_i: { + RegisterCell RC = rc(1); + RC[im(2)] = BT::BitValue::Zero; + return rr0(RC, Outputs); + } + case S2_setbit_i: { + RegisterCell RC = rc(1); + RC[im(2)] = BT::BitValue::One; + return rr0(RC, Outputs); + } + case S2_togglebit_i: { + RegisterCell RC = rc(1); + uint16_t BX = im(2); + RC[BX] = RC[BX].is(0) ? BT::BitValue::One + : RC[BX].is(1) ? BT::BitValue::Zero + : BT::BitValue::self(); + return rr0(RC, Outputs); + } + + case A4_bitspliti: { + uint16_t W1 = getRegBitWidth(Reg[1]); + uint16_t BX = im(2); + // Res.uw[1] = Rs[bx+1:], Res.uw[0] = Rs[0:bx] + const BT::BitValue Zero = BT::BitValue::Zero; + RegisterCell RZ = RegisterCell(W0).fill(BX, W1, Zero) + .fill(W1+(W1-BX), W0, Zero); + RegisterCell BF1 = eXTR(rc(1), 0, BX), BF2 = eXTR(rc(1), BX, W1); + RegisterCell RC = eINS(eINS(RZ, BF1, 0), BF2, W1); + return rr0(RC, Outputs); + } + case S4_extract: + case S4_extractp: + case S2_extractu: + case S2_extractup: { + uint16_t Wd = im(2), Of = im(3); + assert(Wd <= W0); + if (Wd == 0) + return rr0(eIMM(0, W0), Outputs); + // If the width extends beyond the register size, pad the register + // with 0 bits. + RegisterCell Pad = (Wd+Of > W0) ? rc(1).cat(eIMM(0, Wd+Of-W0)) : rc(1); + RegisterCell Ext = eXTR(Pad, Of, Wd+Of); + // Ext is short, need to extend it with 0s or sign bit. + RegisterCell RC = RegisterCell(W0).insert(Ext, BT::BitMask(0, Wd-1)); + if (Opc == S2_extractu || Opc == S2_extractup) + return rr0(eZXT(RC, Wd), Outputs); + return rr0(eSXT(RC, Wd), Outputs); + } + case S2_insert: + case S2_insertp: { + uint16_t Wd = im(3), Of = im(4); + assert(Wd < W0 && Of < W0); + // If Wd+Of exceeds W0, the inserted bits are truncated. 
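+      // For example, with W0 = 32, Wd = 8 and Of = 28, only the low
+      // 32-28 = 4 bits of the source are inserted.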
+ if (Wd+Of > W0) + Wd = W0-Of; + if (Wd == 0) + return rr0(rc(1), Outputs); + return rr0(eINS(rc(1), eXTR(rc(2), 0, Wd), Of), Outputs); + } + + // Bit permutations: + + case A2_combineii: + case A4_combineii: + case A4_combineir: + case A4_combineri: + case A2_combinew: + assert(W0 % 2 == 0); + return rr0(cop(2, W0/2).cat(cop(1, W0/2)), Outputs); + case A2_combine_ll: + case A2_combine_lh: + case A2_combine_hl: + case A2_combine_hh: { + assert(W0 == 32); + assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32); + // Low half in the output is 0 for _ll and _hl, 1 otherwise: + unsigned LoH = !(Opc == A2_combine_ll || Opc == A2_combine_hl); + // High half in the output is 0 for _ll and _lh, 1 otherwise: + unsigned HiH = !(Opc == A2_combine_ll || Opc == A2_combine_lh); + RegisterCell R1 = rc(1); + RegisterCell R2 = rc(2); + RegisterCell RC = half(R2, LoH).cat(half(R1, HiH)); + return rr0(RC, Outputs); + } + case S2_packhl: { + assert(W0 == 64); + assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32); + RegisterCell R1 = rc(1); + RegisterCell R2 = rc(2); + RegisterCell RC = half(R2, 0).cat(half(R1, 0)).cat(half(R2, 1)) + .cat(half(R1, 1)); + return rr0(RC, Outputs); + } + case S2_shuffeb: { + RegisterCell RC = shuffle(rc(1), rc(2), 8, false); + return rr0(RC, Outputs); + } + case S2_shuffeh: { + RegisterCell RC = shuffle(rc(1), rc(2), 16, false); + return rr0(RC, Outputs); + } + case S2_shuffob: { + RegisterCell RC = shuffle(rc(1), rc(2), 8, true); + return rr0(RC, Outputs); + } + case S2_shuffoh: { + RegisterCell RC = shuffle(rc(1), rc(2), 16, true); + return rr0(RC, Outputs); + } + case C2_mask: { + uint16_t WR = W0; + uint16_t WP = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + assert(WR == 64 && WP == 8); + RegisterCell R1 = rc(1); + RegisterCell RC(WR); + for (uint16_t i = 0; i < WP; ++i) { + const BT::BitValue &V = R1[i]; + BT::BitValue F = (V.is(0) || V.is(1)) ? V : BT::BitValue::self(); + RC.fill(i*8, i*8+8, F); + } + return rr0(RC, Outputs); + } + + // Mux: + + case C2_muxii: + case C2_muxir: + case C2_muxri: + case C2_mux: { + BT::BitValue PC0 = rc(1)[0]; + RegisterCell R2 = cop(2, W0); + RegisterCell R3 = cop(3, W0); + if (PC0.is(0) || PC0.is(1)) + return rr0(RegisterCell::ref(PC0 ? R2 : R3), Outputs); + R2.meet(R3, Reg[0].Reg); + return rr0(R2, Outputs); + } + case C2_vmux: + // TODO + break; + + // Sign- and zero-extension: + + case A2_sxtb: + return rr0(eSXT(rc(1), 8), Outputs); + case A2_sxth: + return rr0(eSXT(rc(1), 16), Outputs); + case A2_sxtw: { + uint16_t W1 = getRegBitWidth(Reg[1]); + assert(W0 == 64 && W1 == 32); + RegisterCell RC = eSXT(rc(1).cat(eIMM(0, W1)), W1); + return rr0(RC, Outputs); + } + case A2_zxtb: + return rr0(eZXT(rc(1), 8), Outputs); + case A2_zxth: + return rr0(eZXT(rc(1), 16), Outputs); + + // Bit count: + + case S2_cl0: + case S2_cl0p: + // Always produce a 32-bit result. 
+ return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs); + case S2_cl1: + case S2_cl1p: + return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs); + case S2_clb: + case S2_clbp: { + uint16_t W1 = getRegBitWidth(Reg[1]); + RegisterCell R1 = rc(1); + BT::BitValue TV = R1[W1-1]; + if (TV.is(0) || TV.is(1)) + return rr0(eCLB(R1, TV, 32), Outputs); + break; + } + case S2_ct0: + case S2_ct0p: + return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs); + case S2_ct1: + case S2_ct1p: + return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs); + case S5_popcountp: + // TODO + break; + + case C2_all8: { + RegisterCell P1 = rc(1); + bool Has0 = false, All1 = true; + for (uint16_t i = 0; i < 8/*XXX*/; ++i) { + if (!P1[i].is(1)) + All1 = false; + if (!P1[i].is(0)) + continue; + Has0 = true; + break; + } + if (!Has0 && !All1) + break; + RegisterCell RC(W0); + RC.fill(0, W0, (All1 ? BT::BitValue::One : BT::BitValue::Zero)); + return rr0(RC, Outputs); + } + case C2_any8: { + RegisterCell P1 = rc(1); + bool Has1 = false, All0 = true; + for (uint16_t i = 0; i < 8/*XXX*/; ++i) { + if (!P1[i].is(0)) + All0 = false; + if (!P1[i].is(1)) + continue; + Has1 = true; + break; + } + if (!Has1 && !All0) + break; + RegisterCell RC(W0); + RC.fill(0, W0, (Has1 ? BT::BitValue::One : BT::BitValue::Zero)); + return rr0(RC, Outputs); + } + case C2_and: + return rr0(eAND(rc(1), rc(2)), Outputs); + case C2_andn: + return rr0(eAND(rc(1), eNOT(rc(2))), Outputs); + case C2_not: + return rr0(eNOT(rc(1)), Outputs); + case C2_or: + return rr0(eORL(rc(1), rc(2)), Outputs); + case C2_orn: + return rr0(eORL(rc(1), eNOT(rc(2))), Outputs); + case C2_xor: + return rr0(eXOR(rc(1), rc(2)), Outputs); + case C4_and_and: + return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs); + case C4_and_andn: + return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case C4_and_or: + return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs); + case C4_and_orn: + return rr0(eAND(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs); + case C4_or_and: + return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs); + case C4_or_andn: + return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs); + case C4_or_or: + return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs); + case C4_or_orn: + return rr0(eORL(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs); + case C2_bitsclr: + case C2_bitsclri: + case C2_bitsset: + case C4_nbitsclr: + case C4_nbitsclri: + case C4_nbitsset: + // TODO + break; + case S2_tstbit_i: + case S4_ntstbit_i: { + BT::BitValue V = rc(1)[im(2)]; + if (V.is(0) || V.is(1)) { + // If instruction is S2_tstbit_i, test for 1, otherwise test for 0. + bool TV = (Opc == S2_tstbit_i); + BT::BitValue F = V.is(TV) ? BT::BitValue::One : BT::BitValue::Zero; + return rr0(RegisterCell(W0).fill(0, W0, F), Outputs); + } + break; + } + + default: + return MachineEvaluator::evaluate(MI, Inputs, Outputs); + } + #undef im + #undef rc + #undef op + return false; +} + + +bool HexagonEvaluator::evaluate(const MachineInstr *BI, + const CellMapType &Inputs, BranchTargetList &Targets, + bool &FallsThru) const { + // We need to evaluate one branch at a time. TII::AnalyzeBranch checks + // all the branches in a basic block at once, so we cannot use it. + unsigned Opc = BI->getOpcode(); + bool SimpleBranch = false; + bool Negated = false; + switch (Opc) { + case Hexagon::J2_jumpf: + case Hexagon::J2_jumpfnew: + case Hexagon::J2_jumpfnewpt: + Negated = true; + case Hexagon::J2_jumpt: + case Hexagon::J2_jumptnew: + case Hexagon::J2_jumptnewpt: + // Simple branch: if([!]Pn) jump ... + // i.e. Op0 = predicate, Op1 = branch target. 
+ SimpleBranch = true; + break; + case Hexagon::J2_jump: + Targets.insert(BI->getOperand(0).getMBB()); + FallsThru = false; + return true; + default: + // If the branch is of unknown type, assume that all successors are + // executable. + return false; + } + + if (!SimpleBranch) + return false; + + // BI is a conditional branch if we got here. + RegisterRef PR = BI->getOperand(0); + RegisterCell PC = getCell(PR, Inputs); + const BT::BitValue &Test = PC[0]; + + // If the condition is neither true nor false, then it's unknown. + if (!Test.is(0) && !Test.is(1)) + return false; + + // "Test.is(!Negated)" means "branch condition is true". + if (!Test.is(!Negated)) { + // Condition known to be false. + FallsThru = true; + return true; + } + + Targets.insert(BI->getOperand(1).getMBB()); + FallsThru = false; + return true; +} + + +bool HexagonEvaluator::evaluateLoad(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + if (TII.isPredicated(MI)) + return false; + assert(MI->mayLoad() && "A load that mayn't?"); + unsigned Opc = MI->getOpcode(); + + uint16_t BitNum; + bool SignEx; + using namespace Hexagon; + + switch (Opc) { + default: + return false; + +#if 0 + // memb_fifo + case L2_loadalignb_pbr: + case L2_loadalignb_pcr: + case L2_loadalignb_pi: + // memh_fifo + case L2_loadalignh_pbr: + case L2_loadalignh_pcr: + case L2_loadalignh_pi: + // membh + case L2_loadbsw2_pbr: + case L2_loadbsw2_pci: + case L2_loadbsw2_pcr: + case L2_loadbsw2_pi: + case L2_loadbsw4_pbr: + case L2_loadbsw4_pci: + case L2_loadbsw4_pcr: + case L2_loadbsw4_pi: + // memubh + case L2_loadbzw2_pbr: + case L2_loadbzw2_pci: + case L2_loadbzw2_pcr: + case L2_loadbzw2_pi: + case L2_loadbzw4_pbr: + case L2_loadbzw4_pci: + case L2_loadbzw4_pcr: + case L2_loadbzw4_pi: +#endif + + case L2_loadrbgp: + case L2_loadrb_io: + case L2_loadrb_pbr: + case L2_loadrb_pci: + case L2_loadrb_pcr: + case L2_loadrb_pi: + case L4_loadrb_abs: + case L4_loadrb_ap: + case L4_loadrb_rr: + case L4_loadrb_ur: + BitNum = 8; + SignEx = true; + break; + + case L2_loadrubgp: + case L2_loadrub_io: + case L2_loadrub_pbr: + case L2_loadrub_pci: + case L2_loadrub_pcr: + case L2_loadrub_pi: + case L4_loadrub_abs: + case L4_loadrub_ap: + case L4_loadrub_rr: + case L4_loadrub_ur: + BitNum = 8; + SignEx = false; + break; + + case L2_loadrhgp: + case L2_loadrh_io: + case L2_loadrh_pbr: + case L2_loadrh_pci: + case L2_loadrh_pcr: + case L2_loadrh_pi: + case L4_loadrh_abs: + case L4_loadrh_ap: + case L4_loadrh_rr: + case L4_loadrh_ur: + BitNum = 16; + SignEx = true; + break; + + case L2_loadruhgp: + case L2_loadruh_io: + case L2_loadruh_pbr: + case L2_loadruh_pci: + case L2_loadruh_pcr: + case L2_loadruh_pi: + case L4_loadruh_rr: + case L4_loadruh_abs: + case L4_loadruh_ap: + case L4_loadruh_ur: + BitNum = 16; + SignEx = false; + break; + + case L2_loadrigp: + case L2_loadri_io: + case L2_loadri_pbr: + case L2_loadri_pci: + case L2_loadri_pcr: + case L2_loadri_pi: + case L2_loadw_locked: + case L4_loadri_abs: + case L4_loadri_ap: + case L4_loadri_rr: + case L4_loadri_ur: + case LDriw_pred: + BitNum = 32; + SignEx = true; + break; + + case L2_loadrdgp: + case L2_loadrd_io: + case L2_loadrd_pbr: + case L2_loadrd_pci: + case L2_loadrd_pcr: + case L2_loadrd_pi: + case L4_loadd_locked: + case L4_loadrd_abs: + case L4_loadrd_ap: + case L4_loadrd_rr: + case L4_loadrd_ur: + BitNum = 64; + SignEx = true; + break; + } + + const MachineOperand &MD = MI->getOperand(0); + assert(MD.isReg() && MD.isDef()); + RegisterRef RD = MD; + + uint16_t W = 
getRegBitWidth(RD); + assert(W >= BitNum && BitNum > 0); + RegisterCell Res(W); + + for (uint16_t i = 0; i < BitNum; ++i) + Res[i] = BT::BitValue::self(BT::BitRef(RD.Reg, i)); + + if (SignEx) { + const BT::BitValue &Sign = Res[BitNum-1]; + for (uint16_t i = BitNum; i < W; ++i) + Res[i] = BT::BitValue::ref(Sign); + } else { + for (uint16_t i = BitNum; i < W; ++i) + Res[i] = BT::BitValue::Zero; + } + + putCell(RD, Res, Outputs); + return true; +} + + +bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr *MI, + const CellMapType &Inputs, CellMapType &Outputs) const { + // If MI defines a formal parameter, but is not a copy (loads are handled + // in evaluateLoad), then it's not clear what to do. + assert(MI->isCopy()); + + RegisterRef RD = MI->getOperand(0); + RegisterRef RS = MI->getOperand(1); + assert(RD.Sub == 0); + if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg)) + return false; + RegExtMap::const_iterator F = VRX.find(RD.Reg); + if (F == VRX.end()) + return false; + + uint16_t EW = F->second.Width; + // Store RD's cell into the map. This will associate the cell with a virtual + // register, and make zero-/sign-extends possible (otherwise we would be ex- + // tending "self" bit values, which will have no effect, since "self" values + // cannot be references to anything). + putCell(RD, getCell(RS, Inputs), Outputs); + + RegisterCell Res; + // Read RD's cell from the outputs instead of RS's cell from the inputs: + if (F->second.Type == ExtType::SExt) + Res = eSXT(getCell(RD, Outputs), EW); + else if (F->second.Type == ExtType::ZExt) + Res = eZXT(getCell(RD, Outputs), EW); + + putCell(RD, Res, Outputs); + return true; +} + + +unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const { + using namespace Hexagon; + bool Is64 = DoubleRegsRegClass.contains(PReg); + assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg)); + + static const unsigned Phys32[] = { R0, R1, R2, R3, R4, R5 }; + static const unsigned Phys64[] = { D0, D1, D2 }; + const unsigned Num32 = sizeof(Phys32)/sizeof(unsigned); + const unsigned Num64 = sizeof(Phys64)/sizeof(unsigned); + + // Return the first parameter register of the required width. + if (PReg == 0) + return (Width <= 32) ? Phys32[0] : Phys64[0]; + + // Set Idx32, Idx64 in such a way that Idx+1 would give the index of the + // next register. + unsigned Idx32 = 0, Idx64 = 0; + if (!Is64) { + while (Idx32 < Num32) { + if (Phys32[Idx32] == PReg) + break; + Idx32++; + } + Idx64 = Idx32/2; + } else { + while (Idx64 < Num64) { + if (Phys64[Idx64] == PReg) + break; + Idx64++; + } + Idx32 = Idx64*2+1; + } + + if (Width <= 32) + return (Idx32+1 < Num32) ? Phys32[Idx32+1] : 0; + return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0; +} + + +unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const { + typedef MachineRegisterInfo::livein_iterator iterator; + for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + if (I->first == PReg) + return I->second; + } + return 0; +} diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h new file mode 100644 index 000000000000..897af2d71870 --- /dev/null +++ b/lib/Target/Hexagon/HexagonBitTracker.h @@ -0,0 +1,64 @@ +//===--- HexagonBitTracker.h ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONBITTRACKER_H +#define HEXAGONBITTRACKER_H + +#include "BitTracker.h" +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + class HexagonInstrInfo; + class HexagonRegisterInfo; + +struct HexagonEvaluator : public BitTracker::MachineEvaluator { + typedef BitTracker::CellMapType CellMapType; + typedef BitTracker::RegisterRef RegisterRef; + typedef BitTracker::RegisterCell RegisterCell; + typedef BitTracker::BranchTargetList BranchTargetList; + + HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri, + const HexagonInstrInfo &tii, MachineFunction &mf); + + bool evaluate(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const override; + bool evaluate(const MachineInstr *BI, const CellMapType &Inputs, + BranchTargetList &Targets, bool &FallsThru) const override; + + BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override; + + MachineFunction &MF; + MachineFrameInfo &MFI; + const HexagonInstrInfo &TII; + +private: + bool evaluateLoad(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + bool evaluateFormalCopy(const MachineInstr *MI, const CellMapType &Inputs, + CellMapType &Outputs) const; + + unsigned getNextPhysReg(unsigned PReg, unsigned Width) const; + unsigned getVirtRegFor(unsigned PReg) const; + + // Type of formal parameter extension. + struct ExtType { + enum { SExt, ZExt }; + char Type; + uint16_t Width; + ExtType() : Type(0), Width(0) {} + ExtType(char t, uint16_t w) : Type(t), Width(w) {} + }; + // Map VR -> extension type. + typedef DenseMap<unsigned, ExtType> RegExtMap; + RegExtMap VRX; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp new file mode 100644 index 000000000000..9f5fac156527 --- /dev/null +++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -0,0 +1,1325 @@ +//===--- HexagonCommonGEP.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "commgep" + +#include "llvm/Pass.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +#include <map> +#include <set> +#include <vector> + +#include "HexagonTargetMachine.h" + +using namespace llvm; + +static cl::opt<bool> OptSpeculate("commgep-speculate", cl::init(true), + cl::Hidden, cl::ZeroOrMore); + +static cl::opt<bool> OptEnableInv("commgep-inv", cl::init(true), cl::Hidden, + cl::ZeroOrMore); + +static cl::opt<bool> OptEnableConst("commgep-const", cl::init(true), + cl::Hidden, cl::ZeroOrMore); + +namespace llvm { + void initializeHexagonCommonGEPPass(PassRegistry&); +} + +namespace { + struct GepNode; + typedef std::set<GepNode*> NodeSet; + typedef std::map<GepNode*,Value*> NodeToValueMap; + typedef std::vector<GepNode*> NodeVect; + typedef std::map<GepNode*,NodeVect> NodeChildrenMap; + typedef std::set<Use*> UseSet; + typedef std::map<GepNode*,UseSet> NodeToUsesMap; + + // Numbering map for gep nodes. Used to keep track of ordering for + // gep nodes. + struct NodeNumbering : public std::map<const GepNode*,unsigned> { + }; + + struct NodeOrdering : public NodeNumbering { + NodeOrdering() : LastNum(0) {} +#ifdef _MSC_VER + void special_insert_for_special_msvc(const GepNode *N) +#else + using NodeNumbering::insert; + void insert(const GepNode* N) +#endif + { + insert(std::make_pair(N, ++LastNum)); + } + bool operator() (const GepNode* N1, const GepNode *N2) const { + const_iterator F1 = find(N1), F2 = find(N2); + assert(F1 != end() && F2 != end()); + return F1->second < F2->second; + } + private: + unsigned LastNum; + }; + + + class HexagonCommonGEP : public FunctionPass { + public: + static char ID; + HexagonCommonGEP() : FunctionPass(ID) { + initializeHexagonCommonGEPPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "Hexagon Common GEP"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); + AU.addPreserved<PostDominatorTree>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + private: + typedef std::map<Value*,GepNode*> ValueToNodeMap; + typedef std::vector<Value*> ValueVect; + typedef std::map<GepNode*,ValueVect> NodeToValuesMap; + + void getBlockTraversalOrder(BasicBlock *Root, ValueVect &Order); + bool isHandledGepForm(GetElementPtrInst *GepI); + void processGepInst(GetElementPtrInst *GepI, ValueToNodeMap &NM); + void collect(); + void common(); + + BasicBlock *recalculatePlacement(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + BasicBlock *recalculatePlacementRec(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + bool isInvariantIn(Value *Val, Loop *L); + bool isInvariantIn(GepNode *Node, Loop *L); + bool 
isInMainPath(BasicBlock *B, Loop *L); + BasicBlock *adjustForInvariance(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + void separateChainForNode(GepNode *Node, Use *U, NodeToValueMap &Loc); + void separateConstantChains(GepNode *Node, NodeChildrenMap &NCM, + NodeToValueMap &Loc); + void computeNodePlacement(NodeToValueMap &Loc); + + Value *fabricateGEP(NodeVect &NA, BasicBlock::iterator At, + BasicBlock *LocB); + void getAllUsersForNode(GepNode *Node, ValueVect &Values, + NodeChildrenMap &NCM); + void materialize(NodeToValueMap &Loc); + + void removeDeadCode(); + + NodeVect Nodes; + NodeToUsesMap Uses; + NodeOrdering NodeOrder; // Node ordering, for deterministic behavior. + SpecificBumpPtrAllocator<GepNode> *Mem; + LLVMContext *Ctx; + LoopInfo *LI; + DominatorTree *DT; + PostDominatorTree *PDT; + Function *Fn; + }; +} + + +char HexagonCommonGEP::ID = 0; +INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP", + false, false) + +namespace { + struct GepNode { + enum { + None = 0, + Root = 0x01, + Internal = 0x02, + Used = 0x04 + }; + + uint32_t Flags; + union { + GepNode *Parent; + Value *BaseVal; + }; + Value *Idx; + Type *PTy; // Type of the pointer operand. + + GepNode() : Flags(0), Parent(0), Idx(0), PTy(0) {} + GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) { + if (Flags & Root) + BaseVal = N->BaseVal; + else + Parent = N->Parent; + } + friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN); + }; + + + Type *next_type(Type *Ty, Value *Idx) { + // Advance the type. + if (!Ty->isStructTy()) { + Type *NexTy = cast<SequentialType>(Ty)->getElementType(); + return NexTy; + } + // Otherwise it is a struct type. 
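+    // For example (illustrative types): next_type({i32, i8*}, 1) is i8*,
+    // selected by the constant index below.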
+ ConstantInt *CI = dyn_cast<ConstantInt>(Idx); + assert(CI && "Struct type with non-constant index"); + int64_t i = CI->getValue().getSExtValue(); + Type *NextTy = cast<StructType>(Ty)->getElementType(i); + return NextTy; + } + + + raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) { + OS << "{ {"; + bool Comma = false; + if (GN.Flags & GepNode::Root) { + OS << "root"; + Comma = true; + } + if (GN.Flags & GepNode::Internal) { + if (Comma) + OS << ','; + OS << "internal"; + Comma = true; + } + if (GN.Flags & GepNode::Used) { + if (Comma) + OS << ','; + OS << "used"; + Comma = true; + } + OS << "} "; + if (GN.Flags & GepNode::Root) + OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')'; + else + OS << "Parent:" << GN.Parent; + + OS << " Idx:"; + if (ConstantInt *CI = dyn_cast<ConstantInt>(GN.Idx)) + OS << CI->getValue().getSExtValue(); + else if (GN.Idx->hasName()) + OS << GN.Idx->getName(); + else + OS << "<anon> =" << *GN.Idx; + + OS << " PTy:"; + if (GN.PTy->isStructTy()) { + StructType *STy = cast<StructType>(GN.PTy); + if (!STy->isLiteral()) + OS << GN.PTy->getStructName(); + else + OS << "<anon-struct>:" << *STy; + } + else + OS << *GN.PTy; + OS << " }"; + return OS; + } + + + template <typename NodeContainer> + void dump_node_container(raw_ostream &OS, const NodeContainer &S) { + typedef typename NodeContainer::const_iterator const_iterator; + for (const_iterator I = S.begin(), E = S.end(); I != E; ++I) + OS << *I << ' ' << **I << '\n'; + } + + raw_ostream &operator<< (raw_ostream &OS, + const NodeVect &S) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) { + dump_node_container(OS, S); + return OS; + } + + + raw_ostream &operator<< (raw_ostream &OS, + const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){ + typedef NodeToUsesMap::const_iterator const_iterator; + for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) { + const UseSet &Us = I->second; + OS << I->first << " -> #" << Us.size() << '{'; + for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) { + User *R = (*J)->getUser(); + if (R->hasName()) + OS << ' ' << R->getName(); + else + OS << " <?>(" << *R << ')'; + } + OS << " }\n"; + } + return OS; + } + + + struct in_set { + in_set(const NodeSet &S) : NS(S) {} + bool operator() (GepNode *N) const { + return NS.find(N) != NS.end(); + } + private: + const NodeSet &NS; + }; +} + + +inline void *operator new(size_t, SpecificBumpPtrAllocator<GepNode> &A) { + return A.Allocate(); +} + + +void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root, + ValueVect &Order) { + // Compute block ordering for a typical DT-based traversal of the flow + // graph: "before visiting a block, all of its dominators must have been + // visited". + + Order.push_back(Root); + DomTreeNode *DTN = DT->getNode(Root); + typedef GraphTraits<DomTreeNode*> GTN; + typedef GTN::ChildIteratorType Iter; + for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I) + getBlockTraversalOrder((*I)->getBlock(), Order); +} + + +bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) { + // No vector GEPs. + if (!GepI->getType()->isPointerTy()) + return false; + // No GEPs without any indices. (Is this possible?) 
+ if (GepI->idx_begin() == GepI->idx_end()) + return false; + return true; +} + + +void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, + ValueToNodeMap &NM) { + DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n'); + GepNode *N = new (*Mem) GepNode; + Value *PtrOp = GepI->getPointerOperand(); + ValueToNodeMap::iterator F = NM.find(PtrOp); + if (F == NM.end()) { + N->BaseVal = PtrOp; + N->Flags |= GepNode::Root; + } else { + // If PtrOp was a GEP instruction, it must have already been processed. + // The ValueToNodeMap entry for it is the last gep node in the generated + // chain. Link to it here. + N->Parent = F->second; + } + N->PTy = PtrOp->getType(); + N->Idx = *GepI->idx_begin(); + + // Collect the list of users of this GEP instruction. Will add it to the + // last node created for it. + UseSet Us; + for (Value::user_iterator UI = GepI->user_begin(), UE = GepI->user_end(); + UI != UE; ++UI) { + // Check if this gep is used by anything other than other geps that + // we will process. + if (isa<GetElementPtrInst>(*UI)) { + GetElementPtrInst *UserG = cast<GetElementPtrInst>(*UI); + if (isHandledGepForm(UserG)) + continue; + } + Us.insert(&UI.getUse()); + } + Nodes.push_back(N); +#ifdef _MSC_VER + NodeOrder.special_insert_for_special_msvc(N); +#else + NodeOrder.insert(N); +#endif + + // Skip the first index operand, since we only handle 0. This dereferences + // the pointer operand. + GepNode *PN = N; + Type *PtrTy = cast<PointerType>(PtrOp->getType())->getElementType(); + for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end(); + OI != OE; ++OI) { + Value *Op = *OI; + GepNode *Nx = new (*Mem) GepNode; + Nx->Parent = PN; // Link Nx to the previous node. + Nx->Flags |= GepNode::Internal; + Nx->PTy = PtrTy; + Nx->Idx = Op; + Nodes.push_back(Nx); +#ifdef _MSC_VER + NodeOrder.special_insert_for_special_msvc(Nx); +#else + NodeOrder.insert(Nx); +#endif + PN = Nx; + + PtrTy = next_type(PtrTy, Op); + } + + // After last node has been created, update the use information. + if (!Us.empty()) { + PN->Flags |= GepNode::Used; + Uses[PN].insert(Us.begin(), Us.end()); + } + + // Link the last node with the originating GEP instruction. This is to + // help with linking chained GEP instructions. + NM.insert(std::make_pair(GepI, PN)); +} + + +void HexagonCommonGEP::collect() { + // Establish depth-first traversal order of the dominator tree. + ValueVect BO; + getBlockTraversalOrder(Fn->begin(), BO); + + // The creation of gep nodes requires DT-traversal. When processing a GEP + // instruction that uses another GEP instruction as the base pointer, the + // gep node for the base pointer should already exist. 
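+  // A sketch with illustrative IR: in
+  //   %a = getelementptr %struct.S, %struct.S* %p, i32 0, i32 1
+  //   %b = getelementptr i32, i32* %a, i32 2
+  // the definition of %a dominates %b, so %a is visited first and NM
+  // already maps it to the tail of its node chain when %b is processed.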
+ ValueToNodeMap NM; + for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) { + BasicBlock *B = cast<BasicBlock>(*I); + for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) { + if (!isa<GetElementPtrInst>(J)) + continue; + GetElementPtrInst *GepI = cast<GetElementPtrInst>(J); + if (isHandledGepForm(GepI)) + processGepInst(GepI, NM); + } + } + + DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes); +} + + +namespace { + void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM, + NodeVect &Roots) { + typedef NodeVect::const_iterator const_iterator; + for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + if (N->Flags & GepNode::Root) { + Roots.push_back(N); + continue; + } + GepNode *PN = N->Parent; + NCM[PN].push_back(N); + } + } + + void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM, NodeSet &Nodes) { + NodeVect Work; + Work.push_back(Root); + Nodes.insert(Root); + + while (!Work.empty()) { + NodeVect::iterator First = Work.begin(); + GepNode *N = *First; + Work.erase(First); + NodeChildrenMap::iterator CF = NCM.find(N); + if (CF != NCM.end()) { + Work.insert(Work.end(), CF->second.begin(), CF->second.end()); + Nodes.insert(CF->second.begin(), CF->second.end()); + } + } + } +} + + +namespace { + typedef std::set<NodeSet> NodeSymRel; + typedef std::pair<GepNode*,GepNode*> NodePair; + typedef std::set<NodePair> NodePairSet; + + const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) { + for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I) + if (I->count(N)) + return &*I; + return 0; + } + + // Create an ordered pair of GepNode pointers. The pair will be used in + // determining equality. The only purpose of the ordering is to eliminate + // duplication due to the commutativity of equality/non-equality. + NodePair node_pair(GepNode *N1, GepNode *N2) { + uintptr_t P1 = uintptr_t(N1), P2 = uintptr_t(N2); + if (P1 <= P2) + return std::make_pair(N1, N2); + return std::make_pair(N2, N1); + } + + unsigned node_hash(GepNode *N) { + // Include everything except flags and parent. + FoldingSetNodeID ID; + ID.AddPointer(N->Idx); + ID.AddPointer(N->PTy); + return ID.ComputeHash(); + } + + bool node_eq(GepNode *N1, GepNode *N2, NodePairSet &Eq, NodePairSet &Ne) { + // Don't cache the result for nodes with different hashes. The hash + // comparison is fast enough. + if (node_hash(N1) != node_hash(N2)) + return false; + + NodePair NP = node_pair(N1, N2); + NodePairSet::iterator FEq = Eq.find(NP); + if (FEq != Eq.end()) + return true; + NodePairSet::iterator FNe = Ne.find(NP); + if (FNe != Ne.end()) + return false; + // Not previously compared. + bool Root1 = N1->Flags & GepNode::Root; + bool Root2 = N2->Flags & GepNode::Root; + NodePair P = node_pair(N1, N2); + // If the Root flag has different values, the nodes are different. + // If both nodes are root nodes, but their base pointers differ, + // they are different. + if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) { + Ne.insert(P); + return false; + } + // Here the root flags are identical, and for root nodes the + // base pointers are equal, so the root nodes are equal. + // For non-root nodes, compare their parent nodes. + if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) { + Eq.insert(P); + return true; + } + return false; + } +} + + +void HexagonCommonGEP::common() { + // The essence of this commoning is finding gep nodes that are equal. + // To do this we need to compare all pairs of nodes. 
To save time,
+ // first, partition the set of all nodes into sets of potentially equal
+ // nodes, and then compare pairs from within each partition.
+ typedef std::map<unsigned,NodeSet> NodeSetMap;
+ NodeSetMap MaybeEq;
+
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+ GepNode *N = *I;
+ unsigned H = node_hash(N);
+ MaybeEq[H].insert(N);
+ }
+
+ // Compute the equivalence relation for the gep nodes. Use two caches,
+ // one for equality and the other for non-equality.
+ NodeSymRel EqRel; // Equality relation (as set of equivalence classes).
+ NodePairSet Eq, Ne; // Caches.
+ for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
+ I != E; ++I) {
+ NodeSet &S = I->second;
+ for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
+ GepNode *N = *NI;
+ // If node already has a class, then the class must have been created
+ // in a prior iteration of this loop. Since equality is transitive,
+ // nothing more will be added to that class, so skip it.
+ if (node_class(N, EqRel))
+ continue;
+
+ // Create a new class candidate now.
+ NodeSet C;
+ for (NodeSet::iterator NJ = std::next(NI); NJ != NE; ++NJ)
+ if (node_eq(N, *NJ, Eq, Ne))
+ C.insert(*NJ);
+ // If C is empty, N would be the only element in it. Don't bother
+ // creating a class for it then.
+ if (!C.empty()) {
+ C.insert(N); // Finalize the set before adding it to the relation.
+ std::pair<NodeSymRel::iterator, bool> Ins = EqRel.insert(C);
+ (void)Ins;
+ assert(Ins.second && "Cannot add a class");
+ }
+ }
+ }
+
+ DEBUG({
+ dbgs() << "Gep node equality:\n";
+ for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
+ dbgs() << "{ " << I->first << ", " << I->second << " }\n";
+
+ dbgs() << "Gep equivalence classes:\n";
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ dbgs() << '{';
+ const NodeSet &S = *I;
+ for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ if (J != S.begin())
+ dbgs() << ',';
+ dbgs() << ' ' << *J;
+ }
+ dbgs() << " }\n";
+ }
+ });
+
+
+ // Create a projection from a NodeSet to the minimal element in it.
+ typedef std::map<const NodeSet*,GepNode*> ProjMap;
+ ProjMap PM;
+ for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+ const NodeSet &S = *I;
+ GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder);
+ std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min));
+ (void)Ins;
+ assert(Ins.second && "Cannot add minimal element");
+
+ // Update the min element's flags and user list.
+ uint32_t Flags = 0;
+ UseSet &MinUs = Uses[Min];
+ for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
+ GepNode *N = *J;
+ uint32_t NF = N->Flags;
+ // If N is used, append all original values of N to the list of
+ // original values of Min.
+ if (NF & GepNode::Used)
+ MinUs.insert(Uses[N].begin(), Uses[N].end());
+ Flags |= NF;
+ }
+ if (MinUs.empty())
+ Uses.erase(Min);
+
+ // The collected flags should include all the flags from the min element.
+ assert((Min->Flags & Flags) == Min->Flags);
+ Min->Flags = Flags;
+ }
+
+ // Commoning: for each non-root gep node, replace "Parent" with the
+ // selected (minimum) node from the corresponding equivalence class.
+ // If a given parent does not have an equivalence class, leave it
+ // unchanged (it means that it's the only element in its class).
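+ // For example, two nodes with the same constant Idx, the same PTy and
+ // equal parents hash identically and fall into one class; the member
+ // that comes first in NodeOrder becomes the class representative.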
+ for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + if (N->Flags & GepNode::Root) + continue; + const NodeSet *PC = node_class(N->Parent, EqRel); + if (!PC) + continue; + ProjMap::iterator F = PM.find(PC); + if (F == PM.end()) + continue; + // Found a replacement, use it. + GepNode *Rep = F->second; + N->Parent = Rep; + } + + DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes); + + // Finally, erase the nodes that are no longer used. + NodeSet Erase; + for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) { + GepNode *N = *I; + const NodeSet *PC = node_class(N, EqRel); + if (!PC) + continue; + ProjMap::iterator F = PM.find(PC); + if (F == PM.end()) + continue; + if (N == F->second) + continue; + // Node for removal. + Erase.insert(*I); + } + NodeVect::iterator NewE = std::remove_if(Nodes.begin(), Nodes.end(), + in_set(Erase)); + Nodes.resize(std::distance(Nodes.begin(), NewE)); + + DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes); +} + + +namespace { + template <typename T> + BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) { + DEBUG({ + dbgs() << "NCD of {"; + for (typename T::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) { + if (!*I) + continue; + BasicBlock *B = cast<BasicBlock>(*I); + dbgs() << ' ' << B->getName(); + } + dbgs() << " }\n"; + }); + + // Allow null basic blocks in Blocks. In such cases, return 0. + typename T::iterator I = Blocks.begin(), E = Blocks.end(); + if (I == E || !*I) + return 0; + BasicBlock *Dom = cast<BasicBlock>(*I); + while (++I != E) { + BasicBlock *B = cast_or_null<BasicBlock>(*I); + Dom = B ? DT->findNearestCommonDominator(Dom, B) : 0; + if (!Dom) + return 0; + } + DEBUG(dbgs() << "computed:" << Dom->getName() << '\n'); + return Dom; + } + + template <typename T> + BasicBlock *nearest_common_dominatee(DominatorTree *DT, T &Blocks) { + // If two blocks, A and B, dominate a block C, then A dominates B, + // or B dominates A. + typename T::iterator I = Blocks.begin(), E = Blocks.end(); + // Find the first non-null block. + while (I != E && !*I) + ++I; + if (I == E) + return DT->getRoot(); + BasicBlock *DomB = cast<BasicBlock>(*I); + while (++I != E) { + if (!*I) + continue; + BasicBlock *B = cast<BasicBlock>(*I); + if (DT->dominates(B, DomB)) + continue; + if (!DT->dominates(DomB, B)) + return 0; + DomB = B; + } + return DomB; + } + + // Find the first use in B of any value from Values. If no such use, + // return B->end(). + template <typename T> + BasicBlock::iterator first_use_of_in_block(T &Values, BasicBlock *B) { + BasicBlock::iterator FirstUse = B->end(), BEnd = B->end(); + typedef typename T::iterator iterator; + for (iterator I = Values.begin(), E = Values.end(); I != E; ++I) { + Value *V = *I; + // If V is used in a PHI node, the use belongs to the incoming block, + // not the block with the PHI node. In the incoming block, the use + // would be considered as being at the end of it, so it cannot + // influence the position of the first use (which is assumed to be + // at the end to start with). 
+ if (isa<PHINode>(V)) + continue; + if (!isa<Instruction>(V)) + continue; + Instruction *In = cast<Instruction>(V); + if (In->getParent() != B) + continue; + BasicBlock::iterator It = In; + if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd)) + FirstUse = It; + } + return FirstUse; + } + + bool is_empty(const BasicBlock *B) { + return B->empty() || (&*B->begin() == B->getTerminator()); + } +} + + +BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + DEBUG(dbgs() << "Loc for node:" << Node << '\n'); + // Recalculate the placement for Node, assuming that the locations of + // its children in Loc are valid. + // Return 0 if there is no valid placement for Node (for example, it + // uses an index value that is not available at the location required + // to dominate all children, etc.). + + // Find the nearest common dominator for: + // - all users, if the node is used, and + // - all children. + ValueVect Bs; + if (Node->Flags & GepNode::Used) { + // Append all blocks with uses of the original values to the + // block vector Bs. + NodeToUsesMap::iterator UF = Uses.find(Node); + assert(UF != Uses.end() && "Used node with no use information"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { + Use *U = *I; + User *R = U->getUser(); + if (!isa<Instruction>(R)) + continue; + BasicBlock *PB = isa<PHINode>(R) + ? cast<PHINode>(R)->getIncomingBlock(*U) + : cast<Instruction>(R)->getParent(); + Bs.push_back(PB); + } + } + // Append the location of each child. + NodeChildrenMap::iterator CF = NCM.find(Node); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { + GepNode *CN = *I; + NodeToValueMap::iterator LF = Loc.find(CN); + // If the child is only used in GEP instructions (i.e. is not used in + // non-GEP instructions), the nearest dominator computed for it may + // have been null. In such case it won't have a location available. + if (LF == Loc.end()) + continue; + Bs.push_back(LF->second); + } + } + + BasicBlock *DomB = nearest_common_dominator(DT, Bs); + if (!DomB) + return 0; + // Check if the index used by Node dominates the computed dominator. + Instruction *IdxI = dyn_cast<Instruction>(Node->Idx); + if (IdxI && !DT->dominates(IdxI->getParent(), DomB)) + return 0; + + // Avoid putting nodes into empty blocks. + while (is_empty(DomB)) { + DomTreeNode *N = (*DT)[DomB]->getIDom(); + if (!N) + break; + DomB = N->getBlock(); + } + + // Otherwise, DomB is fine. Update the location map. + Loc[Node] = DomB; + return DomB; +} + + +BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n'); + // Recalculate the placement of Node, after recursively recalculating the + // placements of all its children. 
+ NodeChildrenMap::iterator CF = NCM.find(Node); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) + recalculatePlacementRec(*I, NCM, Loc); + } + BasicBlock *LB = recalculatePlacement(Node, NCM, Loc); + DEBUG(dbgs() << "LocRec end for node:" << Node << '\n'); + return LB; +} + + +bool HexagonCommonGEP::isInvariantIn(Value *Val, Loop *L) { + if (isa<Constant>(Val) || isa<Argument>(Val)) + return true; + Instruction *In = dyn_cast<Instruction>(Val); + if (!In) + return false; + BasicBlock *HdrB = L->getHeader(), *DefB = In->getParent(); + return DT->properlyDominates(DefB, HdrB); +} + + +bool HexagonCommonGEP::isInvariantIn(GepNode *Node, Loop *L) { + if (Node->Flags & GepNode::Root) + if (!isInvariantIn(Node->BaseVal, L)) + return false; + return isInvariantIn(Node->Idx, L); +} + + +bool HexagonCommonGEP::isInMainPath(BasicBlock *B, Loop *L) { + BasicBlock *HB = L->getHeader(); + BasicBlock *LB = L->getLoopLatch(); + // B must post-dominate the loop header or dominate the loop latch. + if (PDT->dominates(B, HB)) + return true; + if (LB && DT->dominates(B, LB)) + return true; + return false; +} + + +namespace { + BasicBlock *preheader(DominatorTree *DT, Loop *L) { + if (BasicBlock *PH = L->getLoopPreheader()) + return PH; + if (!OptSpeculate) + return 0; + DomTreeNode *DN = DT->getNode(L->getHeader()); + if (!DN) + return 0; + return DN->getIDom()->getBlock(); + } +} + + +BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node, + NodeChildrenMap &NCM, NodeToValueMap &Loc) { + // Find the "topmost" location for Node: it must be dominated by both, + // its parent (or the BaseVal, if it's a root node), and by the index + // value. + ValueVect Bs; + if (Node->Flags & GepNode::Root) { + if (Instruction *PIn = dyn_cast<Instruction>(Node->BaseVal)) + Bs.push_back(PIn->getParent()); + } else { + Bs.push_back(Loc[Node->Parent]); + } + if (Instruction *IIn = dyn_cast<Instruction>(Node->Idx)) + Bs.push_back(IIn->getParent()); + BasicBlock *TopB = nearest_common_dominatee(DT, Bs); + + // Traverse the loop nest upwards until we find a loop in which Node + // is no longer invariant, or until we get to the upper limit of Node's + // placement. The traversal will also stop when a suitable "preheader" + // cannot be found for a given loop. The "preheader" may actually be + // a regular block outside of the loop (i.e. not guarded), in which case + // the Node will be speculated. + // For nodes that are not in the main path of the containing loop (i.e. + // are not executed in each iteration), do not move them out of the loop. + BasicBlock *LocB = cast_or_null<BasicBlock>(Loc[Node]); + if (LocB) { + Loop *Lp = LI->getLoopFor(LocB); + while (Lp) { + if (!isInvariantIn(Node, Lp) || !isInMainPath(LocB, Lp)) + break; + BasicBlock *NewLoc = preheader(DT, Lp); + if (!NewLoc || !DT->dominates(TopB, NewLoc)) + break; + Lp = Lp->getParentLoop(); + LocB = NewLoc; + } + } + Loc[Node] = LocB; + + // Recursively compute the locations of all children nodes. 
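+  // (Note on the loop above: with OptSpeculate enabled, the "preheader"
+  // may be the header's immediate dominator rather than a true guarded
+  // preheader, in which case the node is placed speculatively.)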
+ NodeChildrenMap::iterator CF = NCM.find(Node);
+ if (CF != NCM.end()) {
+ NodeVect &Cs = CF->second;
+ for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+ adjustForInvariance(*I, NCM, Loc);
+ }
+ return LocB;
+}
+
+
+namespace {
+ struct LocationAsBlock {
+ LocationAsBlock(const NodeToValueMap &L) : Map(L) {}
+ const NodeToValueMap &Map;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS,
+ const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+ for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
+ I != E; ++I) {
+ OS << I->first << " -> ";
+ BasicBlock *B = cast<BasicBlock>(I->second);
+ OS << B->getName() << '(' << B << ')';
+ OS << '\n';
+ }
+ return OS;
+ }
+
+ inline bool is_constant(GepNode *N) {
+ return isa<ConstantInt>(N->Idx);
+ }
+}
+
+
+void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
+ NodeToValueMap &Loc) {
+ User *R = U->getUser();
+ DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
+ << *R << '\n');
+ BasicBlock *PB = cast<Instruction>(R)->getParent();
+
+ GepNode *N = Node;
+ GepNode *C = 0, *NewNode = 0;
+ while (is_constant(N) && !(N->Flags & GepNode::Root)) {
+ // XXX if (single-use) dont-replicate;
+ GepNode *NewN = new (*Mem) GepNode(N);
+ Nodes.push_back(NewN);
+ Loc[NewN] = PB;
+
+ if (N == Node)
+ NewNode = NewN;
+ NewN->Flags &= ~GepNode::Used;
+ if (C)
+ C->Parent = NewN;
+ C = NewN;
+ N = N->Parent;
+ }
+ if (!NewNode)
+ return;
+
+ // Move over all uses that share the same user as U from Node to NewNode.
+ NodeToUsesMap::iterator UF = Uses.find(Node);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ UseSet NewUs;
+ for (UseSet::iterator I = Us.begin(); I != Us.end(); ) {
+ User *S = (*I)->getUser();
+ UseSet::iterator Nx = std::next(I);
+ if (S == R) {
+ NewUs.insert(*I);
+ Us.erase(I);
+ }
+ I = Nx;
+ }
+ if (Us.empty()) {
+ Node->Flags &= ~GepNode::Used;
+ Uses.erase(UF);
+ }
+
+ // Should at least have U in NewUs.
+ NewNode->Flags |= GepNode::Used;
+ DEBUG(dbgs() << "new node: " << NewNode << " " << *NewNode << '\n');
+ assert(!NewUs.empty());
+ Uses[NewNode] = NewUs;
+}
+
+
+void HexagonCommonGEP::separateConstantChains(GepNode *Node,
+ NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+ // First approximation: extract all chains.
+ NodeSet Ns;
+ nodes_for_root(Node, NCM, Ns);
+
+ DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+ // Collect all used nodes together with the uses from loads and stores,
+ // where the GEP node could be folded into the load/store instruction.
+ NodeToUsesMap FNs; // Foldable nodes.
+ for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) {
+ GepNode *N = *I;
+ if (!(N->Flags & GepNode::Used))
+ continue;
+ NodeToUsesMap::iterator UF = Uses.find(N);
+ assert(UF != Uses.end());
+ UseSet &Us = UF->second;
+ // Loads/stores that use the node N.
+ UseSet LSs;
+ for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+ Use *U = *J;
+ User *R = U->getUser();
+ // We're interested in uses that provide the address. The stored value
+ // may itself come from a GEP, but we won't handle those cases
+ // here for now.
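+ // For example, in "store i32 %v, i32* %a" only the pointer-operand use
+ // of %a is collected below; a use of %a as the stored value would not
+ // be foldable into the addressing mode.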
+ if (LoadInst *Ld = dyn_cast<LoadInst>(R)) { + unsigned PtrX = LoadInst::getPointerOperandIndex(); + if (&Ld->getOperandUse(PtrX) == U) + LSs.insert(U); + } else if (StoreInst *St = dyn_cast<StoreInst>(R)) { + unsigned PtrX = StoreInst::getPointerOperandIndex(); + if (&St->getOperandUse(PtrX) == U) + LSs.insert(U); + } + } + // Even if the total use count is 1, separating the chain may still be + // beneficial, since the constant chain may be longer than the GEP alone + // would be (e.g. if the parent node has a constant index and also has + // other children). + if (!LSs.empty()) + FNs.insert(std::make_pair(N, LSs)); + } + + DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs); + + for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) { + GepNode *N = I->first; + UseSet &Us = I->second; + for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) + separateChainForNode(N, *J, Loc); + } +} + + +void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) { + // Compute the inverse of the Node.Parent links. Also, collect the set + // of root nodes. + NodeChildrenMap NCM; + NodeVect Roots; + invert_find_roots(Nodes, NCM, Roots); + + // Compute the initial placement determined by the users' locations, and + // the locations of the child nodes. + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + recalculatePlacementRec(*I, NCM, Loc); + + DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc)); + + if (OptEnableInv) { + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + adjustForInvariance(*I, NCM, Loc); + + DEBUG(dbgs() << "Node placement after adjustment for invariance:\n" + << LocationAsBlock(Loc)); + } + if (OptEnableConst) { + for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I) + separateConstantChains(*I, NCM, Loc); + } + DEBUG(dbgs() << "Node use information:\n" << Uses); + + // At the moment, there is no further refinement of the initial placement. + // Such a refinement could include splitting the nodes if they are placed + // too far from some of its users. + + DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc)); +} + + +Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At, + BasicBlock *LocB) { + DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName() + << " for nodes:\n" << NA); + unsigned Num = NA.size(); + GepNode *RN = NA[0]; + assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root"); + + Value *NewInst = 0; + Value *Input = RN->BaseVal; + Value **IdxList = new Value*[Num+1]; + unsigned nax = 0; + do { + unsigned IdxC = 0; + // If the type of the input of the first node is not a pointer, + // we need to add an artificial i32 0 to the indices (because the + // actual input in the IR will be a pointer). + if (!NA[nax]->PTy->isPointerTy()) { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0); + } + + // Keep adding indices from NA until we have to stop and generate + // an "intermediate" GEP. + while (++nax <= Num) { + GepNode *N = NA[nax-1]; + IdxList[IdxC++] = N->Idx; + if (nax < Num) { + // We have to stop, if the expected type of the output of this node + // is not the same as the input type of the next node. 
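+        // When the types no longer chain, the indices gathered so far are
+        // emitted as one intermediate "cgep" below, and the next iteration
+        // of the enclosing do-loop starts a new GEP that consumes it as
+        // Input.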
+ Type *NextTy = next_type(N->PTy, N->Idx); + if (NextTy != NA[nax]->PTy) + break; + } + } + ArrayRef<Value*> A(IdxList, IdxC); + Type *InpTy = Input->getType(); + Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType(); + NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At); + DEBUG(dbgs() << "new GEP: " << *NewInst << '\n'); + Input = NewInst; + } while (nax <= Num); + + delete[] IdxList; + return NewInst; +} + + +void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values, + NodeChildrenMap &NCM) { + NodeVect Work; + Work.push_back(Node); + + while (!Work.empty()) { + NodeVect::iterator First = Work.begin(); + GepNode *N = *First; + Work.erase(First); + if (N->Flags & GepNode::Used) { + NodeToUsesMap::iterator UF = Uses.find(N); + assert(UF != Uses.end() && "No use information for used node"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) + Values.push_back((*I)->getUser()); + } + NodeChildrenMap::iterator CF = NCM.find(N); + if (CF != NCM.end()) { + NodeVect &Cs = CF->second; + Work.insert(Work.end(), Cs.begin(), Cs.end()); + } + } +} + + +void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { + DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n'); + NodeChildrenMap NCM; + NodeVect Roots; + // Compute the inversion again, since computing placement could alter + // "parent" relation between nodes. + invert_find_roots(Nodes, NCM, Roots); + + while (!Roots.empty()) { + NodeVect::iterator First = Roots.begin(); + GepNode *Root = *First, *Last = *First; + Roots.erase(First); + + NodeVect NA; // Nodes to assemble. + // Append to NA all child nodes up to (and including) the first child + // that: + // (1) has more than 1 child, or + // (2) is used, or + // (3) has a child located in a different block. + bool LastUsed = false; + unsigned LastCN = 0; + // The location may be null if the computation failed (it can legitimately + // happen for nodes created from dead GEPs). + Value *LocV = Loc[Last]; + if (!LocV) + continue; + BasicBlock *LastB = cast<BasicBlock>(LocV); + do { + NA.push_back(Last); + LastUsed = (Last->Flags & GepNode::Used); + if (LastUsed) + break; + NodeChildrenMap::iterator CF = NCM.find(Last); + LastCN = (CF != NCM.end()) ? CF->second.size() : 0; + if (LastCN != 1) + break; + GepNode *Child = CF->second.front(); + BasicBlock *ChildB = cast_or_null<BasicBlock>(Loc[Child]); + if (ChildB != 0 && LastB != ChildB) + break; + Last = Child; + } while (true); + + BasicBlock::iterator InsertAt = LastB->getTerminator(); + if (LastUsed || LastCN > 0) { + ValueVect Urs; + getAllUsersForNode(Root, Urs, NCM); + BasicBlock::iterator FirstUse = first_use_of_in_block(Urs, LastB); + if (FirstUse != LastB->end()) + InsertAt = FirstUse; + } + + // Generate a new instruction for NA. + Value *NewInst = fabricateGEP(NA, InsertAt, LastB); + + // Convert all the children of Last node into roots, and append them + // to the Roots list. + if (LastCN > 0) { + NodeVect &Cs = NCM[Last]; + for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) { + GepNode *CN = *I; + CN->Flags &= ~GepNode::Internal; + CN->Flags |= GepNode::Root; + CN->BaseVal = NewInst; + Roots.push_back(CN); + } + } + + // Lastly, if the Last node was used, replace all uses with the new GEP. + // The uses reference the original GEP values. 
+ if (LastUsed) { + NodeToUsesMap::iterator UF = Uses.find(Last); + assert(UF != Uses.end() && "No use information found"); + UseSet &Us = UF->second; + for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) { + Use *U = *I; + U->set(NewInst); + } + } + } +} + + +void HexagonCommonGEP::removeDeadCode() { + ValueVect BO; + BO.push_back(&Fn->front()); + + for (unsigned i = 0; i < BO.size(); ++i) { + BasicBlock *B = cast<BasicBlock>(BO[i]); + DomTreeNode *N = DT->getNode(B); + typedef GraphTraits<DomTreeNode*> GTN; + typedef GTN::ChildIteratorType Iter; + for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + BO.push_back((*I)->getBlock()); + } + + for (unsigned i = BO.size(); i > 0; --i) { + BasicBlock *B = cast<BasicBlock>(BO[i-1]); + BasicBlock::InstListType &IL = B->getInstList(); + typedef BasicBlock::InstListType::reverse_iterator reverse_iterator; + ValueVect Ins; + for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I) + Ins.push_back(&*I); + for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { + Instruction *In = cast<Instruction>(*I); + if (isInstructionTriviallyDead(In)) + In->eraseFromParent(); + } + } +} + + +bool HexagonCommonGEP::runOnFunction(Function &F) { + // For now bail out on C++ exception handling. + for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A) + for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I) + if (isa<InvokeInst>(I) || isa<LandingPadInst>(I)) + return false; + + Fn = &F; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PDT = &getAnalysis<PostDominatorTree>(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + Ctx = &F.getContext(); + + Nodes.clear(); + Uses.clear(); + NodeOrder.clear(); + + SpecificBumpPtrAllocator<GepNode> Allocator; + Mem = &Allocator; + + collect(); + common(); + + NodeToValueMap Loc; + computeNodePlacement(Loc); + materialize(Loc); + removeDeadCode(); + +#ifdef XDEBUG + // Run this only when expensive checks are enabled. + verifyFunction(F); +#endif + return true; +} + + +namespace llvm { + FunctionPass *createHexagonCommonGEP() { + return new HexagonCommonGEP(); + } +} diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 37ed173a79cd..ce10aeadef94 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -1,3 +1,12 @@ +//===--- HexagonExpandCondsets.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + // Replace mux instructions with the corresponding legal instructions. // It is meant to work post-SSA, but still on virtual registers. It was // originally placed between register coalescing and machine instruction diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 868f87e18413..29283c81877e 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -864,13 +864,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. 
for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (MRI.isPhysRegUsed(FreeReg)) + if (!MRI.reg_nodbg_empty(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (MRI.isPhysRegUsed(*AI)) { + if (!MRI.reg_nodbg_empty(*AI)) { IsCurrentRegUsed = true; break; } @@ -959,8 +959,11 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF) } -void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger* RS) const { +void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); auto &HRI = *HST.getRegisterInfo(); @@ -969,11 +972,9 @@ void HexagonFrameLowering::processFunctionBeforeCalleeSavedScan( // If we have a function containing __builtin_eh_return we want to spill and // restore all callee saved registers. Pretend that they are used. if (HasEHReturn) { - MachineRegisterInfo &MRI = MF.getRegInfo(); for (const MCPhysReg *CSRegs = HRI.getCalleeSavedRegs(&MF); *CSRegs; ++CSRegs) - if (!MRI.isPhysRegUsed(*CSRegs)) - MRI.setPhysRegUsed(*CSRegs); + SavedRegs.set(*CSRegs); } const TargetRegisterClass &RC = Hexagon::IntRegsRegClass; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 89500cb85724..d39ee2c77195 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -45,7 +45,7 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; bool targetHandlesStackFrameRounding() const override { diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp new file mode 100644 index 000000000000..4d32208bd5aa --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -0,0 +1,259 @@ +//===--- HexagonGenExtract.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<unsigned> ExtractCutoff("extract-cutoff", cl::init(~0U), + cl::Hidden, cl::desc("Cutoff for generating \"extract\"" + " instructions")); + +// This prevents generating extract instructions that have the offset of 0. 
+// One of the reasons for "extract" is to put a sequence of bits in a regis- +// ter, starting at offset 0 (so that these bits can then be used by an +// "insert"). If the bits are already at offset 0, it is better not to gene- +// rate "extract", since logical bit operations can be merged into compound +// instructions (as opposed to "extract"). +static cl::opt<bool> NoSR0("extract-nosr0", cl::init(true), cl::Hidden, + cl::desc("No extract instruction with offset 0")); + +static cl::opt<bool> NeedAnd("extract-needand", cl::init(true), cl::Hidden, + cl::desc("Require & in extract patterns")); + +namespace llvm { + void initializeHexagonGenExtractPass(PassRegistry&); + FunctionPass *createHexagonGenExtract(); +} + + +namespace { + class HexagonGenExtract : public FunctionPass { + public: + static char ID; + HexagonGenExtract() : FunctionPass(ID), ExtractCount(0) { + initializeHexagonGenExtractPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const override { + return "Hexagon generate \"extract\" instructions"; + } + virtual bool runOnFunction(Function &F) override; + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + } + private: + bool visitBlock(BasicBlock *B); + bool convert(Instruction *In); + + unsigned ExtractCount; + DominatorTree *DT; + }; + + char HexagonGenExtract::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonGenExtract, "hextract", "Hexagon generate " + "\"extract\" instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(HexagonGenExtract, "hextract", "Hexagon generate " + "\"extract\" instructions", false, false) + + +bool HexagonGenExtract::convert(Instruction *In) { + using namespace PatternMatch; + Value *BF = 0; + ConstantInt *CSL = 0, *CSR = 0, *CM = 0; + BasicBlock *BB = In->getParent(); + LLVMContext &Ctx = BB->getContext(); + bool LogicalSR; + + // (and (shl (lshr x, #sr), #sl), #m) + LogicalSR = true; + bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL)), + m_ConstantInt(CM))); + + if (!Match) { + // (and (shl (ashr x, #sr), #sl), #m) + LogicalSR = false; + Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL)), + m_ConstantInt(CM))); + } + if (!Match) { + // (and (shl x, #sl), #m) + LogicalSR = true; + CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)), + m_ConstantInt(CM))); + if (Match && NoSR0) + return false; + } + if (!Match) { + // (and (lshr x, #sr), #m) + LogicalSR = true; + CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CM))); + } + if (!Match) { + // (and (ashr x, #sr), #m) + LogicalSR = false; + CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CM))); + } + if (!Match) { + CM = 0; + // (shl (lshr x, #sr), #sl) + LogicalSR = true; + Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL))); + } + if (!Match) { + CM = 0; + // (shl (ashr x, #sr), #sl) + LogicalSR = false; + Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)), + m_ConstantInt(CSL))); + } + if (!Match) + return false; + + Type *Ty = BF->getType(); + if (!Ty->isIntegerTy()) + 
return false;
+  unsigned BW = Ty->getPrimitiveSizeInBits();
+  if (BW != 32 && BW != 64)
+    return false;
+
+  uint32_t SR = CSR->getZExtValue();
+  uint32_t SL = CSL->getZExtValue();
+
+  if (!CM) {
+    // If there was no and, and the shift left did not remove all potential
+    // sign bits created by the shift right, then extractu cannot reproduce
+    // this value.
+    if (!LogicalSR && (SR > SL))
+      return false;
+    APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
+    CM = ConstantInt::get(Ctx, A);
+  }
+
+  // CM is the shifted-left mask. Shift it back right to remove the zero
+  // bits on least-significant positions.
+  APInt M = CM->getValue().lshr(SL);
+  uint32_t T = M.countTrailingOnes();
+
+  // During the shifts some of the bits will be lost. Calculate how many
+  // bits of the original value will remain after shift right and then left.
+  uint32_t U = BW - std::max(SL, SR);
+  // The width of the extracted field is the minimum of the original bits
+  // that remain after the shifts and the number of contiguous 1s in the mask.
+  uint32_t W = std::min(U, T);
+  if (W == 0)
+    return false;
+
+  // Check if the extracted bits are contained within the mask that it is
+  // and-ed with. The extract operation will copy these bits, and so the
+  // mask cannot have any holes in it that would clear any of the bits of
+  // the extracted field.
+  if (!LogicalSR) {
+    // If the shift right was arithmetic, it could have included some 1 bits.
+    // It is still ok to generate extract, but only if the mask eliminates
+    // those bits (i.e. M does not have any bits set beyond U).
+    APInt C = APInt::getHighBitsSet(BW, BW-U);
+    if (M.intersects(C) || !APIntOps::isMask(W, M))
+      return false;
+  } else {
+    // Check if M starts with a contiguous sequence of W 1-bits. Get
+    // the low U bits of M (which eliminates the 0 bits shifted in on the
+    // left), and check if the result is APInt's "mask":
+    if (!APIntOps::isMask(W, M.getLoBits(U)))
+      return false;
+  }
+
+  IRBuilder<> IRB(BB, In);
+  Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
+                                   : Intrinsic::hexagon_S2_extractup;
+  Module *Mod = BB->getParent()->getParent();
+  Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+  Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+  if (SL != 0)
+    NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
+  In->replaceAllUsesWith(NewIn);
+  return true;
+}
+
+
+bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+  // Depth-first, bottom-up traversal.
+  DomTreeNode *DTN = DT->getNode(B);
+  typedef GraphTraits<DomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType Iter;
+  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+    visitBlock((*I)->getBlock());
+
+  // Allow limiting the number of generated extracts for debugging purposes.
+  bool HasCutoff = ExtractCutoff.getPosition();
+  unsigned Cutoff = ExtractCutoff;
+
+  bool Changed = false;
+  BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
+  while (true) {
+    if (HasCutoff && (ExtractCount >= Cutoff))
+      return Changed;
+    bool Last = (I == Begin);
+    if (!Last)
+      NextI = std::prev(I);
+    Instruction *In = &*I;
+    bool Done = convert(In);
+    if (HasCutoff && Done)
+      ExtractCount++;
+    Changed |= Done;
+    if (Last)
+      break;
+    I = NextI;
+  }
+  return Changed;
+}
+
+
+bool HexagonGenExtract::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  bool Changed;
+
+  // Traverse the function bottom-up, to see super-expressions before their
+  // sub-expressions.
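+  // Visiting a user before its operands keeps the largest matching pattern
+  // intact: once an expression is rewritten into an extract intrinsic, the
+  // shift/and patterns feeding it would no longer match as part of a bigger
+  // tree.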
+ BasicBlock *Entry = GraphTraits<Function*>::getEntryNode(&F); + Changed = visitBlock(Entry); + + return Changed; +} + + +FunctionPass *llvm::createHexagonGenExtract() { + return new HexagonGenExtract(); +} diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp new file mode 100644 index 000000000000..096da949e77b --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -0,0 +1,1598 @@ +//===--- HexagonGenInsert.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexinsert" + +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include "Hexagon.h" +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +#include <map> +#include <vector> + +using namespace llvm; + +static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), + cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation.")); +// The distance cutoff is selected based on the precheckin-perf results: +// cutoffs 20, 25, 35, and 40 are worse than 30. +static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U), + cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert " + "generation.")); + +static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable timing of insert generation")); +static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false), + cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert " + "generation")); + +static cl::opt<bool> OptSelectAll0("insert-all0", cl::init(false), cl::Hidden, + cl::ZeroOrMore); +static cl::opt<bool> OptSelectHas0("insert-has0", cl::init(false), cl::Hidden, + cl::ZeroOrMore); +// Whether to construct constant values via "insert". Could eliminate constant +// extenders, but often not practical. +static cl::opt<bool> OptConst("insert-const", cl::init(false), cl::Hidden, + cl::ZeroOrMore); + +namespace { + // The preprocessor gets confused when the DEBUG macro is passed larger + // chunks of code. Use this function to detect debugging. + inline bool isDebug() { +#ifndef NDEBUG + return ::llvm::DebugFlag && ::llvm::isCurrentDebugType(DEBUG_TYPE); +#else + return false; +#endif + } +} + + +namespace { + // Set of virtual registers, based on BitVector. 
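+  // The set is indexed by virtReg2Index(R), i.e. the register number with
+  // the high bit stripped, and converted back with index2VirtReg on the way
+  // out. Since every virtual register number has the high bit set, 0 can
+  // never name one, so find_first/find_next below return 0 to mean "no more
+  // registers" and callers iterate with "for (R = S.find_first(); R; ...)".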
+ struct RegisterSet : private BitVector { + RegisterSet() : BitVector() {} + explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + RegisterSet(const RegisterSet &RS) : BitVector(RS) {} + + using BitVector::clear; + + unsigned find_first() const { + int First = BitVector::find_first(); + if (First < 0) + return 0; + return x2v(First); + } + + unsigned find_next(unsigned Prev) const { + int Next = BitVector::find_next(v2x(Prev)); + if (Next < 0) + return 0; + return x2v(Next); + } + + RegisterSet &insert(unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return static_cast<RegisterSet&>(BitVector::set(Idx)); + } + RegisterSet &remove(unsigned R) { + unsigned Idx = v2x(R); + if (Idx >= size()) + return *this; + return static_cast<RegisterSet&>(BitVector::reset(Idx)); + } + + RegisterSet &insert(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::operator|=(Rs)); + } + RegisterSet &remove(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::reset(Rs)); + } + + reference operator[](unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return BitVector::operator[](Idx); + } + bool operator[](unsigned R) const { + unsigned Idx = v2x(R); + assert(Idx < size()); + return BitVector::operator[](Idx); + } + bool has(unsigned R) const { + unsigned Idx = v2x(R); + if (Idx >= size()) + return false; + return BitVector::test(Idx); + } + + bool empty() const { + return !BitVector::any(); + } + bool includes(const RegisterSet &Rs) const { + // A.BitVector::test(B) <=> A-B != {} + return !Rs.BitVector::test(*this); + } + bool intersects(const RegisterSet &Rs) const { + return BitVector::anyCommon(Rs); + } + + private: + void ensure(unsigned Idx) { + if (size() <= Idx) + resize(std::max(Idx+1, 32U)); + } + static inline unsigned v2x(unsigned v) { + return TargetRegisterInfo::virtReg2Index(v); + } + static inline unsigned x2v(unsigned x) { + return TargetRegisterInfo::index2VirtReg(x); + } + }; + + + struct PrintRegSet { + PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI) + : RS(S), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, + const PrintRegSet &P); + private: + const RegisterSet &RS; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { + OS << '{'; + for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) + OS << ' ' << PrintReg(R, P.TRI); + OS << " }"; + return OS; + } +} + + +namespace { + // A convenience class to associate unsigned numbers (such as virtual + // registers) with unsigned numbers. + struct UnsignedMap : public DenseMap<unsigned,unsigned> { + UnsignedMap() : BaseType() {} + private: + typedef DenseMap<unsigned,unsigned> BaseType; + }; + + // A utility to establish an ordering between virtual registers: + // VRegA < VRegB <=> RegisterOrdering[VRegA] < RegisterOrdering[VRegB] + // This is meant as a cache for the ordering of virtual registers defined + // by a potentially expensive comparison function, or obtained by a proce- + // dure that should not be repeated each time two registers are compared. + struct RegisterOrdering : public UnsignedMap { + RegisterOrdering() : UnsignedMap() {} + unsigned operator[](unsigned VR) const { + const_iterator F = find(VR); + assert(F != end()); + return F->second; + } + // Add operator(), so that objects of this class can be used as + // comparators in std::sort et al. 
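+    // Usage sketch (illustrative, not from this commit):
+    //   std::lower_bound(Seq.begin(), Seq.end(), VR, Ord)
+    // as OrderedRegisterList::insert() does below, with Ord being a
+    // RegisterOrdering.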
+    bool operator() (unsigned VR1, unsigned VR2) const {
+      return operator[](VR1) < operator[](VR2);
+    }
+  };
+}
+
+
+namespace {
+  // Ordering of bit values. This class does not have operator[], but
+  // it supplies a comparison operator() for use in std:: algorithms.
+  // The order is as follows:
+  // - 0 < 1 < ref
+  // - ref1 < ref2, if ord(ref1.Reg) < ord(ref2.Reg),
+  //   or ord(ref1.Reg) == ord(ref2.Reg), and ref1.Pos < ref2.Pos.
+  struct BitValueOrdering {
+    BitValueOrdering(const RegisterOrdering &RB) : BaseOrd(RB) {}
+    bool operator() (const BitTracker::BitValue &V1,
+          const BitTracker::BitValue &V2) const;
+    const RegisterOrdering &BaseOrd;
+  };
+}
+
+
+bool BitValueOrdering::operator() (const BitTracker::BitValue &V1,
+      const BitTracker::BitValue &V2) const {
+  if (V1 == V2)
+    return false;
+  // V1==0 => true, V2==0 => false
+  if (V1.is(0) || V2.is(0))
+    return V1.is(0);
+  // Neither of V1,V2 is 0, and V1!=V2.
+  // V2==1 => false, V1==1 => true
+  if (V2.is(1) || V1.is(1))
+    return !V2.is(1);
+  // Both V1,V2 are refs.
+  unsigned Ind1 = BaseOrd[V1.RefI.Reg], Ind2 = BaseOrd[V2.RefI.Reg];
+  if (Ind1 != Ind2)
+    return Ind1 < Ind2;
+  // Both refer to the same register; since V1 != V2, the bit positions
+  // must differ.
+  assert(V1.RefI.Pos != V2.RefI.Pos && "Bit values should be different");
+  return V1.RefI.Pos < V2.RefI.Pos;
+}
+
+
+namespace {
+  // Cache for the BitTracker's cell map. Map lookup has logarithmic
+  // complexity, so this class memoizes the lookup results to reduce
+  // the access time for repeated lookups of the same cell.
+  struct CellMapShadow {
+    CellMapShadow(const BitTracker &T) : BT(T) {}
+    const BitTracker::RegisterCell &lookup(unsigned VR) {
+      unsigned RInd = TargetRegisterInfo::virtReg2Index(VR);
+      // Grow the vector to at least 32 elements.
+      if (RInd >= CVect.size())
+        CVect.resize(std::max(RInd+16, 32U), 0);
+      const BitTracker::RegisterCell *CP = CVect[RInd];
+      if (CP == 0)
+        CP = CVect[RInd] = &BT.lookup(VR);
+      return *CP;
+    }
+
+    const BitTracker &BT;
+
+  private:
+    typedef std::vector<const BitTracker::RegisterCell*> CellVectType;
+    CellVectType CVect;
+  };
+}
+
+
+namespace {
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the corresponding BitTracker::RegisterCell objects.
+  struct RegisterCellLexCompare {
+    RegisterCellLexCompare(const BitValueOrdering &BO, CellMapShadow &M)
+      : BitOrd(BO), CM(M) {}
+    bool operator() (unsigned VR1, unsigned VR2) const;
+  private:
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the specified bits of the corresponding BitTracker::
+  // RegisterCell objects.
+  // Specifically, this class will be used to compare bit B of a register
+  // cell for a selected virtual register R with bit N of any register
+  // other than R.
+  struct RegisterCellBitCompareSel {
+    RegisterCellBitCompareSel(unsigned R, unsigned B, unsigned N,
+          const BitValueOrdering &BO, CellMapShadow &M)
+      : SelR(R), SelB(B), BitN(N), BitOrd(BO), CM(M) {}
+    bool operator() (unsigned VR1, unsigned VR2) const;
+  private:
+    const unsigned SelR, SelB;
+    const unsigned BitN;
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+}
+
+
+bool RegisterCellLexCompare::operator() (unsigned VR1, unsigned VR2) const {
+  // Ordering of registers, made up from two given orderings:
+  // - the ordering of the register numbers, and
+  // - the ordering of register cells.
+  // Def. R1 < R2 if:
+  // - cell(R1) < cell(R2), or
+  // - cell(R1) == cell(R2), and index(R1) < index(R2).
+ // + // For register cells, the ordering is lexicographic, with index 0 being + // the most significant. + if (VR1 == VR2) + return false; + + const BitTracker::RegisterCell &RC1 = CM.lookup(VR1), &RC2 = CM.lookup(VR2); + uint16_t W1 = RC1.width(), W2 = RC2.width(); + for (uint16_t i = 0, w = std::min(W1, W2); i < w; ++i) { + const BitTracker::BitValue &V1 = RC1[i], &V2 = RC2[i]; + if (V1 != V2) + return BitOrd(V1, V2); + } + // Cells are equal up until the common length. + if (W1 != W2) + return W1 < W2; + + return BitOrd.BaseOrd[VR1] < BitOrd.BaseOrd[VR2]; +} + + +bool RegisterCellBitCompareSel::operator() (unsigned VR1, unsigned VR2) const { + if (VR1 == VR2) + return false; + const BitTracker::RegisterCell &RC1 = CM.lookup(VR1); + const BitTracker::RegisterCell &RC2 = CM.lookup(VR2); + uint16_t W1 = RC1.width(), W2 = RC2.width(); + uint16_t Bit1 = (VR1 == SelR) ? SelB : BitN; + uint16_t Bit2 = (VR2 == SelR) ? SelB : BitN; + // If Bit1 exceeds the width of VR1, then: + // - return false, if at the same time Bit2 exceeds VR2, or + // - return true, otherwise. + // (I.e. "a bit value that does not exist is less than any bit value + // that does exist".) + if (W1 <= Bit1) + return Bit2 < W2; + // If Bit1 is within VR1, but Bit2 is not within VR2, return false. + if (W2 <= Bit2) + return false; + + const BitTracker::BitValue &V1 = RC1[Bit1], V2 = RC2[Bit2]; + if (V1 != V2) + return BitOrd(V1, V2); + return false; +} + + +namespace { + class OrderedRegisterList { + typedef std::vector<unsigned> ListType; + public: + OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {} + void insert(unsigned VR); + void remove(unsigned VR); + unsigned operator[](unsigned Idx) const { + assert(Idx < Seq.size()); + return Seq[Idx]; + } + unsigned size() const { + return Seq.size(); + } + + typedef ListType::iterator iterator; + typedef ListType::const_iterator const_iterator; + iterator begin() { return Seq.begin(); } + iterator end() { return Seq.end(); } + const_iterator begin() const { return Seq.begin(); } + const_iterator end() const { return Seq.end(); } + + // Convenience function to convert an iterator to the corresponding index. + unsigned idx(iterator It) const { return It-begin(); } + private: + ListType Seq; + const RegisterOrdering &Ord; + }; + + + struct PrintORL { + PrintORL(const OrderedRegisterList &L, const TargetRegisterInfo *RI) + : RL(L), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P); + private: + const OrderedRegisterList &RL; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P) { + OS << '('; + OrderedRegisterList::const_iterator B = P.RL.begin(), E = P.RL.end(); + for (OrderedRegisterList::const_iterator I = B; I != E; ++I) { + if (I != B) + OS << ", "; + OS << PrintReg(*I, P.TRI); + } + OS << ')'; + return OS; + } +} + + +void OrderedRegisterList::insert(unsigned VR) { + iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + if (L == Seq.end()) + Seq.push_back(VR); + else + Seq.insert(L, VR); +} + + +void OrderedRegisterList::remove(unsigned VR) { + iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord); + assert(L != Seq.end()); + Seq.erase(L); +} + + +namespace { + // A record of the insert form. The fields correspond to the operands + // of the "insert" instruction: + // ... 
= insert(SrcR, InsR, #Wdh, #Off) + struct IFRecord { + IFRecord(unsigned SR = 0, unsigned IR = 0, uint16_t W = 0, uint16_t O = 0) + : SrcR(SR), InsR(IR), Wdh(W), Off(O) {} + unsigned SrcR, InsR; + uint16_t Wdh, Off; + }; + + struct PrintIFR { + PrintIFR(const IFRecord &R, const TargetRegisterInfo *RI) + : IFR(R), TRI(RI) {} + private: + const IFRecord &IFR; + const TargetRegisterInfo *TRI; + friend raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P); + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P) { + unsigned SrcR = P.IFR.SrcR, InsR = P.IFR.InsR; + OS << '(' << PrintReg(SrcR, P.TRI) << ',' << PrintReg(InsR, P.TRI) + << ",#" << P.IFR.Wdh << ",#" << P.IFR.Off << ')'; + return OS; + } + + typedef std::pair<IFRecord,RegisterSet> IFRecordWithRegSet; +} + + +namespace llvm { + void initializeHexagonGenInsertPass(PassRegistry&); + FunctionPass *createHexagonGenInsert(); +} + + +namespace { + class HexagonGenInsert : public MachineFunctionPass { + public: + static char ID; + HexagonGenInsert() : MachineFunctionPass(ID), HII(0), HRI(0) { + initializeHexagonGenInsertPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon generate \"insert\" instructions"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + typedef DenseMap<std::pair<unsigned,unsigned>,unsigned> PairMapType; + + void buildOrderingMF(RegisterOrdering &RO) const; + void buildOrderingBT(RegisterOrdering &RB, RegisterOrdering &RO) const; + bool isIntClass(const TargetRegisterClass *RC) const; + bool isConstant(unsigned VR) const; + bool isSmallConstant(unsigned VR) const; + bool isValidInsertForm(unsigned DstR, unsigned SrcR, unsigned InsR, + uint16_t L, uint16_t S) const; + bool findSelfReference(unsigned VR) const; + bool findNonSelfReference(unsigned VR) const; + void getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const; + void getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const; + unsigned distance(const MachineBasicBlock *FromB, + const MachineBasicBlock *ToB, const UnsignedMap &RPO, + PairMapType &M) const; + unsigned distance(MachineBasicBlock::const_iterator FromI, + MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO, + PairMapType &M) const; + bool findRecordInsertForms(unsigned VR, OrderedRegisterList &AVs); + void collectInBlock(MachineBasicBlock *B, OrderedRegisterList &AVs); + void findRemovableRegisters(unsigned VR, IFRecord IF, + RegisterSet &RMs) const; + void computeRemovableRegisters(); + + void pruneEmptyLists(); + void pruneCoveredSets(unsigned VR); + void pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, PairMapType &M); + void pruneRegCopies(unsigned VR); + void pruneCandidates(); + void selectCandidates(); + bool generateInserts(); + + bool removeDeadCode(MachineDomTreeNode *N); + + // IFRecord coupled with a set of potentially removable registers: + typedef std::vector<IFRecordWithRegSet> IFListType; + typedef DenseMap<unsigned,IFListType> IFMapType; // vreg -> IFListType + + void dump_map() const; + + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + + MachineFunction *MFN; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + CellMapShadow *CMS; + + RegisterOrdering BaseOrd; + RegisterOrdering CellOrd; + IFMapType IFMap; + }; + + char HexagonGenInsert::ID = 0; +} + 
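+// An illustrative sketch (assumed semantics, expressed as plain C++; not
+// part of this commit) of what a single 32-bit "insert" computes:
+//
+//   // rd = insert(rs, rt, #W, #O): take rs, and replace its bits
+//   // [O .. O+W) with the W least-significant bits of rt.
+//   uint32_t insert32(uint32_t rs, uint32_t rt, unsigned W, unsigned O) {
+//     uint32_t M = (W == 32 ? ~0u : (1u << W) - 1) << O;
+//     return (rs & ~M) | ((rt << O) & M);
+//   }
+//
+// Thus IFRecord(SrcR, InsR, 9, 8) records that the defined value equals
+// SrcR with bits 8..16 replaced by the low 9 bits of InsR, which is also
+// why Wdh==32 with Off==0 amounts to a plain copy of InsR (see
+// pruneRegCopies below).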
+
+void HexagonGenInsert::dump_map() const {
+  typedef IFMapType::const_iterator iterator;
+  for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    dbgs() << "  " << PrintReg(I->first, HRI) << ":\n";
+    const IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      dbgs() << "    " << PrintIFR(LL[i].first, HRI) << ", "
+             << PrintRegSet(LL[i].second, HRI) << '\n';
+  }
+}
+
+
+void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
+  unsigned Index = 0;
+  typedef MachineFunction::const_iterator mf_iterator;
+  for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) {
+    const MachineBasicBlock &B = *A;
+    if (!CMS->BT.reached(&B))
+      continue;
+    typedef MachineBasicBlock::const_iterator mb_iterator;
+    for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) {
+      const MachineInstr *MI = &*I;
+      for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isDef()) {
+          unsigned R = MO.getReg();
+          assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
+          if (TargetRegisterInfo::isVirtualRegister(R))
+            RO.insert(std::make_pair(R, Index++));
+        }
+      }
+    }
+  }
+  // Since some virtual registers may have had their def and uses eliminated,
+  // they are no longer referenced in the code, and so they will not appear
+  // in the map.
+}
+
+
+void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
+      RegisterOrdering &RO) const {
+  // Create a vector of all virtual registers (collect them from the base
+  // ordering RB), and then sort it using the RegisterCell comparator.
+  BitValueOrdering BVO(RB);
+  RegisterCellLexCompare LexCmp(BVO, *CMS);
+  typedef std::vector<unsigned> SortableVectorType;
+  SortableVectorType VRs;
+  for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
+    VRs.push_back(I->first);
+  std::sort(VRs.begin(), VRs.end(), LexCmp);
+  // Transfer the results to the outgoing register ordering.
+  for (unsigned i = 0, n = VRs.size(); i < n; ++i)
+    RO.insert(std::make_pair(VRs[i], i));
+}
+
+
+inline bool HexagonGenInsert::isIntClass(const TargetRegisterClass *RC) const {
+  return RC == &Hexagon::IntRegsRegClass || RC == &Hexagon::DoubleRegsRegClass;
+}
+
+
+bool HexagonGenInsert::isConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(0) || BV.is(1))
+      continue;
+    return false;
+  }
+  return true;
+}
+
+
+bool HexagonGenInsert::isSmallConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  if (W > 64)
+    return false;
+  uint64_t V = 0, B = 1;
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(1))
+      V |= B;
+    else if (!BV.is(0))
+      return false;
+    B <<= 1;
+  }
+
+  // For 32-bit registers, consider: Rd = #s16.
+  if (W == 32)
+    return isInt<16>(V);
+
+  // For 64-bit registers, it's Rdd = #s8 or Rdd = combine(#s8,#s8)
+  return isInt<8>(Lo_32(V)) && isInt<8>(Hi_32(V));
+}
+
+
+bool HexagonGenInsert::isValidInsertForm(unsigned DstR, unsigned SrcR,
+      unsigned InsR, uint16_t L, uint16_t S) const {
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstR);
+  const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcR);
+  const TargetRegisterClass *InsRC = MRI->getRegClass(InsR);
+  // Only integer (32-/64-bit) register classes.
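+  // E.g. all three registers in IntRegs (or all three in DoubleRegs) is
+  // fine; a DoubleRegs destination with an IntRegs InsR is rejected below;
+  // an IntRegs destination with a DoubleRegs InsR is allowed only when the
+  // inserted field does not straddle the 32-bit word boundary.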
+ if (!isIntClass(DstRC) || !isIntClass(SrcRC) || !isIntClass(InsRC)) + return false; + // The "source" register must be of the same class as DstR. + if (DstRC != SrcRC) + return false; + if (DstRC == InsRC) + return true; + // A 64-bit register can only be generated from other 64-bit registers. + if (DstRC == &Hexagon::DoubleRegsRegClass) + return false; + // Otherwise, the L and S cannot span 32-bit word boundary. + if (S < 32 && S+L > 32) + return false; + return true; +} + + +bool HexagonGenInsert::findSelfReference(unsigned VR) const { + const BitTracker::RegisterCell &RC = CMS->lookup(VR); + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == VR) + return true; + } + return false; +} + + +bool HexagonGenInsert::findNonSelfReference(unsigned VR) const { + BitTracker::RegisterCell RC = CMS->lookup(VR); + for (uint16_t i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != VR) + return true; + } + return false; +} + + +void HexagonGenInsert::getInstrDefs(const MachineInstr *MI, + RegisterSet &Defs) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Defs.insert(R); + } +} + + +void HexagonGenInsert::getInstrUses(const MachineInstr *MI, + RegisterSet &Uses) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned R = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Uses.insert(R); + } +} + + +unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB, + const MachineBasicBlock *ToB, const UnsignedMap &RPO, + PairMapType &M) const { + // Forward distance from the end of a block to the beginning of it does + // not make sense. This function should not be called with FromB == ToB. + assert(FromB != ToB); + + unsigned FromN = FromB->getNumber(), ToN = ToB->getNumber(); + // If we have already computed it, return the cached result. + PairMapType::iterator F = M.find(std::make_pair(FromN, ToN)); + if (F != M.end()) + return F->second; + unsigned ToRPO = RPO.lookup(ToN); + + unsigned MaxD = 0; + typedef MachineBasicBlock::const_pred_iterator pred_iterator; + for (pred_iterator I = ToB->pred_begin(), E = ToB->pred_end(); I != E; ++I) { + const MachineBasicBlock *PB = *I; + // Skip back edges. Also, if FromB is a predecessor of ToB, the distance + // along that path will be 0, and we don't need to do any calculations + // on it. + if (PB == FromB || RPO.lookup(PB->getNumber()) >= ToRPO) + continue; + unsigned D = PB->size() + distance(FromB, PB, RPO, M); + if (D > MaxD) + MaxD = D; + } + + // Memoize the result for later lookup. 
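+  // E.g. (illustrative) for a diamond B1->B2->B4, B1->B3->B4, the result
+  // for (B1,B4) is max(|B2|, |B3|): each predecessor contributes its own
+  // size plus the (memoized) distance from B1 to it, and back edges are
+  // ignored via the RPO comparison above.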
+ M.insert(std::make_pair(std::make_pair(FromN, ToN), MaxD)); + return MaxD; +} + + +unsigned HexagonGenInsert::distance(MachineBasicBlock::const_iterator FromI, + MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO, + PairMapType &M) const { + const MachineBasicBlock *FB = FromI->getParent(), *TB = ToI->getParent(); + if (FB == TB) + return std::distance(FromI, ToI); + unsigned D1 = std::distance(TB->begin(), ToI); + unsigned D2 = distance(FB, TB, RPO, M); + unsigned D3 = std::distance(FromI, FB->end()); + return D1+D2+D3; +} + + +bool HexagonGenInsert::findRecordInsertForms(unsigned VR, + OrderedRegisterList &AVs) { + if (isDebug()) { + dbgs() << LLVM_FUNCTION_NAME << ": " << PrintReg(VR, HRI) + << " AVs: " << PrintORL(AVs, HRI) << "\n"; + } + if (AVs.size() == 0) + return false; + + typedef OrderedRegisterList::iterator iterator; + BitValueOrdering BVO(BaseOrd); + const BitTracker::RegisterCell &RC = CMS->lookup(VR); + uint16_t W = RC.width(); + + typedef std::pair<unsigned,uint16_t> RSRecord; // (reg,shift) + typedef std::vector<RSRecord> RSListType; + // Have a map, with key being the matching prefix length, and the value + // being the list of pairs (R,S), where R's prefix matches VR at S. + // (DenseMap<uint16_t,RSListType> fails to instantiate.) + typedef DenseMap<unsigned,RSListType> LRSMapType; + LRSMapType LM; + + // Conceptually, rotate the cell RC right (i.e. towards the LSB) by S, + // and find matching prefixes from AVs with the rotated RC. Such a prefix + // would match a string of bits (of length L) in RC starting at S. + for (uint16_t S = 0; S < W; ++S) { + iterator B = AVs.begin(), E = AVs.end(); + // The registers in AVs are ordered according to the lexical order of + // the corresponding register cells. This means that the range of regis- + // ters in AVs that match a prefix of length L+1 will be contained in + // the range that matches a prefix of length L. This means that we can + // keep narrowing the search space as the prefix length goes up. This + // helps reduce the overall complexity of the search. + uint16_t L; + for (L = 0; L < W-S; ++L) { + // Compare against VR's bits starting at S, which emulates rotation + // of VR by S. + RegisterCellBitCompareSel RCB(VR, S+L, L, BVO, *CMS); + iterator NewB = std::lower_bound(B, E, VR, RCB); + iterator NewE = std::upper_bound(NewB, E, VR, RCB); + // For the registers that are eliminated from the next range, L is + // the longest prefix matching VR at position S (their prefixes + // differ from VR at S+L). If L>0, record this information for later + // use. + if (L > 0) { + for (iterator I = B; I != NewB; ++I) + LM[L].push_back(std::make_pair(*I, S)); + for (iterator I = NewE; I != E; ++I) + LM[L].push_back(std::make_pair(*I, S)); + } + B = NewB, E = NewE; + if (B == E) + break; + } + // Record the final register range. If this range is non-empty, then + // L=W-S. + assert(B == E || L == W-S); + if (B != E) { + for (iterator I = B; I != E; ++I) + LM[L].push_back(std::make_pair(*I, S)); + // If B!=E, then we found a range of registers whose prefixes cover the + // rest of VR from position S. There is no need to further advance S. 
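+      // E.g. (illustrative) for a 32-bit VR and S=24: every register left
+      // in [B,E) has a cell whose low 8 bits equal bits 24..31 of VR's
+      // cell, and is recorded under L=8 with S=24.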
+ break; + } + } + + if (isDebug()) { + dbgs() << "Prefixes matching register " << PrintReg(VR, HRI) << "\n"; + for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) { + dbgs() << " L=" << I->first << ':'; + const RSListType &LL = I->second; + for (unsigned i = 0, n = LL.size(); i < n; ++i) + dbgs() << " (" << PrintReg(LL[i].first, HRI) << ",@" + << LL[i].second << ')'; + dbgs() << '\n'; + } + } + + + bool Recorded = false; + + for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) { + unsigned SrcR = *I; + int FDi = -1, LDi = -1; // First/last different bit. + const BitTracker::RegisterCell &AC = CMS->lookup(SrcR); + uint16_t AW = AC.width(); + for (uint16_t i = 0, w = std::min(W, AW); i < w; ++i) { + if (RC[i] == AC[i]) + continue; + if (FDi == -1) + FDi = i; + LDi = i; + } + if (FDi == -1) + continue; // TODO (future): Record identical registers. + // Look for a register whose prefix could patch the range [FD..LD] + // where VR and SrcR differ. + uint16_t FD = FDi, LD = LDi; // Switch to unsigned type. + uint16_t MinL = LD-FD+1; + for (uint16_t L = MinL; L < W; ++L) { + LRSMapType::iterator F = LM.find(L); + if (F == LM.end()) + continue; + RSListType &LL = F->second; + for (unsigned i = 0, n = LL.size(); i < n; ++i) { + uint16_t S = LL[i].second; + // MinL is the minimum length of the prefix. Any length above MinL + // allows some flexibility as to where the prefix can start: + // given the extra length EL=L-MinL, the prefix must start between + // max(0,FD-EL) and FD. + if (S > FD) // Starts too late. + continue; + uint16_t EL = L-MinL; + uint16_t LowS = (EL < FD) ? FD-EL : 0; + if (S < LowS) // Starts too early. + continue; + unsigned InsR = LL[i].first; + if (!isValidInsertForm(VR, SrcR, InsR, L, S)) + continue; + if (isDebug()) { + dbgs() << PrintReg(VR, HRI) << " = insert(" << PrintReg(SrcR, HRI) + << ',' << PrintReg(InsR, HRI) << ",#" << L << ",#" + << S << ")\n"; + } + IFRecordWithRegSet RR(IFRecord(SrcR, InsR, L, S), RegisterSet()); + IFMap[VR].push_back(RR); + Recorded = true; + } + } + } + + return Recorded; +} + + +void HexagonGenInsert::collectInBlock(MachineBasicBlock *B, + OrderedRegisterList &AVs) { + if (isDebug()) + dbgs() << "visiting block BB#" << B->getNumber() << "\n"; + + // First, check if this block is reachable at all. If not, the bit tracker + // will not have any information about registers in it. + if (!CMS->BT.reached(B)) + return; + + bool DoConst = OptConst; + // Keep a separate set of registers defined in this block, so that we + // can remove them from the list of available registers once all DT + // successors have been processed. + RegisterSet BlockDefs, InsDefs; + for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + MachineInstr *MI = &*I; + InsDefs.clear(); + getInstrDefs(MI, InsDefs); + // Leave those alone. They are more transparent than "insert". + bool Skip = MI->isCopy() || MI->isRegSequence(); + + if (!Skip) { + // Visit all defined registers, and attempt to find the corresponding + // "insert" representations. + for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR)) { + // Do not collect registers that are known to be compile-time cons- + // tants, unless requested. + if (!DoConst && isConstant(VR)) + continue; + // If VR's cell contains a reference to VR, then VR cannot be defined + // via "insert". If VR is a constant that can be generated in a single + // instruction (without constant extenders), generating it via insert + // makes no sense. 
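+        // E.g. a 32-bit cell evaluating to the constant 200 fits a single
+        // "Rd = #s16" transfer-immediate, so isSmallConstant() filters it
+        // out even when -insert-const asks for constants to be collected.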
+        if (findSelfReference(VR) || isSmallConstant(VR))
+          continue;
+
+        findRecordInsertForms(VR, AVs);
+      }
+    }
+
+    // Insert the defined registers into the list of available registers
+    // after they have been processed.
+    for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR))
+      AVs.insert(VR);
+    BlockDefs.insert(InsDefs);
+  }
+
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType ChildIter;
+  for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    collectInBlock(SB, AVs);
+  }
+
+  for (unsigned VR = BlockDefs.find_first(); VR; VR = BlockDefs.find_next(VR))
+    AVs.remove(VR);
+}
+
+
+void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
+      RegisterSet &RMs) const {
+  // For a given register VR and an insert form, find the registers that are
+  // used by the current definition of VR, and which would no longer be
+  // needed for it after the definition of VR is replaced with the insert
+  // form. These are the registers that could potentially become dead.
+  RegisterSet Regs[2];
+
+  unsigned S = 0;  // Register set selector.
+  Regs[S].insert(VR);
+
+  while (!Regs[S].empty()) {
+    // Breadth-first search.
+    unsigned OtherS = 1-S;
+    Regs[OtherS].clear();
+    for (unsigned R = Regs[S].find_first(); R; R = Regs[S].find_next(R)) {
+      Regs[S].remove(R);
+      if (R == IF.SrcR || R == IF.InsR)
+        continue;
+      // Check if a given register has bits that are references to any other
+      // registers. This is to detect situations where the instruction that
+      // defines register R takes register Q as an operand, but R itself does
+      // not contain any bits from Q. Loads are examples of how this could
+      // happen:
+      //   R = load Q
+      // In this case (assuming we do not have any knowledge about the loaded
+      // value), we must not treat R as a "conveyance" of the bits from Q.
+      // (The information in BT about R's bits would have them as constants,
+      // in case of zero-extending loads, or refs to R.)
+      if (!findNonSelfReference(R))
+        continue;
+      RMs.insert(R);
+      const MachineInstr *DefI = MRI->getVRegDef(R);
+      assert(DefI);
+      // Do not iterate past PHI nodes to avoid infinite loops. This can
+      // make the final set a bit less accurate, but the removable register
+      // sets are an approximation anyway.
+      if (DefI->isPHI())
+        continue;
+      getInstrUses(DefI, Regs[OtherS]);
+    }
+    S = OtherS;
+  }
+  // The register VR is added to the list as a side-effect of the algorithm,
+  // but it is not "potentially removable". A potentially removable register
+  // is one that may become unused (dead) after conversion to the insert form
+  // IF, and obviously VR (or its replacement) will not become dead by apply-
+  // ing IF.
+  RMs.remove(VR);
+}
+
+
+void HexagonGenInsert::computeRemovableRegisters() {
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      findRemovableRegisters(I->first, LL[i].first, LL[i].second);
+  }
+}
+
+
+void HexagonGenInsert::pruneEmptyLists() {
+  // Remove all entries from the map where the register has no insert forms
+  // associated with it.
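+  // The matching iterators are collected first and erased afterwards, so
+  // that the map is not modified while it is being traversed.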
+ typedef SmallVector<IFMapType::iterator,16> IterListType; + IterListType Prune; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + if (I->second.size() == 0) + Prune.push_back(I); + } + for (unsigned i = 0, n = Prune.size(); i < n; ++i) + IFMap.erase(Prune[i]); +} + + +void HexagonGenInsert::pruneCoveredSets(unsigned VR) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + + // First, examine the IF candidates for register VR whose removable-regis- + // ter sets are empty. This means that a given candidate will not help eli- + // minate any registers, but since "insert" is not a constant-extendable + // instruction, using such a candidate may reduce code size if the defini- + // tion of VR is constant-extended. + // If there exists a candidate with a non-empty set, the ones with empty + // sets will not be used and can be removed. + MachineInstr *DefVR = MRI->getVRegDef(VR); + bool DefEx = HII->isConstExtended(DefVR); + bool HasNE = false; + for (unsigned i = 0, n = LL.size(); i < n; ++i) { + if (LL[i].second.empty()) + continue; + HasNE = true; + break; + } + if (!DefEx || HasNE) { + // The definition of VR is not constant-extended, or there is a candidate + // with a non-empty set. Remove all candidates with empty sets. + auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool { + return IR.second.empty(); + }; + auto End = std::remove_if(LL.begin(), LL.end(), IsEmpty); + if (End != LL.end()) + LL.erase(End, LL.end()); + } else { + // The definition of VR is constant-extended, and all candidates have + // empty removable-register sets. Pick the maximum candidate, and remove + // all others. The "maximum" does not have any special meaning here, it + // is only so that the candidate that will remain on the list is selec- + // ted deterministically. + IFRecord MaxIF = LL[0].first; + for (unsigned i = 1, n = LL.size(); i < n; ++i) { + // If LL[MaxI] < LL[i], then MaxI = i. + const IFRecord &IF = LL[i].first; + unsigned M0 = BaseOrd[MaxIF.SrcR], M1 = BaseOrd[MaxIF.InsR]; + unsigned R0 = BaseOrd[IF.SrcR], R1 = BaseOrd[IF.InsR]; + if (M0 > R0) + continue; + if (M0 == R0) { + if (M1 > R1) + continue; + if (M1 == R1) { + if (MaxIF.Wdh > IF.Wdh) + continue; + if (MaxIF.Wdh == IF.Wdh && MaxIF.Off >= IF.Off) + continue; + } + } + // MaxIF < IF. + MaxIF = IF; + } + // Remove everything except the maximum candidate. All register sets + // are empty, so no need to preserve anything. + LL.clear(); + LL.push_back(std::make_pair(MaxIF, RegisterSet())); + } + + // Now, remove those whose sets of potentially removable registers are + // contained in another IF candidate for VR. For example, given these + // candidates for vreg45, + // %vreg45: + // (%vreg44,%vreg41,#9,#8), { %vreg42 } + // (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 } + // remove the first one, since it is contained in the second one. + for (unsigned i = 0, n = LL.size(); i < n; ) { + const RegisterSet &RMi = LL[i].second; + unsigned j = 0; + while (j < n) { + if (j != i && LL[j].second.includes(RMi)) + break; + j++; + } + if (j == n) { // RMi not contained in anything else. 
+ i++; + continue; + } + LL.erase(LL.begin()+i); + n = LL.size(); + } +} + + +void HexagonGenInsert::pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, + PairMapType &M) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + unsigned Cutoff = VRegDistCutoff; + const MachineInstr *DefV = MRI->getVRegDef(VR); + + for (unsigned i = LL.size(); i > 0; --i) { + unsigned SR = LL[i-1].first.SrcR, IR = LL[i-1].first.InsR; + const MachineInstr *DefS = MRI->getVRegDef(SR); + const MachineInstr *DefI = MRI->getVRegDef(IR); + unsigned DSV = distance(DefS, DefV, RPO, M); + if (DSV < Cutoff) { + unsigned DIV = distance(DefI, DefV, RPO, M); + if (DIV < Cutoff) + continue; + } + LL.erase(LL.begin()+(i-1)); + } +} + + +void HexagonGenInsert::pruneRegCopies(unsigned VR) { + IFMapType::iterator F = IFMap.find(VR); + assert(F != IFMap.end()); + IFListType &LL = F->second; + + auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool { + return IR.first.Wdh == 32 && (IR.first.Off == 0 || IR.first.Off == 32); + }; + auto End = std::remove_if(LL.begin(), LL.end(), IsCopy); + if (End != LL.end()) + LL.erase(End, LL.end()); +} + + +void HexagonGenInsert::pruneCandidates() { + // Remove candidates that are not beneficial, regardless of the final + // selection method. + // First, remove candidates whose potentially removable set is a subset + // of another candidate's set. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneCoveredSets(I->first); + + UnsignedMap RPO; + typedef ReversePostOrderTraversal<const MachineFunction*> RPOTType; + RPOTType RPOT(MFN); + unsigned RPON = 0; + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) + RPO[(*I)->getNumber()] = RPON++; + + PairMapType Memo; // Memoization map for distance calculation. + // Remove candidates that would use registers defined too far away. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneUsesTooFar(I->first, RPO, Memo); + + pruneEmptyLists(); + + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) + pruneRegCopies(I->first); +} + + +namespace { + // Class for comparing IF candidates for registers that have multiple of + // them. The smaller the candidate, according to this ordering, the better. + // First, compare the number of zeros in the associated potentially remova- + // ble register sets. "Zero" indicates that the register is very likely to + // become dead after this transformation. + // Second, compare "averages", i.e. use-count per size. The lower wins. + // After that, it does not really matter which one is smaller. Resolve + // the tie in some deterministic way. + struct IFOrdering { + IFOrdering(const UnsignedMap &UC, const RegisterOrdering &BO) + : UseC(UC), BaseOrd(BO) {} + bool operator() (const IFRecordWithRegSet &A, + const IFRecordWithRegSet &B) const; + private: + void stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero, + unsigned &Sum) const; + const UnsignedMap &UseC; + const RegisterOrdering &BaseOrd; + }; +} + + +bool IFOrdering::operator() (const IFRecordWithRegSet &A, + const IFRecordWithRegSet &B) const { + unsigned SizeA = 0, ZeroA = 0, SumA = 0; + unsigned SizeB = 0, ZeroB = 0, SumB = 0; + stats(A.second, SizeA, ZeroA, SumA); + stats(B.second, SizeB, ZeroB, SumB); + + // We will pick the minimum element. The more zeros, the better. + if (ZeroA != ZeroB) + return ZeroA > ZeroB; + // Compare SumA/SizeA with SumB/SizeB, lower is better. 
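+  // The quotients are compared by cross-multiplying: for non-zero sizes,
+  // SumA/SizeA < SumB/SizeB  <=>  SumA*SizeB < SumB*SizeA, which avoids
+  // integer division and keeps the comparison exact.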
+ uint64_t AvgA = SumA*SizeB, AvgB = SumB*SizeA; + if (AvgA != AvgB) + return AvgA < AvgB; + + // The sets compare identical so far. Resort to comparing the IF records. + // The actual values don't matter, this is only for determinism. + unsigned OSA = BaseOrd[A.first.SrcR], OSB = BaseOrd[B.first.SrcR]; + if (OSA != OSB) + return OSA < OSB; + unsigned OIA = BaseOrd[A.first.InsR], OIB = BaseOrd[B.first.InsR]; + if (OIA != OIB) + return OIA < OIB; + if (A.first.Wdh != B.first.Wdh) + return A.first.Wdh < B.first.Wdh; + return A.first.Off < B.first.Off; +} + + +void IFOrdering::stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero, + unsigned &Sum) const { + for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) { + UnsignedMap::const_iterator F = UseC.find(R); + assert(F != UseC.end()); + unsigned UC = F->second; + if (UC == 0) + Zero++; + Sum += UC; + Size++; + } +} + + +void HexagonGenInsert::selectCandidates() { + // Some registers may have multiple valid candidates. Pick the best one + // (or decide not to use any). + + // Compute the "removability" measure of R: + // For each potentially removable register R, record the number of regis- + // ters with IF candidates, where R appears in at least one set. + RegisterSet AllRMs; + UnsignedMap UseC, RemC; + IFMapType::iterator End = IFMap.end(); + + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + const IFListType &LL = I->second; + RegisterSet TT; + for (unsigned i = 0, n = LL.size(); i < n; ++i) + TT.insert(LL[i].second); + for (unsigned R = TT.find_first(); R; R = TT.find_next(R)) + RemC[R]++; + AllRMs.insert(TT); + } + + for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) { + typedef MachineRegisterInfo::use_nodbg_iterator use_iterator; + typedef SmallSet<const MachineInstr*,16> InstrSet; + InstrSet UIs; + // Count as the number of instructions in which R is used, not the + // number of operands. + use_iterator E = MRI->use_nodbg_end(); + for (use_iterator I = MRI->use_nodbg_begin(R); I != E; ++I) + UIs.insert(I->getParent()); + unsigned C = UIs.size(); + // Calculate a measure, which is the number of instructions using R, + // minus the "removability" count computed earlier. + unsigned D = RemC[R]; + UseC[R] = (C > D) ? C-D : 0; // doz + } + + + bool SelectAll0 = OptSelectAll0, SelectHas0 = OptSelectHas0; + if (!SelectAll0 && !SelectHas0) + SelectAll0 = true; + + // The smaller the number UseC for a given register R, the "less used" + // R is aside from the opportunities for removal offered by generating + // "insert" instructions. + // Iterate over the IF map, and for those registers that have multiple + // candidates, pick the minimum one according to IFOrdering. + IFOrdering IFO(UseC, BaseOrd); + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + IFListType &LL = I->second; + if (LL.empty()) + continue; + // Get the minimum element, remember it and clear the list. If the + // element found is adequate, we will put it back on the list, other- + // wise the list will remain empty, and the entry for this register + // will be removed (i.e. this register will not be replaced by insert). + IFListType::iterator MinI = std::min_element(LL.begin(), LL.end(), IFO); + assert(MinI != LL.end()); + IFRecordWithRegSet M = *MinI; + LL.clear(); + + // We want to make sure that this replacement will have a chance to be + // beneficial, and that means that we want to have indication that some + // register will be removed. 
The most likely registers to be eliminated + // are the use operands in the definition of I->first. Accept/reject a + // candidate based on how many of its uses it can potentially eliminate. + + RegisterSet Us; + const MachineInstr *DefI = MRI->getVRegDef(I->first); + getInstrUses(DefI, Us); + bool Accept = false; + + if (SelectAll0) { + bool All0 = true; + for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) { + if (UseC[R] == 0) + continue; + All0 = false; + break; + } + Accept = All0; + } else if (SelectHas0) { + bool Has0 = false; + for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) { + if (UseC[R] != 0) + continue; + Has0 = true; + break; + } + Accept = Has0; + } + if (Accept) + LL.push_back(M); + } + + // Remove candidates that add uses of removable registers, unless the + // removable registers are among replacement candidates. + // Recompute the removable registers, since some candidates may have + // been eliminated. + AllRMs.clear(); + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + const IFListType &LL = I->second; + if (LL.size() > 0) + AllRMs.insert(LL[0].second); + } + for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) { + IFListType &LL = I->second; + if (LL.size() == 0) + continue; + unsigned SR = LL[0].first.SrcR, IR = LL[0].first.InsR; + if (AllRMs[SR] || AllRMs[IR]) + LL.clear(); + } + + pruneEmptyLists(); +} + + +bool HexagonGenInsert::generateInserts() { + // Create a new register for each one from IFMap, and store them in the + // map. + UnsignedMap RegMap; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + unsigned VR = I->first; + const TargetRegisterClass *RC = MRI->getRegClass(VR); + unsigned NewVR = MRI->createVirtualRegister(RC); + RegMap[VR] = NewVR; + } + + // We can generate the "insert" instructions using potentially stale re- + // gisters: SrcR and InsR for a given VR may be among other registers that + // are also replaced. This is fine, we will do the mass "rauw" a bit later. + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + MachineInstr *MI = MRI->getVRegDef(I->first); + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned NewR = RegMap[I->first]; + bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass; + const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert) + : HII->get(Hexagon::S2_insertp); + IFRecord IF = I->second[0].first; + unsigned Wdh = IF.Wdh, Off = IF.Off; + unsigned InsS = 0; + if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) { + InsS = Hexagon::subreg_loreg; + if (Off >= 32) { + InsS = Hexagon::subreg_hireg; + Off -= 32; + } + } + // Advance to the proper location for inserting instructions. This could + // be B.end(). 
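// Illustrative aside, not part of this patch: the S2_insert/S2_insertp
// built below performs a bitfield deposit -- the low Wdh bits of InsR
// replace bits [Off, Off+Wdh) of SrcR. A minimal C++ model of the 32-bit
// form (helper name invented), assuming 0 < Wdh and Off + Wdh <= 32:
//
//   uint32_t insert32(uint32_t SrcR, uint32_t InsR,
//                     unsigned Wdh, unsigned Off) {
//     uint32_t Mask = (Wdh == 32) ? ~0u : ((1u << Wdh) - 1);
//     return (SrcR & ~(Mask << Off)) | ((InsR & Mask) << Off);
//   }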
+ MachineBasicBlock::iterator At = MI; + if (MI->isPHI()) + At = B.getFirstNonPHI(); + + BuildMI(B, At, DL, D, NewR) + .addReg(IF.SrcR) + .addReg(IF.InsR, 0, InsS) + .addImm(Wdh) + .addImm(Off); + + MRI->clearKillFlags(IF.SrcR); + MRI->clearKillFlags(IF.InsR); + } + + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + MachineInstr *DefI = MRI->getVRegDef(I->first); + MRI->replaceRegWith(I->first, RegMap[I->first]); + DefI->eraseFromParent(); + } + + return true; +} + + +bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { + bool Changed = false; + typedef GraphTraits<MachineDomTreeNode*> GTN; + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + Changed |= removeDeadCode(*I); + + MachineBasicBlock *B = N->getBlock(); + std::vector<MachineInstr*> Instrs; + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) + Instrs.push_back(&*I); + + for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) { + MachineInstr *MI = *I; + unsigned Opc = MI->getOpcode(); + // Do not touch lifetime markers. This is why the target-independent DCE + // cannot be used. + if (Opc == TargetOpcode::LIFETIME_START || + Opc == TargetOpcode::LIFETIME_END) + continue; + bool Store = false; + if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store)) + continue; + + bool AllDead = true; + SmallVector<unsigned,2> Regs; + for (ConstMIOperands Op(MI); Op.isValid(); ++Op) { + if (!Op->isReg() || !Op->isDef()) + continue; + unsigned R = Op->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R) || + !MRI->use_nodbg_empty(R)) { + AllDead = false; + break; + } + Regs.push_back(R); + } + if (!AllDead) + continue; + + B->erase(MI); + for (unsigned I = 0, N = Regs.size(); I != N; ++I) + MRI->markUsesInDebugValueAsUndef(Regs[I]); + Changed = true; + } + + return Changed; +} + + +bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { + bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail; + bool Changed = false; + TimerGroup __G("hexinsert"); + NamedRegionTimer __T("hexinsert", Timing && !TimingDetail); + + // Sanity check: one, but not both. + assert(!OptSelectAll0 || !OptSelectHas0); + + IFMap.clear(); + BaseOrd.clear(); + CellOrd.clear(); + + const auto &ST = MF.getSubtarget<HexagonSubtarget>(); + HII = ST.getInstrInfo(); + HRI = ST.getRegisterInfo(); + MFN = &MF; + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + + // Clean up before any further processing, so that dead code does not + // get used in a newly generated "insert" instruction. Have a custom + // version of DCE that preserves lifetime markers. Without it, merging + // of stack objects can fail to recognize and merge disjoint objects + // leading to unnecessary stack growth. + Changed |= removeDeadCode(MDT->getRootNode()); + + const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); + BitTracker BTLoc(HE, MF); + BTLoc.trace(isDebug()); + BTLoc.run(); + CellMapShadow MS(BTLoc); + CMS = &MS; + + buildOrderingMF(BaseOrd); + buildOrderingBT(BaseOrd, CellOrd); + + if (isDebug()) { + dbgs() << "Cell ordering:\n"; + for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end(); + I != E; ++I) { + unsigned VR = I->first, Pos = I->second; + dbgs() << PrintReg(VR, HRI) << " -> " << Pos << "\n"; + } + } + + // Collect candidates for conversion into the insert forms. 
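// Overview aside: the driver that follows runs the pass's phases in
// sequence -- collect candidates per block (collectInBlock), discard ones
// that cannot pay off (pruneCandidates), keep at most one candidate per
// register (selectCandidates), then rewrite the code (generateInserts) --
// with each phase wrapped in a NamedRegionTimer so the optional timing
// output can be attributed per phase.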
+ MachineBasicBlock *RootB = MDT->getRoot(); + OrderedRegisterList AvailR(CellOrd); + + { + NamedRegionTimer _T("collection", "hexinsert", TimingDetail); + collectInBlock(RootB, AvailR); + // Complete the information gathered in IFMap. + computeRemovableRegisters(); + } + + if (isDebug()) { + dbgs() << "Candidates after collection:\n"; + dump_map(); + } + + if (IFMap.empty()) + return false; + + { + NamedRegionTimer _T("pruning", "hexinsert", TimingDetail); + pruneCandidates(); + } + + if (isDebug()) { + dbgs() << "Candidates after pruning:\n"; + dump_map(); + } + + if (IFMap.empty()) + return false; + + { + NamedRegionTimer _T("selection", "hexinsert", TimingDetail); + selectCandidates(); + } + + if (isDebug()) { + dbgs() << "Candidates after selection:\n"; + dump_map(); + } + + // Filter out vregs beyond the cutoff. + if (VRegIndexCutoff.getPosition()) { + unsigned Cutoff = VRegIndexCutoff; + typedef SmallVector<IFMapType::iterator,16> IterListType; + IterListType Out; + for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) { + unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first); + if (Idx >= Cutoff) + Out.push_back(I); + } + for (unsigned i = 0, n = Out.size(); i < n; ++i) + IFMap.erase(Out[i]); + } + + { + NamedRegionTimer _T("generation", "hexinsert", TimingDetail); + Changed = generateInserts(); + } + + return Changed; +} + + +FunctionPass *llvm::createHexagonGenInsert() { + return new HexagonGenInsert(); +} + + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert", + "Hexagon generate \"insert\" instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert", + "Hexagon generate \"insert\" instructions", false, false) diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp new file mode 100644 index 000000000000..6905c4f6d125 --- /dev/null +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -0,0 +1,525 @@ +//===--- HexagonGenPredicate.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "gen-pred" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "HexagonTargetMachine.h" + +#include <functional> +#include <queue> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + void initializeHexagonGenPredicatePass(PassRegistry& Registry); + FunctionPass *createHexagonGenPredicate(); +} + +namespace { + struct Register { + unsigned R, S; + Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {} + Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {} + bool operator== (const Register &Reg) const { + return R == Reg.R && S == Reg.S; + } + bool operator< (const Register &Reg) const { + return R < Reg.R || (R == Reg.R && S < Reg.S); + } + }; + struct PrintRegister { + PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {} + friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR); + private: + Register Reg; + const TargetRegisterInfo &TRI; + }; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) + LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) { + return OS << PrintReg(PR.Reg.R, &PR.TRI, PR.Reg.S); + } + + class HexagonGenPredicate : public MachineFunctionPass { + public: + static char ID; + HexagonGenPredicate() : MachineFunctionPass(ID), TII(0), TRI(0), MRI(0) { + initializeHexagonGenPredicatePass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon generate predicate operations"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + private: + typedef SetVector<MachineInstr*> VectOfInst; + typedef std::set<Register> SetOfReg; + typedef std::map<Register,Register> RegToRegMap; + + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; + MachineRegisterInfo *MRI; + SetOfReg PredGPRs; + VectOfInst PUsers; + RegToRegMap G2P; + + bool isPredReg(unsigned R); + void collectPredicateGPR(MachineFunction &MF); + void processPredicateGPR(const Register &Reg); + unsigned getPredForm(unsigned Opc); + bool isConvertibleToPredForm(const MachineInstr *MI); + bool isScalarCmp(unsigned Opc); + bool isScalarPred(Register PredReg); + Register getPredRegFor(const Register &Reg); + bool convertToPredForm(MachineInstr *MI); + bool eliminatePredCopies(MachineFunction &MF); + }; + + char HexagonGenPredicate::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred", + "Hexagon generate predicate operations", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", + "Hexagon generate predicate operations", false, false) + +bool HexagonGenPredicate::isPredReg(unsigned R) { + if (!TargetRegisterInfo::isVirtualRegister(R)) + return false; + const TargetRegisterClass *RC = 
MRI->getRegClass(R); + return RC == &Hexagon::PredRegsRegClass; +} + + +unsigned HexagonGenPredicate::getPredForm(unsigned Opc) { + using namespace Hexagon; + + switch (Opc) { + case A2_and: + case A2_andp: + return C2_and; + case A4_andn: + case A4_andnp: + return C2_andn; + case M4_and_and: + return C4_and_and; + case M4_and_andn: + return C4_and_andn; + case M4_and_or: + return C4_and_or; + + case A2_or: + case A2_orp: + return C2_or; + case A4_orn: + case A4_ornp: + return C2_orn; + case M4_or_and: + return C4_or_and; + case M4_or_andn: + return C4_or_andn; + case M4_or_or: + return C4_or_or; + + case A2_xor: + case A2_xorp: + return C2_xor; + + case C2_tfrrp: + return COPY; + } + // The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here + // to denote "none", but we need to make sure that none of the valid opcodes + // that we return will ever be 0. + assert(PHI == 0 && "Use different value for <none>"); + return 0; +} + + +bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + if (getPredForm(Opc) != 0) + return true; + + // Comparisons against 0 are also convertible. This does not apply to + // A4_rcmpeqi or A4_rcmpneqi, since they produce values 0 or 1, which + // may not match the value that the predicate register would have if + // it was converted to a predicate form. + switch (Opc) { + case Hexagon::C2_cmpeqi: + case Hexagon::C4_cmpneqi: + if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) + return true; + break; + } + return false; +} + + +void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) { + for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { + MachineBasicBlock &B = *A; + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + MachineInstr *MI = &*I; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::C2_tfrpr: + case TargetOpcode::COPY: + if (isPredReg(MI->getOperand(1).getReg())) { + Register RD = MI->getOperand(0); + if (TargetRegisterInfo::isVirtualRegister(RD.R)) + PredGPRs.insert(RD); + } + break; + } + } + } +} + + +void HexagonGenPredicate::processPredicateGPR(const Register &Reg) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " + << PrintReg(Reg.R, TRI, Reg.S) << "\n"); + typedef MachineRegisterInfo::use_iterator use_iterator; + use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end(); + if (I == E) { + DEBUG(dbgs() << "Dead reg: " << PrintReg(Reg.R, TRI, Reg.S) << '\n'); + MachineInstr *DefI = MRI->getVRegDef(Reg.R); + DefI->eraseFromParent(); + return; + } + + for (; I != E; ++I) { + MachineInstr *UseI = I->getParent(); + if (isConvertibleToPredForm(UseI)) + PUsers.insert(UseI); + } +} + + +Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { + // Create a predicate register for a given Reg. The newly created register + // will have its value copied from Reg, so that it can be later used as + // an operand in other instructions. 
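// Schematic example (virtual register names invented): if %vreg7 is
// defined by
//   %vreg7 = C2_tfrpr %vreg3      ; predicate -> GPR transfer
// the function records and returns %vreg3 directly. If %vreg7 instead
// comes from a convertible instruction such as
//   %vreg7 = A2_and %vreg1, %vreg2
// the definition is left untouched (it may itself be converted later) and
// a fresh predicate register is created right after it:
//   %vreg9 = COPY %vreg7
// with %vreg7 -> %vreg9 cached in G2P so the copy is emitted only once.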
+  assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+  RegToRegMap::iterator F = G2P.find(Reg);
+  if (F != G2P.end())
+    return F->second;
+
+  DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << PrintRegister(Reg, *TRI));
+  MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+  assert(DefI);
+  unsigned Opc = DefI->getOpcode();
+  if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) {
+    assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
+    Register PR = DefI->getOperand(1);
+    G2P.insert(std::make_pair(Reg, PR));
+    DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+    return PR;
+  }
+
+  MachineBasicBlock &B = *DefI->getParent();
+  DebugLoc DL = DefI->getDebugLoc();
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  unsigned NewPR = MRI->createVirtualRegister(PredRC);
+
+  // For convertible instructions, do not modify them, so that they can
+  // be converted later. Generate a copy from Reg to NewPR.
+  if (isConvertibleToPredForm(DefI)) {
+    MachineBasicBlock::iterator DefIt = DefI;
+    BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
+      .addReg(Reg.R, 0, Reg.S);
+    G2P.insert(std::make_pair(Reg, Register(NewPR)));
+    DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+    return Register(NewPR);
+  }
+
+  llvm_unreachable("Invalid argument");
+}
+
+
+bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgei:
+    case Hexagon::C2_cmpgeui:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      return true;
+  }
+  return false;
+}
+
+
+bool HexagonGenPredicate::isScalarPred(Register PredReg) {
+  std::queue<Register> WorkQ;
+  WorkQ.push(PredReg);
+
+  while (!WorkQ.empty()) {
+    Register PR = WorkQ.front();
+    WorkQ.pop();
+    const MachineInstr *DefI = MRI->getVRegDef(PR.R);
+    if (!DefI)
+      return false;
+    unsigned DefOpc = DefI->getOpcode();
+    switch (DefOpc) {
+      case TargetOpcode::COPY: {
+        const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+        if (MRI->getRegClass(PR.R) != PredRC)
+          return false;
+        // If it is a copy between two predicate registers, fall through.
+      }
+      case Hexagon::C2_and:
+      case Hexagon::C2_andn:
+      case Hexagon::C4_and_and:
+      case Hexagon::C4_and_andn:
+      case Hexagon::C4_and_or:
+      case Hexagon::C2_or:
+      case Hexagon::C2_orn:
+      case Hexagon::C4_or_and:
+      case Hexagon::C4_or_andn:
+      case Hexagon::C4_or_or:
+      case Hexagon::C4_or_orn:
+      case Hexagon::C2_xor:
+        // Add operands to the queue.
+        for (ConstMIOperands Mo(DefI); Mo.isValid(); ++Mo)
+          if (Mo->isReg() && Mo->isUse())
+            WorkQ.push(Register(Mo->getReg()));
+        break;
+
+      // All non-vector compares are ok, everything else is bad.
+ default: + return isScalarCmp(DefOpc); + } + } + + return true; +} + + +bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << ": " << MI << " " << *MI); + + unsigned Opc = MI->getOpcode(); + assert(isConvertibleToPredForm(MI)); + unsigned NumOps = MI->getNumOperands(); + for (unsigned i = 0; i < NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + Register Reg(MO); + if (Reg.S && Reg.S != Hexagon::subreg_loreg) + return false; + if (!PredGPRs.count(Reg)) + return false; + } + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned NewOpc = getPredForm(Opc); + // Special case for comparisons against 0. + if (NewOpc == 0) { + switch (Opc) { + case Hexagon::C2_cmpeqi: + NewOpc = Hexagon::C2_not; + break; + case Hexagon::C4_cmpneqi: + NewOpc = TargetOpcode::COPY; + break; + default: + return false; + } + + // If it's a scalar predicate register, then all bits in it are + // the same. Otherwise, to determine whether all bits are 0 or not + // we would need to use any8. + Register PR = getPredRegFor(MI->getOperand(1)); + if (!isScalarPred(PR)) + return false; + // This will skip the immediate argument when creating the predicate + // version instruction. + NumOps = 2; + } + + // Some sanity: check that def is in operand #0. + MachineOperand &Op0 = MI->getOperand(0); + assert(Op0.isDef()); + Register OutR(Op0); + + // Don't use getPredRegFor, since it will create an association between + // the argument and a created predicate register (i.e. it will insert a + // copy if a new predicate register is created). + const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; + Register NewPR = MRI->createVirtualRegister(PredRC); + MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R); + + // Add predicate counterparts of the GPRs. + for (unsigned i = 1; i < NumOps; ++i) { + Register GPR = MI->getOperand(i); + Register Pred = getPredRegFor(GPR); + MIB.addReg(Pred.R, 0, Pred.S); + } + DEBUG(dbgs() << "generated: " << *MIB); + + // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR + // with NewGPR. + const TargetRegisterClass *RC = MRI->getRegClass(OutR.R); + unsigned NewOutR = MRI->createVirtualRegister(RC); + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR) + .addReg(NewPR.R, 0, NewPR.S); + MRI->replaceRegWith(OutR.R, NewOutR); + MI->eraseFromParent(); + + // If the processed instruction was C2_tfrrp (i.e. Rn = Pm; Pk = Rn), + // then the output will be a predicate register. Do not visit the + // users of it. + if (!isPredReg(NewOutR)) { + Register R(NewOutR); + PredGPRs.insert(R); + processPredicateGPR(R); + } + return true; +} + + +bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) { + DEBUG(dbgs() << LLVM_FUNCTION_NAME << "\n"); + const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass; + bool Changed = false; + VectOfInst Erase; + + // First, replace copies + // IntR = PredR1 + // PredR2 = IntR + // with + // PredR2 = PredR1 + // Such sequences can be generated when a copy-into-pred is generated from + // a gpr register holding a result of a convertible instruction. After + // the convertible instruction is converted, its predicate result will be + // copied back into the original gpr. 
+ + for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) { + MachineBasicBlock &B = *A; + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + if (I->getOpcode() != TargetOpcode::COPY) + continue; + Register DR = I->getOperand(0); + Register SR = I->getOperand(1); + if (!TargetRegisterInfo::isVirtualRegister(DR.R)) + continue; + if (!TargetRegisterInfo::isVirtualRegister(SR.R)) + continue; + if (MRI->getRegClass(DR.R) != PredRC) + continue; + if (MRI->getRegClass(SR.R) != PredRC) + continue; + assert(!DR.S && !SR.S && "Unexpected subregister"); + MRI->replaceRegWith(DR.R, SR.R); + Erase.insert(I); + Changed = true; + } + } + + for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I) + (*I)->eraseFromParent(); + + return Changed; +} + + +bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + PredGPRs.clear(); + PUsers.clear(); + G2P.clear(); + + bool Changed = false; + collectPredicateGPR(MF); + for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I) + processPredicateGPR(*I); + + bool Again; + do { + Again = false; + VectOfInst Processed, Copy; + + typedef VectOfInst::iterator iterator; + Copy = PUsers; + for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) { + MachineInstr *MI = *I; + bool Done = convertToPredForm(MI); + if (Done) { + Processed.insert(MI); + Again = true; + } + } + Changed |= Again; + + auto Done = [Processed] (MachineInstr *MI) -> bool { + return Processed.count(MI); + }; + PUsers.remove_if(Done); + } while (Again); + + Changed |= eliminatePredCopies(MF); + return Changed; +} + + +FunctionPass *llvm::createHexagonGenPredicate() { + return new HexagonGenPredicate(); +} + diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 6e9e69f5a2c7..c739afb70c15 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -459,6 +459,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); // Check for varargs. int NumNamedVarArgParams = -1; @@ -515,8 +516,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains; auto &HRI = *Subtarget.getRegisterInfo(); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), - getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); // Walk the register/memloc assignments, inserting copies/loads. 
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -574,7 +575,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (!isTailCall) { - SDValue C = DAG.getConstant(NumBytes, dl, getPointerTy(), true); + SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true); Chain = DAG.getCALLSEQ_START(Chain, C, dl); } @@ -615,13 +616,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (flag_aligned_memcpy) { const char *MemcpyName = "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; - Callee = DAG.getTargetExternalSymbol(MemcpyName, getPointerTy()); + Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT); flag_aligned_memcpy = false; } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy()); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT); } // Returns a chain & a flag for retval copy to use. @@ -811,8 +812,8 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); } - SDValue JumpTableBase = DAG.getNode(HexagonISD::JT, dl, - getPointerTy(), TargetJT); + SDValue JumpTableBase = DAG.getNode( + HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT); SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, DAG.getConstant(2, dl, MVT::i32)); SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, @@ -1231,16 +1232,17 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); SDLoc dl(Op); - Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); const HexagonTargetObjectFile *TLOF = static_cast<const HexagonTargetObjectFile *>( getTargetMachine().getObjFileLowering()); if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) { - return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result); + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result); } - return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result); + return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result); } // Specifies that for loads and stores VT can be promoted to PromotedLdStVT. @@ -1261,7 +1263,8 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32); SDLoc dl(Op); - return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD); + return DAG.getNode(HexagonISD::CONST32_GP, dl, + getPointerTy(DAG.getDataLayout()), BA_SD); } //===----------------------------------------------------------------------===// @@ -2254,6 +2257,7 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Mark function as containing a call to EH_RETURN. 
HexagonMachineFunctionInfo *FuncInfo = @@ -2262,9 +2266,9 @@ HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { unsigned OffsetReg = Hexagon::R28; - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), - DAG.getRegister(Hexagon::R30, getPointerTy()), - DAG.getIntPtrConstant(4, dl)); + SDValue StoreAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT), + DAG.getIntPtrConstant(4, dl)); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset); @@ -2338,8 +2342,7 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, std::pair<unsigned, const TargetRegisterClass *> HexagonTargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': // R0-R31 @@ -2372,8 +2375,8 @@ bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { /// isLegalAddressingMode - Return true if the addressing mode represented by /// AM is legal for this target, for a load/store of the specified type. -bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Allows a signed-extended 11-bit immediate field. if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1) @@ -2463,3 +2466,45 @@ bool llvm::isPositiveHalfWord(SDNode *N) { return true; } } + +Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + BasicBlock *BB = Builder.GetInsertBlock(); + Module *M = BB->getParent()->getParent(); + Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); + unsigned SZ = Ty->getPrimitiveSizeInBits(); + assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); + Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked + : Intrinsic::hexagon_L4_loadd_locked; + Value *Fn = Intrinsic::getDeclaration(M, IntID); + return Builder.CreateCall(Fn, Addr, "larx"); +} + +/// Perform a store-conditional operation to Addr. Return the status of the +/// store. This should be 0 if the store succeeded, non-zero otherwise. +Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, AtomicOrdering Ord) const { + BasicBlock *BB = Builder.GetInsertBlock(); + Module *M = BB->getParent()->getParent(); + Type *Ty = Val->getType(); + unsigned SZ = Ty->getPrimitiveSizeInBits(); + assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); + Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked + : Intrinsic::hexagon_S4_stored_locked; + Value *Fn = Intrinsic::getDeclaration(M, IntID); + Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx"); + Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), ""); + Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); + return Ext; +} + +bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + // Do not expand loads and stores that don't exceed 64 bits. + return LI->getType()->getPrimitiveSizeInBits() > 64; +} + +bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + // Do not expand loads and stores that don't exceed 64 bits. 
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; +} + diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index b80e8477eb7b..2642abffaddd 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -165,7 +165,8 @@ bool isPositiveHalfWord(SDNode *N); SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - EVT getSetCCResultType(LLVMContext &C, EVT VT) const override { + EVT getSetCCResultType(const DataLayout &, LLVMContext &C, + EVT VT) const override { if (!VT.isVector()) return MVT::i1; else @@ -179,11 +180,10 @@ bool isPositiveHalfWord(SDNode *N); std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "o") return InlineAsm::Constraint_o; else if (ConstraintCode == "v") @@ -198,8 +198,8 @@ bool isPositiveHalfWord(SDNode *N); /// The type may be VoidTy, in which case only return true if the addressing /// mode is legal for a load/store of any legal type. /// TODO: Handle pre/postinc as well. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal @@ -207,6 +207,21 @@ bool isPositiveHalfWord(SDNode *N); /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; + + // Handling of atomic RMW instructions. 
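// Illustrative sketch (not part of this patch): with these hooks, the
// generic AtomicExpand pass -- added to the IR pipeline via
// createAtomicExpandPass() -- lowers a 32-bit "atomicrmw add" into a
// load-locked / store-conditional retry loop. Modeled in C++, where the
// two extern functions are invented stand-ins for the
// llvm.hexagon.L2.loadw.locked and llvm.hexagon.S2.storew.locked
// intrinsics used by emitLoadLinked/emitStoreConditional:
//
//   extern int32_t loadw_locked(volatile int32_t *P);
//   extern int32_t storew_locked(volatile int32_t *P, int32_t V); // 0 = ok
//
//   int32_t atomic_fetch_add_llsc(volatile int32_t *P, int32_t Inc) {
//     int32_t Old, Status;
//     do {
//       Old = loadw_locked(P);               // start the reservation
//       Status = storew_locked(P, Old + Inc);
//     } while (Status != 0);                 // non-zero status => retry
//     return Old;
//   }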
+ bool hasLoadLinkedStoreConditional() const override { + return true; + } + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) + const override { + return AtomicRMWExpansionKind::LLSC; + } }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 8f255a08f534..f6bb4a045438 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -221,7 +221,7 @@ unsigned HexagonRegisterInfo::getRARegister() const { unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) return Hexagon::R30; return Hexagon::R29; @@ -240,7 +240,8 @@ unsigned HexagonRegisterInfo::getStackRegister() const { bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - return MF.getSubtarget().getFrameLowering()->hasFP(MF); + const HexagonFrameLowering *TFI = getFrameLowering(MF); + return TFI->hasFP(MF); } diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index b5db997eb1b8..276cc69eed0f 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -18,12 +18,6 @@ using namespace llvm; bool llvm::flag_aligned_memcpy; -HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() { -} - SDValue HexagonSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h index 8ac2e43f9294..80ac5d7bd9e2 100644 --- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h +++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h @@ -20,8 +20,6 @@ namespace llvm { class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit HexagonSelectionDAGInfo(const DataLayout &DL); - ~HexagonSelectionDAGInfo(); SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index fe6c4f4298b5..cd482b3e3af1 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -74,7 +74,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering() { + FrameLowering() { // Initialize scheduling itinerary for the specified CPU. 
InstrItins = getInstrItineraryForCPU(CPUString); diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index a173a8087832..b50442969a29 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -37,6 +37,18 @@ static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true), + cl::Hidden, cl::desc("Generate \"insert\" instructions")); + +static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true), + cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions")); + +static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true), + cl::Hidden, cl::desc("Generate \"extract\" instructions")); + +static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true), + cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " + "predicate instructions")); /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the @@ -60,23 +72,23 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", createVLIWMachineSched); namespace llvm { - FunctionPass *createHexagonExpandCondsets(); - FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, - CodeGenOpt::Level OptLevel); - FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM); - FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); FunctionPass *createHexagonCFGOptimizer(); - - FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonCommonGEP(); + FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonExpandCondsets(); FunctionPass *createHexagonExpandPredSpillCode(); - FunctionPass *createHexagonHardwareLoops(); - FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonFixupHwLoops(); + FunctionPass *createHexagonGenExtract(); + FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenPredicate(); + FunctionPass *createHexagonHardwareLoops(); + FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, + CodeGenOpt::Level OptLevel); FunctionPass *createHexagonNewValueJump(); - FunctionPass *createHexagonCopyToCombine(); FunctionPass *createHexagonPacketizer(); - FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); + FunctionPass *createHexagonSplitConst32AndConst64(); } // end namespace llvm; /// HexagonTargetMachine ctor - Create an ILP32 architecture model. @@ -122,6 +134,7 @@ public: return createVLIWMachineSched(C); } + void addIRPasses() override; bool addInstSelector() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -134,6 +147,20 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { return new HexagonPassConfig(this, PM); } +void HexagonPassConfig::addIRPasses() { + TargetPassConfig::addIRPasses(); + bool NoOpt = (getOptLevel() == CodeGenOpt::None); + + addPass(createAtomicExpandPass(TM)); + if (!NoOpt) { + if (EnableCommGEP) + addPass(createHexagonCommonGEP()); + // Replace certain combinations of shifts and ands with extracts. 
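// Illustrative aside: the shift-and-mask idiom that HexagonGenExtract
// rewrites corresponds to a single extractu operation on Hexagon. A scalar
// model (invented helper names; width/offset chosen arbitrarily):
//
//   uint32_t field_before(uint32_t X) { return (X >> 5) & 0x3F; }
//   // Models extractu(X, #width, #offset):
//   uint32_t extractu(uint32_t X, unsigned W, unsigned Off) {
//     return (X >> Off) & ((W == 32) ? ~0u : ((1u << W) - 1));
//   }
//   uint32_t field_after(uint32_t X) { return extractu(X, 6, 5); }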
+ if (EnableGenExtract) + addPass(createHexagonGenExtract()); + } +} + bool HexagonPassConfig::addInstSelector() { HexagonTargetMachine &TM = getHexagonTargetMachine(); bool NoOpt = (getOptLevel() == CodeGenOpt::None); @@ -144,8 +171,13 @@ bool HexagonPassConfig::addInstSelector() { addPass(createHexagonISelDag(TM, getOptLevel())); if (!NoOpt) { + // Create logical operations on predicate registers. + if (EnableGenPred) + addPass(createHexagonGenPredicate(), false); addPass(createHexagonPeephole()); printAndVerify("After hexagon peephole pass"); + if (EnableGenInsert) + addPass(createHexagonGenInsert(), false); } return false; diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt index 8259055b3f41..9d288af0214a 100644 --- a/lib/Target/Hexagon/LLVMBuild.txt +++ b/lib/Target/Hexagon/LLVMBuild.txt @@ -39,4 +39,5 @@ required_libraries = SelectionDAG Support Target + TransformUtils add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 83ce0abd835e..53305d85fd80 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -46,7 +46,7 @@ MCInstrInfo *llvm::createHexagonMCInstrInfo() { return X; } -static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitHexagonMCRegisterInfo(X, Hexagon::R0); return X; @@ -54,9 +54,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitHexagonMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createHexagonMCSubtargetInfoImpl(TT, CPU, FS); } namespace { @@ -151,7 +149,8 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt index a8f9b52746ad..3f377631c016 100644 --- a/lib/Target/MSP430/CMakeLists.txt +++ b/lib/Target/MSP430/CMakeLists.txt @@ -18,7 +18,6 @@ add_llvm_target(MSP430CodeGen MSP430RegisterInfo.cpp MSP430Subtarget.cpp MSP430TargetMachine.cpp - MSP430SelectionDAGInfo.cpp MSP430AsmPrinter.cpp MSP430MCInstLower.cpp ) diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index be445c56389a..807d1129b5fc 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -37,7 +37,7 @@ static MCInstrInfo *createMSP430MCInstrInfo() { return X; } -static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitMSP430MCRegisterInfo(X, MSP430::PC); return X; @@ -45,12 +45,11 @@ static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitMSP430MCSubtargetInfo(X, TT, CPU, FS); - return X; + return 
createMSP430MCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createMSP430MCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 5ce5013d898c..8a01334ee2dd 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -254,10 +254,11 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N, AM.Base.Reg = CurDAG->getRegister(0, VT); } - Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, - getTargetLowering()->getPointerTy()) : - AM.Base.Reg; + Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) + ? CurDAG->getTargetFrameIndex( + AM.Base.FrameIndex, + getTargetLowering()->getPointerTy(CurDAG->getDataLayout())) + : AM.Base.Reg; if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N), diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index bc51741a836f..29bc8b33988a 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -213,7 +213,7 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. TargetLowering::ConstraintType -MSP430TargetLowering::getConstraintType(const std::string &Constraint) const { +MSP430TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': @@ -227,8 +227,7 @@ MSP430TargetLowering::getConstraintType(const std::string &Constraint) const { std::pair<unsigned, const TargetRegisterClass *> MSP430TargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { @@ -494,7 +493,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, if (Flags.isByVal()) { int FI = MFI->CreateFixedObject(Flags.getByValSize(), VA.getLocMemOffset(), true); - InVal = DAG.getFrameIndex(FI, getPointerTy()); + InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { // Load the argument to a virtual register unsigned ObjSize = VA.getLocVT().getSizeInBits()/8; @@ -592,10 +591,10 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. 
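// Context note: these MSP430 hunks follow the same tree-wide API change as
// the Hexagon ones above -- TargetLowering::getPointerTy() now takes the
// module's DataLayout instead of reading it from the TargetMachine, so
// lowering code hoists the pointer VT once per function. Schematically:
//
//   // before:
//   SDValue C = DAG.getConstant(NumBytes, dl, getPointerTy(), true);
//   // after:
//   auto PtrVT = getPointerTy(DAG.getDataLayout());
//   SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);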
unsigned NumBytes = CCInfo.getNextStackOffset(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, dl, - getPointerTy(), true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getConstant(NumBytes, dl, PtrVT, true), dl); SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass; SmallVector<SDValue, 12> MemOpChains; @@ -630,12 +629,11 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, assert(VA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, PtrVT); - SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), - StackPtr, - DAG.getIntPtrConstant(VA.getLocMemOffset(), - dl)); + SDValue PtrOff = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getIntPtrConstant(VA.getLocMemOffset(), dl)); SDValue MemOp; ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -700,11 +698,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getConstant(NumBytes, dl, getPointerTy(), - true), - DAG.getConstant(0, dl, getPointerTy(), true), - InFlag, dl); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true), + DAG.getConstant(0, dl, PtrVT, true), InFlag, dl); InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we @@ -788,30 +783,31 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Create the TargetGlobalAddress node, folding in the constant offset. 
- SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), - getPointerTy(), Offset); - return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), - getPointerTy(), Result); + SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), PtrVT, Offset); + return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Result); } SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); - SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT); - return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result); + return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); } SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy()); + SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT); - return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result); + return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result); } static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC, @@ -1024,16 +1020,17 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); + auto PtrVT = getPointerTy(MF.getDataLayout()); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. - uint64_t SlotSize = getDataLayout()->getPointerSize(); + uint64_t SlotSize = MF.getDataLayout().getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, true); FuncInfo->setRAIndex(ReturnAddrIndex); } - return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); + return DAG.getFrameIndex(ReturnAddrIndex, PtrVT); } SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, @@ -1046,21 +1043,21 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(getDataLayout()->getPointerSize(), dl, MVT::i16); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), - FrameAddr, Offset), + DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, false, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo(), false, false, false, 0); } SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -1084,10 +1081,11 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Frame index of first vararg argument - SDValue FrameIndex = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + SDValue FrameIndex = + DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); // Create a store of the frame index to the location operand @@ -1099,9 +1097,9 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op, SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); - return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), - getPointerTy(), Result); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Result); } /// getPostIndexedAddressParts - returns true by value, base pointer and diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 80d3ae175fb1..2d63852c185b 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -72,7 +72,9 @@ namespace llvm { explicit MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI); - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } /// LowerOperation - Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -96,11 +98,10 @@ namespace llvm { SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; TargetLowering::ConstraintType - getConstraintType(const std::string &Constraint) const override; + getConstraintType(StringRef Constraint) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// isTruncateFree - Return true if it's free to truncate a value of type /// Ty1 to type Ty2. e.g. 
On msp430 it's free to truncate a i16 value in diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 614467bcd248..2fb82e535e8d 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -37,7 +37,7 @@ MSP430RegisterInfo::MSP430RegisterInfo() const MCPhysReg* MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(*MF); const Function* F = MF->getFunction(); static const MCPhysReg CalleeSavedRegs[] = { MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7, @@ -73,7 +73,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(MF); // Mark 4 special registers with subregisters as reserved. Reserved.set(MSP430::PCB); @@ -109,7 +109,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const MSP430FrameLowering *TFI = getFrameLowering(MF); DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -156,7 +156,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - + const MSP430FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP; } diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp deleted file mode 100644 index 3897ef684d4d..000000000000 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- MSP430SelectionDAGInfo.cpp - MSP430 SelectionDAG Info -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the MSP430SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "MSP430TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "msp430-selectiondag-info" - -MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() { -} diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h deleted file mode 100644 index 61a6b19111db..000000000000 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MSP430SelectionDAGInfo.h - MSP430 SelectionDAG Info -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the MSP430 subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class MSP430TargetMachine; - -class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit MSP430SelectionDAGInfo(const DataLayout &DL); - ~MSP430SelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 6374f41c00ea..6216348e4d71 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -34,5 +34,4 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM) : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()) {} + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {} diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 81f6f027d45c..ff2656d26dd2 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -18,8 +18,8 @@ #include "MSP430ISelLowering.h" #include "MSP430InstrInfo.h" #include "MSP430RegisterInfo.h" -#include "MSP430SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -35,7 +35,7 @@ class MSP430Subtarget : public MSP430GenSubtargetInfo { MSP430FrameLowering FrameLowering; MSP430InstrInfo InstrInfo; MSP430TargetLowering TLInfo; - MSP430SelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; public: /// This constructor initializes the data members to match that @@ -60,7 +60,7 @@ public: const MSP430TargetLowering *getTargetLowering() const override { return &TLInfo; } - const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } }; diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index f14156dbfa2b..5107d2ae58c3 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -1727,37 +1727,59 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, } namespace { -template <unsigned ShiftAmount> +void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(DstReg)); + tmpInst.addOperand(Imm); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions); +} + + +void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(DstReg)); + tmpInst.addOperand(MCOperand::createReg(SrcReg)); + tmpInst.addOperand(Imm); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRR(unsigned Opcode, unsigned DstReg, 
unsigned SrcReg, + unsigned SrcReg2, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc, + Instructions); +} + +void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc, + Instructions); +} + +template <int16_t ShiftAmount> void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - if (ShiftAmount >= 32) { - tmpInst.setOpcode(Mips::DSLL32); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createImm(ShiftAmount - 32)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - tmpInst.clear(); - } else if (ShiftAmount > 0) { - tmpInst.setOpcode(Mips::DSLL); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createImm(ShiftAmount)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - tmpInst.clear(); - } + if (ShiftAmount >= 32) + emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions); + else if (ShiftAmount > 0) + emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions); + // There's no need for an ORi if the immediate is 0. if (Operand.isImm() && Operand.getImm() == 0) return; - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(MCOperand::createReg(RegNo)); - tmpInst.addOperand(Operand); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); + emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions); } template <unsigned ShiftAmount> @@ -1818,12 +1840,22 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, return true; } + if (Is32BitImm) { + if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { + // Sign extend up to 64-bit so that the predicates match the hardware + // behaviour. In particular, isInt<16>(0xffff8000) and similar should be + // true. + ImmValue = SignExtend64<32>(ImmValue); + } else { + Error(IDLoc, "instruction requires a 32-bit immediate"); + return true; + } + } + bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; - MCInst tmpInst; - unsigned TmpReg = DstReg; if (UseSrcReg && (DstReg == SrcReg)) { // At this point we need AT to perform the expansions and we exit if it is @@ -1834,29 +1866,26 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, TmpReg = ATReg; } - tmpInst.setLoc(IDLoc); // FIXME: gas has a special case for values that are 000...1111, which // becomes a li -1 and then a dsrl - if (0 <= ImmValue && ImmValue <= 65535) { - // For unsigned and positive signed 16-bit values (0 <= j <= 65535): - // li d,j => ori d,$zero,j - if (!UseSrcReg) - SrcReg = isGP64bit() ? 
Mips::ZERO_64 : Mips::ZERO; - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(MCOperand::createImm(ImmValue)); - Instructions.push_back(tmpInst); - } else if (ImmValue < 0 && ImmValue >= -32768) { - // For negative signed 16-bit values (-32768 <= j < 0): + if (isInt<16>(ImmValue)) { // li d,j => addiu d,$zero,j if (!UseSrcReg) SrcReg = Mips::ZERO; - tmpInst.setOpcode(Mips::ADDiu); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(MCOperand::createImm(ImmValue)); - Instructions.push_back(tmpInst); + emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); + } else if (isUInt<16>(ImmValue)) { + // li d,j => ori d,$zero,j + unsigned TmpReg = DstReg; + if (SrcReg == DstReg) { + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + TmpReg = ATReg; + } + + emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); @@ -1869,30 +1898,16 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, if (!Is32BitImm && !isInt<32>(ImmValue)) { // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the // upper 32 bits. - tmpInst.setOpcode(Mips::ORi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createReg(Mips::ZERO)); - tmpInst.addOperand(MCOperand::createImm(Bits31To16)); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); - // Move the value to the upper 16 bits by doing a 16-bit left shift. - createLShiftOri<16>(0, TmpReg, IDLoc, Instructions); - } else { - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits31To16)); - Instructions.push_back(tmpInst); - } + emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions); + emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions); + } else + emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions); if (UseSrcReg) createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); } else if ((ImmValue & (0xffffLL << 48)) == 0) { - if (Is32BitImm) { - Error(IDLoc, "instruction requires a 32-bit immediate"); - return true; - } warnIfNoMacro(IDLoc); // <------- lo32 ------> @@ -1912,10 +1927,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits47To32)); - Instructions.push_back(tmpInst); + emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions); createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions); createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); @@ -1923,10 +1935,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); } else { - if (Is32BitImm) { - Error(IDLoc, "instruction requires a 32-bit immediate"); - return true; - } warnIfNoMacro(IDLoc); // <------- hi32 ------> <------- lo32 ------> @@ -1948,10 +1956,7 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, uint16_t Bits31To16 = (ImmValue >> 16) & 
0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createImm(Bits63To48)); - Instructions.push_back(tmpInst); + emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions); createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions); // When Bits31To16 is 0, do a left shift of 32 bits instead of doing @@ -2096,8 +2101,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress( tmpInst.addOperand(MCOperand::createExpr(HiExpr)); Instructions.push_back(tmpInst); - createLShiftOri<0>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(), - Instructions); + emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(), + Instructions); } if (UseSrcReg) @@ -2708,12 +2713,8 @@ void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, SmallVectorImpl<MCInst> &Instructions) { - MCInst AdduInst; - AdduInst.setOpcode(Is64Bit ? Mips::DADDu : Mips::ADDu); - AdduInst.addOperand(MCOperand::createReg(DstReg)); - AdduInst.addOperand(MCOperand::createReg(SrcReg)); - AdduInst.addOperand(MCOperand::createReg(TrgReg)); - Instructions.push_back(AdduInst); + emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(), + Instructions); } unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 36ba8e559e0b..bde843afd3d2 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -46,7 +46,6 @@ add_llvm_target(MipsCodeGen MipsSubtarget.cpp MipsTargetMachine.cpp MipsTargetObjectFile.cpp - MipsSelectionDAGInfo.cpp ) add_subdirectory(InstPrinter) diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 9bdf8235a2b4..949ee1474f96 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -59,7 +59,7 @@ static MCInstrInfo *createMipsMCInstrInfo() { return X; } -static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitMipsMCRegisterInfo(X, Mips::RA); return X; @@ -68,9 +68,7 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { CPU = MIPS_MC::selectMipsCPU(TT, CPU); - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitMipsMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createMipsMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, @@ -84,7 +82,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createMipsMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index db2a924a99f9..46cc99c62393 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -152,18 +152,19 @@ Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects(); } -void 
Mips16FrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void Mips16FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const Mips16InstrInfo &TII = *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo()); const MipsRegisterInfo &RI = TII.getRegisterInfo(); const BitVector Reserved = RI.getReservedRegs(MF); bool SaveS2 = Reserved[Mips::S2]; if (SaveS2) - MF.getRegInfo().setPhysRegUsed(Mips::S2); + SavedRegs.set(Mips::S2); if (hasFP(MF)) - MF.getRegInfo().setPhysRegUsed(Mips::S0); + SavedRegs.set(Mips::S0); } const MipsFrameLowering * diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index f281c927c1c4..b48ed4641ea7 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -38,8 +38,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 7b6a2a154471..bce2c1eb4485 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -120,13 +120,13 @@ void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() { unsigned Mips16SPAliasReg = MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg(); - return CurDAG->getRegister(Mips16SPAliasReg, - getTargetLowering()->getPointerTy()); + auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); + return CurDAG->getRegister(Mips16SPAliasReg, PtrVT); } void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) { - SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, - getTargetLowering()->getPointerTy()); + auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); + SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, PtrVT); if (Parent) { switch (Parent->getOpcode()) { case ISD::LOAD: { @@ -155,7 +155,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) { } } } - AliasReg = CurDAG->getRegister(Mips::SP, getTargetLowering()->getPointerTy()); + AliasReg = CurDAG->getRegister(Mips::SP, PtrVT); return; } diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 846e3c964f44..3522cbb1f36a 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -502,7 +502,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops, unsigned V0Reg = Mips::V0; if (NeedMips16Helper) { RegsToPass.push_front(std::make_pair(V0Reg, Callee)); - JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy()); + JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, + getPointerTy(DAG.getDataLayout())); ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget); JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG, MipsII::MO_GOT, Chain, diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index c2651b82d285..e2f6fcc17726 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -267,7 +267,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, 
MVT RetVT, } unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i32 && + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 && "Alloca should always return a pointer."); DenseMap<const AllocaInst *, int>::iterator SI = @@ -382,7 +382,7 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) { // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -507,12 +507,13 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) { break; case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; } @@ -532,7 +533,7 @@ bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) { } bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); + EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (evt == MVT::Other || !evt.isSimple()) return false; @@ -931,8 +932,8 @@ bool MipsFastISel::selectFPExt(const Instruction *I) { if (UnsupportedFPMode) return false; Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; @@ -998,8 +999,8 @@ bool MipsFastISel::selectFPTrunc(const Instruction *I) { if (UnsupportedFPMode) return false; Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; @@ -1415,7 +1416,8 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + // Analyze operands of the call, assigning locations to each operand. 
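The GetReturnInfo call just above shows the pattern this patch applies throughout the FastISel implementations: TargetLowering's type queries, and helpers such as GetReturnInfo, now take the module's DataLayout as an explicit argument instead of reading one cached on the TargetMachine. A minimal sketch of the rewritten query, using the post-patch signature visible in these hunks:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/Target/TargetLowering.h"

  // was: TLI.getValueType(Ty, /*AllowUnknown=*/true)
  static llvm::EVT queryValueType(const llvm::TargetLowering &TLI,
                                  const llvm::DataLayout &DL, llvm::Type *Ty) {
    return TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  }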
SmallVector<CCValAssign, 16> ValLocs; MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, @@ -1449,7 +1451,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; @@ -1493,8 +1495,8 @@ bool MipsFastISel::selectTrunc(const Instruction *I) { Value *Op = I->getOperand(0); EVT SrcVT, DestVT; - SrcVT = TLI.getValueType(Op->getType(), true); - DestVT = TLI.getValueType(I->getType(), true); + SrcVT = TLI.getValueType(DL, Op->getType(), true); + DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) return false; @@ -1521,8 +1523,8 @@ bool MipsFastISel::selectIntExt(const Instruction *I) { return false; EVT SrcEVT, DestEVT; - SrcEVT = TLI.getValueType(SrcTy, true); - DestEVT = TLI.getValueType(DestTy, true); + SrcEVT = TLI.getValueType(DL, SrcTy, true); + DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -1620,7 +1622,7 @@ unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, } bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); + EVT DestEVT = TLI.getValueType(DL, I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -1685,7 +1687,7 @@ bool MipsFastISel::selectShift(const Instruction *I) { if (!TempReg) return false; - MVT Op0MVT = TLI.getValueType(Op0->getType(), true).getSimpleVT(); + MVT Op0MVT = TLI.getValueType(DL, Op0->getType(), true).getSimpleVT(); bool IsZExt = Opcode == Instruction::LShr; if (!emitIntExt(Op0MVT, Op0Reg, MVT::i32, TempReg, IsZExt)) return false; @@ -1803,7 +1805,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V, unsigned VReg = getRegForValue(V); if (VReg == 0) return 0; - MVT VMVT = TLI.getValueType(V->getType(), true).getSimpleVT(); + MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT(); if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) { unsigned TempReg = createResultReg(&Mips::GPR32RegClass); if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned)) diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 2c9868ac051d..06502397b6b8 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -59,8 +59,9 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { /// GOT address into a register. 
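The function that follows, getGlobalBaseReg, is a typical call site of the companion rewrite: the pointer type is now derived from the DataLayout reached through the DAG (CurDAG->getDataLayout() in the ISel passes, DAG.getDataLayout() in the lowering code below). A minimal sketch of that access path:

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/Target/TargetLowering.h"

  static llvm::MVT pointerVT(const llvm::TargetLowering &TLI,
                             llvm::SelectionDAG &DAG) {
    // was: TLI.getPointerTy(); the DataLayout is no longer implicit.
    return TLI.getPointerTy(DAG.getDataLayout());
  }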
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg(); - return CurDAG->getRegister(GlobalBaseReg, - getTargetLowering()->getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy( + CurDAG->getDataLayout())) + .getNode(); } /// ComplexPattern used on MipsInstrInfo diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 67ddcc4dacb9..fbebb9abb4cc 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -466,7 +466,8 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, return Mips::createFastISel(funcInfo, libInfo); } -EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -1579,9 +1580,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); SDLoc DL(Op); - EVT PTy = getPointerTy(); + auto &TD = DAG.getDataLayout(); + EVT PTy = getPointerTy(TD); unsigned EntrySize = - DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout()); + DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD); Index = DAG.getNode(ISD::MUL, DL, PTy, Index, DAG.getConstant(EntrySize, DL, PTy)); @@ -1647,10 +1649,10 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = DAG.getNode(ISD::SETCC, DL, - getSetCCResultType(*DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), - Op.getOperand(4)); + SDValue Cond = + DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), Ty), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), Op.getOperand(3)); @@ -1723,7 +1725,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = getTargetMachine().getTLSModel(GV); @@ -1831,7 +1833,7 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + getPointerTy(MF.getDataLayout())); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. @@ -1850,9 +1852,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Node); unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4; - SDValue VAListLoad = DAG.getLoad(getPointerTy(), DL, Chain, VAListPtr, - MachinePointerInfo(SV), false, false, false, - 0); + SDValue VAListLoad = + DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain, VAListPtr, + MachinePointerInfo(SV), false, false, false, 0); SDValue VAList = VAListLoad; // Re-align the pointer if necessary. @@ -1874,7 +1876,9 @@ SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { } // Increment the pointer, VAList, to the next vaarg. 
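The va_arg increment that opens the next hunk rounds the argument size up to the argument-slot size before bumping the va_list pointer; the slot size is 8 bytes on N32/N64 and 4 on O32, per the ArgSlotSizeInBytes computation above. A scalar model of that arithmetic (the mask form assumes a power-of-two slot size, which holds here):

  #include <cstdint>

  static uint64_t nextVAArgPtr(uint64_t VAList, uint64_t ArgSizeInBytes,
                               uint64_t SlotSizeInBytes) {
    // RoundUpToAlignment(ArgSizeInBytes, SlotSizeInBytes), open-coded.
    uint64_t Rounded =
        (ArgSizeInBytes + SlotSizeInBytes - 1) & ~(SlotSizeInBytes - 1);
    return VAList + Rounded; // DAG form: ADD(VAList, Constant(Rounded))
  }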
- unsigned ArgSizeInBytes = getDataLayout()->getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())); + auto &TD = DAG.getDataLayout(); + unsigned ArgSizeInBytes = + TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())); SDValue Tmp3 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, DAG.getConstant(RoundUpToAlignment(ArgSizeInBytes, ArgSlotSizeInBytes), @@ -2062,7 +2066,7 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1)); return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain, DAG.getRegister(OffsetReg, Ty), - DAG.getRegister(AddrReg, getPointerTy()), + DAG.getRegister(AddrReg, getPointerTy(MF.getDataLayout())), Chain.getValue(1)); } @@ -2479,15 +2483,16 @@ MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain, SDValue Arg, SDLoc DL, bool IsTailCall, SelectionDAG &DAG) const { if (!IsTailCall) { - SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - DAG.getIntPtrConstant(Offset, DL)); + SDValue PtrOff = + DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr, + DAG.getIntPtrConstant(Offset, DL)); return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false, false, 0); } MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(), /*isVolatile=*/ true, false, 0); } @@ -2611,8 +2616,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsTailCall) Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); - SDValue StackPtr = DAG.getCopyFromReg( - Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP, + getPointerTy(DAG.getDataLayout())); // With EABI is it possible to have 16 args on registers. 
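passArgOnStack, in the hunk above, computes an outgoing argument's address as stack pointer plus byte offset for ordinary calls; for tail calls it instead stores through a fixed frame object so the value lands where the callee expects it once the frame is reused. A scalar sketch of the non-tail path:

  #include <cstdint>

  static uint64_t outgoingArgAddr(uint64_t StackPtr, unsigned Offset) {
    // DAG form: ADD(PtrVT, StackPtr, IntPtrConstant(Offset)), then the store.
    return StackPtr + Offset;
  }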
std::deque< std::pair<unsigned, SDValue> > RegsToPass; @@ -2750,7 +2756,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, IsCallReloc = true; } } else - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0, + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, + getPointerTy(DAG.getDataLayout()), 0, MipsII::MO_NO_FLAG); GlobalOrExternal = true; } @@ -2758,8 +2765,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const char *Sym = S->getSymbol(); if (!ABI.IsN64() && !IsPIC) // !N64 && static - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), MipsII::MO_NO_FLAG); + Callee = DAG.getTargetExternalSymbol( + Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG); else if (LargeGOT) { Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16, Chain, @@ -3029,7 +3036,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -3174,12 +3181,13 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); + SDValue Val = + DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout())); unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0; Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(V0, getPointerTy())); + RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout()))); } RetOps[0] = Chain; // Update chain. @@ -3198,9 +3206,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. -MipsTargetLowering::ConstraintType MipsTargetLowering:: -getConstraintType(const std::string &Constraint) const -{ +MipsTargetLowering::ConstraintType +MipsTargetLowering::getConstraintType(StringRef Constraint) const { // Mips specific constraints // GCC config/mips/constraints.md // @@ -3290,9 +3297,8 @@ MipsTargetLowering::getSingleConstraintMatchWeight( /// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag /// that is returned indicates whether parsing was successful. The second flag /// is true if the numeric part exists. -static std::pair<bool, bool> -parsePhysicalReg(StringRef C, std::string &Prefix, - unsigned long long &Reg) { +static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix, + unsigned long long &Reg) { if (C.front() != '{' || C.back() != '}') return std::make_pair(false, false); @@ -3300,7 +3306,7 @@ parsePhysicalReg(StringRef C, std::string &Prefix, StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1; I = std::find_if(B, E, std::ptr_fun(isdigit)); - Prefix.assign(B, I - B); + Prefix = StringRef(B, I - B); // The second flag is set to false if no numeric characters were found. 
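parsePhysicalReg, whose tail appears just above, splits a "{...}" constraint into a name prefix and an optional register number, and now hands the prefix back as a StringRef view of the constraint instead of copying it into a std::string. A self-contained model of the split, with std::string_view standing in for StringRef and illustrative names:

  #include <cctype>
  #include <string_view>
  #include <utility>

  // "{hi}"   -> ("hi", "")   (no numeric part)
  // "{$f12}" -> ("$f", "12")
  static std::pair<std::string_view, std::string_view>
  splitRegConstraint(std::string_view C) {
    if (C.size() < 3 || C.front() != '{' || C.back() != '}')
      return {}; // not a physical-register constraint
    std::string_view Body = C.substr(1, C.size() - 2);
    std::size_t I = 0;
    while (I < Body.size() &&
           !std::isdigit(static_cast<unsigned char>(Body[I])))
      ++I;
    return {Body.substr(0, I), Body.substr(I)}; // (prefix, digits)
  }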
if (I == E) @@ -3316,7 +3322,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const TargetRegisterClass *RC; - std::string Prefix; + StringRef Prefix; unsigned long long Reg; std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg); @@ -3332,7 +3338,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { RC = TRI->getRegClass(Prefix == "hi" ? Mips::HI32RegClassID : Mips::LO32RegClassID); return std::make_pair(*(RC->begin()), RC); - } else if (Prefix.compare(0, 4, "$msa") == 0) { + } else if (Prefix.startswith("$msa")) { // Parse $msa(ir|csr|access|save|modify|request|map|unmap) // No numeric characters follow the name. @@ -3390,7 +3396,7 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { /// pointer. std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -3546,8 +3552,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op, TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // No global is ever allowed as a base. if (AM.BaseGV) @@ -3625,7 +3631,7 @@ void MipsTargetLowering::copyByValRegs( FrameObjOffset = VA.getLocMemOffset(); // Create frame object. - EVT PtrTy = getPointerTy(); + EVT PtrTy = getPointerTy(DAG.getDataLayout()); int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrTy); InVals.push_back(FIN); @@ -3662,7 +3668,8 @@ void MipsTargetLowering::passByValArg( unsigned OffsetInBytes = 0; // From beginning of struct unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes); - EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); + EVT PtrTy = getPointerTy(DAG.getDataLayout()), + RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); unsigned NumRegs = LastReg - FirstReg; if (NumRegs) { @@ -3787,7 +3794,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, unsigned Reg = addLiveIn(MF, ArgRegs[I], RC); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy); FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true); - SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo(), false, false, 0); cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue( @@ -3920,8 +3927,8 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned MipsTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { // Named registers is expected to be fairly rare. For now, just support $28 // since the linux kernel uses it. 
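getRegisterByName, whose comment closes the hunk above, gains a SelectionDAG parameter so overrides can consult the DAG while resolving a register named via llvm.read_register and friends. A schematic of the lookup it performs for Mips; the numeric values stand in for the Mips::GP_64 and Mips::GP enum constants and are not real register numbers:

  #include "llvm/ADT/StringSwitch.h"

  static unsigned lookupNamedReg(const char *RegName, bool IsGP64) {
    return llvm::StringSwitch<unsigned>(RegName)
        .Case("$28", IsGP64 ? 64u : 32u) // only $28 is recognized for now
        .Default(0u);                    // 0: not a supported named register
  }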
if (Subtarget.isGP64bit()) { diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index bc9a1ce64097..6fe8f830d35d 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -227,7 +227,9 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -247,7 +249,8 @@ namespace llvm { const char *getTargetNodeName(unsigned Opcode) const override; /// getSetCCResultType - get the ISD::SETCC result ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; @@ -263,7 +266,8 @@ namespace llvm { void HandleByVal(CCState *, unsigned &, unsigned) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; protected: SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const; @@ -478,8 +482,7 @@ namespace llvm { bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; // Inline asm support - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -493,8 +496,7 @@ namespace llvm { std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is @@ -505,8 +507,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "R") return InlineAsm::Constraint_R; else if (ConstraintCode == "ZC") @@ -514,8 +516,8 @@ namespace llvm { return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ec7bf314c641..096b3bee5d07 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -621,10 +621,17 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { !MFI->hasVarSizedObjects(); } -void MipsSEFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); +/// Mark \p Reg and all registers aliasing it in the bitset. 
+void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + SavedRegs.set(*AI); +} + +void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); MipsABIInfo ABI = STI.getABI(); unsigned FP = ABI.GetFramePtr(); @@ -632,10 +639,10 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Mark $fp as used if function has dedicated frame pointer. if (hasFP(MF)) - MRI.setPhysRegUsed(FP); + setAliasRegs(MF, SavedRegs, FP); // Mark $s7 as used if function has dedicated base pointer. if (hasBP(MF)) - MRI.setPhysRegUsed(BP); + setAliasRegs(MF, SavedRegs, BP); // Create spill slots for eh data registers if function calls eh_return. if (MipsFI->callsEhReturn()) diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index 2fcd6bbb9a15..9cb32e6c7829 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -34,8 +34,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; }; diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 990a2f8d8c85..cb46d731da29 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -841,7 +841,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) { } case MipsISD::ThreadPointer: { - EVT PtrVT = getTargetLowering()->getPointerTy(); + EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); unsigned RdhwrOpc, DestReg; if (PtrVT == MVT::i32) { diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index ae2837a8582c..b319fd07884b 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -838,8 +838,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) if (!VT.isVector()) - return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), - VT, TL->getScalarShiftAmountTy(VT), DAG); + return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), VT, + TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT), + DAG); return SDValue(N, 0); } diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp deleted file mode 100644 index edd8f670707f..000000000000 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ /dev/null @@ -1,23 +0,0 @@ -//===-- MipsSelectionDAGInfo.cpp - Mips SelectionDAG Info -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the MipsSelectionDAGInfo class. 
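setAliasRegs above is the other half of the callee-saved migration this patch repeats in every target: determineCalleeSaves replaces processFunctionBeforeCalleeSavedScan, delegates to the base class first, and records extra registers (aliases included) in the SavedRegs bitvector where the old hook called MachineRegisterInfo::setPhysRegUsed. A schematic override; the enclosing class and the FramePointerReg constant are placeholders, not LLVM names:

  void ExampleFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                  BitVector &SavedRegs,
                                                  RegScavenger *RS) const {
    // Let the generic implementation mark what it already knows about.
    TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
    // Then add target-specific spills, covering sub- and super-registers.
    if (hasFP(MF))
      setAliasRegs(MF, SavedRegs, FramePointerReg);
  }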
-// -//===----------------------------------------------------------------------===// - -#include "MipsTargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "mips-selectiondag-info" - -MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -MipsSelectionDAGInfo::~MipsSelectionDAGInfo() { -} diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h deleted file mode 100644 index 061423fbeb86..000000000000 --- a/lib/Target/Mips/MipsSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MipsSelectionDAGInfo.h - Mips SelectionDAG Info ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the Mips subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class MipsTargetMachine; - -class MipsSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit MipsSelectionDAGInfo(const DataLayout &DL); - ~MipsSelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index c41bb16a58ea..471b6e19a8bb 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -70,7 +70,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), - HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(*TM.getDataLayout()), + HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 5f9296812e1c..1db8881404c9 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -18,10 +18,10 @@ #include "MipsFrameLowering.h" #include "MipsISelLowering.h" #include "MipsInstrInfo.h" -#include "MipsSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -140,7 +140,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { Triple TargetTriple; - const MipsSelectionDAGInfo TSInfo; + const TargetSelectionDAGInfo TSInfo; std::unique_ptr<const MipsInstrInfo> InstrInfo; std::unique_ptr<const MipsFrameLowering> FrameLowering; std::unique_ptr<const MipsTargetLowering> TLInfo; @@ -275,7 +275,7 @@ public: void setHelperClassesMips16(); void setHelperClassesMipsSE(); - const MipsSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index c820668befa0..1c77745d130b 100644 --- 
a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -62,7 +62,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, if (!ABI.IsN64()) Ret += "-p:32:32"; - // 8 and 16 bit integers only need no have natural alignment, but try to + // 8 and 16 bit integers only need to have natural alignment, but try to // align them to 32 bits. 64 bit integers have natural alignment. Ret += "-i8:8:32-i16:16:32-i64:64"; @@ -237,7 +237,7 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. - return TargetTransformInfo(getDataLayout()); + return TargetTransformInfo(F.getParent()->getDataLayout()); } DEBUG(errs() << "Target Transform Info Pass Added\n"); diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 221d2f093aeb..ad7302037cad 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -37,7 +37,7 @@ static MCInstrInfo *createNVPTXMCInstrInfo() { return X; } -static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); // PTX does not have a return address register. InitNVPTXMCRegisterInfo(X, 0); @@ -46,13 +46,13 @@ static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitNVPTXMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createNVPTXMCCodeGenInfo( - StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { +static MCCodeGenInfo *createNVPTXMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); // The default relocation model is used regardless of what the client has diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index cadd7a46cd9d..ecb0f0a1d0a1 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -340,7 +340,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { } void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); Type *Ty = F->getReturnType(); @@ -366,20 +366,20 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { O << ".param .b" << size << " func_retval0"; } else if (isa<PointerType>(Ty)) { - O << ".param .b" << TLI->getPointerTy().getSizeInBits() + O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits() << " func_retval0"; } else if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) { - unsigned totalsz = TD->getTypeAllocSize(Ty); + unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!llvm::getAlign(*F, 0, retAlignment)) - retAlignment = TD->getABITypeAlignment(Ty); + retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else llvm_unreachable("Unknown return type"); } else { 
SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, Ty, vtparts); + ComputeValueVTs(*TLI, DL, Ty, vtparts); unsigned idx = 0; for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; @@ -1433,7 +1433,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool first = true; bool isKernelFunc = llvm::isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); - MVT thePointerTy = TLI->getPointerTy(); + MVT thePointerTy = TLI->getPointerTy(*TD); O << "(\n"; @@ -1579,7 +1579,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Further, if a part is vector, print the above for // each vector element. SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, ETy, vtparts); + ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts); for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; EVT elemtype = vtparts[i]; diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 09e0bd5d3d88..b75cf4040312 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -80,14 +80,14 @@ static bool IsPTXVectorType(MVT VT) { /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the /// same number of types as the Ins/Outs arrays in LowerFormalArguments, /// LowerCall, and LowerReturn. -static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty, - SmallVectorImpl<EVT> &ValueVTs, +static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl<EVT> &ValueVTs, SmallVectorImpl<uint64_t> *Offsets = nullptr, uint64_t StartingOffset = 0) { SmallVector<EVT, 16> TempVTs; SmallVector<uint64_t, 16> TempOffsets; - ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset); + ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { EVT VT = TempVTs[i]; uint64_t Off = TempOffsets[i]; @@ -885,15 +885,16 @@ SDValue NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); + return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); } -std::string -NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, - const SmallVectorImpl<ISD::OutputArg> &Outs, - unsigned retAlignment, - const ImmutableCallSite *CS) const { +std::string NVPTXTargetLowering::getPrototype( + const DataLayout &DL, Type *retTy, const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, + const ImmutableCallSite *CS) const { + auto PtrVT = getPointerTy(DL); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); @@ -921,13 +922,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { - O << ".param .b" << getPointerTy().getSizeInBits() << " _"; + O << ".param .b" << PtrVT.getSizeInBits() << " _"; } else if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { - O << ".param .align " - << retAlignment - << " .b8 _[" - << getDataLayout()->getTypeAllocSize(retTy) << "]"; + auto &DL = 
CS->getCalledFunction()->getParent()->getDataLayout(); + O << ".param .align " << retAlignment << " .b8 _[" + << DL.getTypeAllocSize(retTy) << "]"; } else { llvm_unreachable("Unknown return type"); } @@ -936,7 +936,6 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, O << "_ ("; bool first = true; - MVT thePointerTy = getPointerTy(); unsigned OIdx = 0; for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { @@ -950,24 +949,23 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); - const DataLayout *TD = getDataLayout(); // +1 because index 0 is reserved for return type alignment if (!llvm::getAlign(*CallI, i + 1, align)) - align = TD->getABITypeAlignment(Ty); - unsigned sz = TD->getTypeAllocSize(Ty); + align = DL.getABITypeAlignment(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); O << ".param .align " << align << " .b8 "; O << "_"; O << "[" << sz << "]"; // update the index for Outs SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + ComputeValueVTs(*this, DL, Ty, vtparts); if (unsigned len = vtparts.size()) OIdx += len - 1; continue; } // i8 types in IR will be i16 types in SDAG - assert((getValueType(Ty) == Outs[OIdx].VT || - (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && + assert((getValueType(DL, Ty) == Outs[OIdx].VT || + (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && "type mismatch between callee prototype and arguments"); // scalar type unsigned sz = 0; @@ -976,7 +974,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, if (sz < 32) sz = 32; } else if (isa<PointerType>(Ty)) - sz = thePointerTy.getSizeInBits(); + sz = PtrVT.getSizeInBits(); else sz = Ty->getPrimitiveSizeInBits(); O << ".param .b" << sz << " "; @@ -988,7 +986,7 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, Type *ETy = PTy->getElementType(); unsigned align = Outs[OIdx].Flags.getByValAlign(); - unsigned sz = getDataLayout()->getTypeAllocSize(ETy); + unsigned sz = DL.getTypeAllocSize(ETy); O << ".param .align " << align << " .b8 "; O << "_"; O << "[" << sz << "]"; @@ -1002,7 +1000,6 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const { - const DataLayout *TD = getDataLayout(); unsigned Align = 0; const Value *DirectCallee = CS->getCalledFunction(); @@ -1043,7 +1040,8 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Call is indirect or alignment information is not available, fall back to // the ABI type alignment - return TD->getABITypeAlignment(Ty); + auto &DL = CS->getCaller()->getParent()->getDataLayout(); + return DL.getABITypeAlignment(Ty); } SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, @@ -1064,9 +1062,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) return Chain; - const DataLayout *TD = getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); + auto &DL = MF.getDataLayout(); SDValue tempChain = Chain; Chain = DAG.getCALLSEQ_START(Chain, @@ -1096,11 +1094,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // aggregate SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0); + 
ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets, + 0); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl, MVT::i32), @@ -1137,10 +1136,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); // declare .param .align <align> .b8 .param<n>[<size>]; - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl, MVT::i32), @@ -1321,7 +1320,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<uint64_t, 16> Offsets; const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), + vtparts, &Offsets, 0); // declare .param .align <align> .b8 .param<n>[<size>]; unsigned sz = Outs[OIdx].Flags.getByValSize(); @@ -1342,9 +1342,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT elemtype = vtparts[j]; int curOffset = Offsets[j]; unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], - DAG.getConstant(curOffset, dl, getPointerTy())); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], + DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), false, false, false, PartAlign); @@ -1371,12 +1371,12 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle Result if (Ins.size() > 0) { SmallVector<EVT, 16> resvtparts; - ComputeValueVTs(*this, retTy, resvtparts); + ComputeValueVTs(*this, DL, retTy, resvtparts); // Declare // .param .align 16 .b8 retval0[<size-in-bytes>], or // .param .b<size-in-bits> retval0 - unsigned resultsz = TD->getTypeAllocSizeInBits(retTy); + unsigned resultsz = DL.getTypeAllocSizeInBits(retTy); // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for // these three types to match the logic in // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. @@ -1415,7 +1415,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. 
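For reference, the .param fragments that getPrototype assembles, per the rules visible in the hunk above, look roughly as follows; the outer framing of the prototype string is elided here, and the sizes assume a 64-bit NVPTX target:

  .param .b32 _              // i32 return or argument (sub-32-bit integers round up)
  .param .b64 _              // pointer return or argument
  .param .align 8 .b8 _[24]  // 24-byte aggregate with ABI alignment 8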
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); - std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS); + std::string Proto = + getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS); const char *ProtoStr = nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); SDValue ProtoOps[] = { @@ -1477,7 +1478,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { if (retTy && retTy->isVectorTy()) { - EVT ObjectVT = getValueType(retTy); + EVT ObjectVT = getValueType(DL, retTy); unsigned NumElts = ObjectVT.getVectorNumElements(); EVT EltVT = ObjectVT.getVectorElementType(); assert(STI.getTargetLowering()->getNumRegisters(F->getContext(), @@ -1590,13 +1591,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); InVals.push_back(Elt); } - Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); } } } else { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0); assert(VTs.size() == Ins.size() && "Bad value decomposition"); unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { @@ -1608,8 +1609,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<EVT, 4> LoadRetVTs; EVT TheLoadType = VTs[i]; - if (retTy->isIntegerTy() && - TD->getTypeAllocSizeInBits(retTy) < 32) { + if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) { // This is for integer types only, and specifically not for // aggregates. LoadRetVTs.push_back(MVT::i32); @@ -1920,11 +1920,11 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { } MemSDNode *MemSD = cast<MemSDNode>(N); - const DataLayout *TD = getDataLayout(); + const DataLayout &TD = DAG.getDataLayout(); unsigned Align = MemSD->getAlignment(); unsigned PrefAlign = - TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); + TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); if (Align < PrefAlign) { // This store is not sufficiently aligned, so bail out and let this vector // store be scalarized. 
Note that we may still be able to emit smaller @@ -2064,7 +2064,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); - const DataLayout *TD = getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); const Function *F = MF.getFunction(); const AttributeSet &PAL = F->getAttributes(); @@ -2118,7 +2119,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (Ty->isAggregateType()) { SmallVector<EVT, 16> vtparts; - ComputePTXValueVTs(*this, Ty, vtparts); + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { @@ -2130,7 +2131,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); for (unsigned parti = 0; parti < NumRegs; ++parti) { InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); @@ -2156,13 +2157,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // NOTE: Here, we lose the ability to issue vector loads for vectors // that are a part of a struct. This should be investigated in the // future. - ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets, + 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) aggregateIsPacked = STy->isPacked(); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { EVT partVT = vtparts[parti]; @@ -2170,12 +2172,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( PointerType::get(partVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, - DAG.getConstant(offsets[parti], dl, getPointerTy())); - unsigned partAlign = - aggregateIsPacked ? 1 - : TD->getABITypeAlignment( - partVT.getTypeForEVT(F->getContext())); + DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(offsets[parti], dl, PtrVT)); + unsigned partAlign = aggregateIsPacked + ? 1 + : DL.getABITypeAlignment( + partVT.getTypeForEVT(F->getContext())); SDValue p; if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
@@ -2198,8 +2200,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(Ty); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + EVT ObjectVT = getValueType(DL, Ty); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); unsigned NumElts = ObjectVT.getVectorNumElements(); assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && "Vector was not scalarized"); @@ -2212,9 +2214,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, - false, true, - TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); + EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, + true, + DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2229,9 +2231,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue(PointerType::get( VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( - VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, - false, true, - TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false, + true, + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2269,13 +2271,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Value *SrcValue = Constant::getNullValue( PointerType::get(VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); - SDValue SrcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, - DAG.getConstant(Ofst, dl, getPointerTy())); + SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false, false, true, - TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); + DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext()))); if (P.getNode()) P.getNode()->setIROrder(idx + 1); @@ -2288,7 +2289,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt); InVals.push_back(Elt); } - Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); } InsIdx += NumElts; } @@ -2298,23 +2299,24 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( continue; } // A plain scalar. - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); // If ABI, load from the param symbol - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); Value *srcValue = Constant::getNullValue(PointerType::get( ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); SDValue p; if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg, - MachinePointerInfo(srcValue), ObjectVT, false, false, - false, - TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + p = DAG.getExtLoad( + ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue), + ObjectVT, false, false, false, + DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } else { - p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg, - MachinePointerInfo(srcValue), false, false, false, - TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + p = DAG.getLoad( + Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false, + false, false, + DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); } if (p.getNode()) p.getNode()->setIROrder(idx + 1); @@ -2329,10 +2331,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // machine instruction fails because TargetExternalSymbol // (not lowered) is target dependent, and CopyToReg assumes // the source is lowered. - EVT ObjectVT = getValueType(Ty); + EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); - SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); @@ -2370,7 +2372,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); const Function *F = MF.getFunction(); Type *RetTy = F->getReturnType(); - const DataLayout *TD = getDataLayout(); + const DataLayout &TD = DAG.getDataLayout(); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); @@ -2384,7 +2386,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(NumElts == Outs.size() && "Bad scalarization of return value"); // const_cast can be removed in later LLVM versions - EVT EltVT = getValueType(RetTy).getVectorElementType(); + EVT EltVT = getValueType(TD, RetTy).getVectorElementType(); bool NeedExtend = false; if (EltVT.getSizeInBits() < 16) NeedExtend = true; @@ -2435,7 +2437,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); unsigned PerStoreOffset = - TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); for (unsigned i = 0; i < NumElts; i += VecSize) { // Get values @@ -2493,7 +2495,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } else { SmallVector<EVT, 16> ValVTs; SmallVector<uint64_t, 16> Offsets; - ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0); + ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0); assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); for (unsigned i = 0, e = Outs.size(); i != e; ++i) { @@ -2509,8 +2511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, TheValType.getVectorElementType(), TmpVal, DAG.getIntPtrConstant(j, dl)); EVT TheStoreType = ValVTs[i]; - if (RetTy->isIntegerTy() && - TD->getTypeAllocSizeInBits(RetTy) < 32) { + if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) { // The following zero-extension is for integer types only, and // specifically not for aggregates. 
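The zero-extension that follows reflects a PTX rule worth making explicit: integer .param slots are at least 32 bits wide, so i8/i16 values travel as .b32 (the same rule behind the earlier "if (sz < 32) sz = 32;"). A minimal sketch of that rule, using a hypothetical helper name that does not appear in the patch:

// The parameter-width rule this file applies to scalar integers.
static unsigned ptxParamBits(unsigned IRBits, bool IsInteger) {
  return (IsInteger && IRBits < 32) ? 32 : IRBits; // i8/i16 widen to .b32
}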
TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal); @@ -3291,14 +3292,14 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { - + auto &DL = I.getModule()->getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(); + Info.memVT = getPointerTy(DL); else - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; @@ -3311,14 +3312,15 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_p: { + auto &DL = I.getModule()->getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldg_global_i) - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) - Info.memVT = getPointerTy(); + Info.memVT = getPointerTy(DL); else - Info.memVT = getValueType(I.getType()); + Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; @@ -3731,8 +3733,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( /// Used to guide target specific optimizations, like loop strength reduction /// (LoopStrengthReduce.cpp) and memory optimization for address mode /// (CodeGenPrepare.cpp) -bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // AddrMode - This represents an addressing mode of: @@ -3772,7 +3774,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. NVPTXTargetLowering::ConstraintType -NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { +NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: @@ -3794,7 +3796,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const { std::pair<unsigned, const TargetRegisterClass *> NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -4251,7 +4253,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, - const DataLayout *TD, SmallVectorImpl<SDValue> &Results) { EVT ResVT = N->getValueType(0); SDLoc DL(N); @@ -4282,8 +4283,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LoadSDNode *LD = cast<LoadSDNode>(N); unsigned Align = LD->getAlignment(); + auto &TD = DAG.getDataLayout(); unsigned PrefAlign = - TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); + TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); if (Align < PrefAlign) { // This load is not sufficiently aligned, so bail out and let this vector // load be scalarized. 
Note that we may still be able to emit smaller @@ -4495,7 +4497,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( default: report_fatal_error("Unhandled custom legalization"); case ISD::LOAD: - ReplaceLoadVector(N, DAG, getDataLayout(), Results); + ReplaceLoadVector(N, DAG, Results); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index ed94775b3002..e5c37321a33b 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -456,24 +456,23 @@ public: /// Used to guide target specific optimizations, like loop strength /// reduction (LoopStrengthReduce.cpp) and memory optimization for /// address mode (CodeGenPrepare.cpp) - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; - EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override { + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, + EVT VT) const override { if (VT.isVector()) return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); return MVT::i1; } - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; SDValue LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -483,7 +482,7 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; - std::string getPrototype(Type *, const ArgListTy &, + std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl<ISD::OutputArg> &, unsigned retAlignment, const ImmutableCallSite *CS) const; @@ -501,7 +500,9 @@ public: const NVPTXTargetMachine *nvTM; // PTX always uses 32-bit shift amounts - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 6ab0fadf9a35..0bf72febc4a0 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -57,7 +57,6 @@ char NVPTXLowerAggrCopies::ID = 0; // Lower MemTransferInst or load-store pair to loop static void convertTransferToLoop( Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len, - //unsigned numLoads, bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) { Type *indType = len->getType(); @@ -84,6 +83,8 @@ static void convertTransferToLoop( ind->addIncoming(ConstantInt::get(indType, 0), origBB); // load from srcAddr+ind + // TODO: we can leverage the align parameter of llvm.memcpy for more efficient + // word-sized loads and stores. 
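For context on that TODO: the lowering currently emits a byte-at-a-time loop regardless of alignment. Expressed as plain C++ (a sketch with hypothetical names, ignoring the volatile flags for brevity; not code from the patch), the generated loop computes:

#include <cstddef>

// Equivalent of what convertTransferToLoop() builds with IRBuilder: one
// induction variable (the PHI 'ind' above) driving paired i8 loads and
// stores until 'len' is reached.
static void byteCopyLoop(char *dstAddr, const char *srcAddr, std::size_t len) {
  for (std::size_t ind = 0; ind != len; ++ind)
    dstAddr[ind] = srcAddr[ind]; // CreateLoad / CreateStore pair per byte
}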
Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind), srcVolatile); // store at dstAddr+ind @@ -137,13 +138,10 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // // Collect all the aggrLoads, aggrMemcpys and addrMemsets. // - //const BasicBlock *firstBB = &F.front(); // first BB in F for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { - //BasicBlock *bb = BI; for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ++II) { if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (!load->hasOneUse()) continue; @@ -152,7 +150,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { User *use = load->user_back(); if (StoreInst *store = dyn_cast<StoreInst>(use)) { - if (store->getOperand(0) != load) //getValueOperand + if (store->getOperand(0) != load) continue; aggrLoads.push_back(load); } @@ -188,8 +186,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // // Do the transformation of an aggr load/copy/set to a loop // - for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) { - LoadInst *load = aggrLoads[i]; + for (LoadInst *load : aggrLoads) { StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); Value *srcAddr = load->getOperand(0); Value *dstAddr = store->getOperand(1); @@ -203,20 +200,19 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { load->eraseFromParent(); } - for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) { - MemTransferInst *cpy = aggrMemcpys[i]; - Value *len = cpy->getLength(); - // llvm 2.7 version of memcpy does not have volatile - // operand yet. So always making it non-volatile - // optimistically, so that we don't see unnecessary - // st.volatile in ptx - convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false, - false, Context, F); + for (MemTransferInst *cpy : aggrMemcpys) { + convertTransferToLoop(/* splitAt */ cpy, + /* srcAddr */ cpy->getSource(), + /* dstAddr */ cpy->getDest(), + /* len */ cpy->getLength(), + /* srcVolatile */ cpy->isVolatile(), + /* dstVolatile */ cpy->isVolatile(), + /* Context */ Context, + /* Function F */ F); cpy->eraseFromParent(); } - for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) { - MemSetInst *memsetinst = aggrMemsets[i]; + for (MemSetInst *memsetinst : aggrMemsets) { Value *len = memsetinst->getLength(); Value *val = memsetinst->getValue(); convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context, diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index e83f735a551e..5a83371b07f1 100644 --- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -2,7 +2,7 @@ // // The LLVM Compiler Infrastructure // -// This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// @@ -115,7 +115,7 @@ bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) { replaceImageHandle(Handle, MF); - return true; + return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp index 71645dca69c5..bd2509a3c8c9 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -48,7 +48,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU, const NVPTXTargetMachine &TM) : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM), InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)), - TSInfo(TM.getDataLayout()), FrameLowering() {} + FrameLowering() {} bool NVPTXSubtarget::hasImageHandles() const { // Enable handles for Kepler+, where CUDA supports indirect surfaces and diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 9d9072efc382..248f9e117d83 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -148,8 +148,9 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); }); + return TargetIRAnalysis([this](Function &F) { + return TargetTransformInfo(NVPTXTTIImpl(this, F)); + }); } void NVPTXPassConfig::addIRPasses() { diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index dc81802f4b5a..e7250cdba5ac 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -94,7 +94,7 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost( TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -117,3 +117,15 @@ unsigned NVPTXTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } } + +void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + BaseT::getUnrollingPreferences(L, UP); + + // Enable partial unrolling and runtime unrolling, but reduce the + // threshold. This partially unrolls small loops which are often + // unrolled by the PTX to SASS compiler and unrolling earlier can be + // beneficial. + UP.Partial = UP.Runtime = true; + UP.PartialThreshold = UP.Threshold / 4; +} diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 4280888988f9..5bcd1e27a558 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -37,8 +37,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> { const NVPTXTargetLowering *getTLI() const { return TLI; }; public: - explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM) - : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
NVPTXTTIImpl(const NVPTXTTIImpl &Arg) @@ -46,18 +47,6 @@ public: NVPTXTTIImpl(NVPTXTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - NVPTXTTIImpl &operator=(const NVPTXTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - NVPTXTTIImpl &operator=(NVPTXTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } bool hasBranchDivergence() { return true; } @@ -69,6 +58,8 @@ public: TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; } // end namespace llvm diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index fe168a547597..c0c83cc258b8 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -33,7 +33,6 @@ add_llvm_target(PowerPCCodeGen PPCTargetObjectFile.cpp PPCTargetTransformInfo.cpp PPCTOCRegDeps.cpp - PPCSelectionDAGInfo.cpp PPCTLSDynamicCall.cpp PPCVSXCopy.cpp PPCVSXFMAMutate.cpp diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 5c38fe173d96..30f232a9a91e 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -51,10 +51,9 @@ static MCInstrInfo *createPPCMCInstrInfo() { return X; } -static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { - Triple TheTriple(TT); - bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || - TheTriple.getArch() == Triple::ppc64le); +static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) { + bool isPPC64 = + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); unsigned Flavour = isPPC64 ? 0 : 1; unsigned RA = isPPC64 ? 
PPC::LR8 : PPC::LR; @@ -65,9 +64,7 @@ static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitPPCMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createPPCMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, @@ -90,22 +87,20 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createPPCMCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { - Triple T(TT); - if (T.isOSDarwin()) + if (TT.isOSDarwin()) RM = Reloc::DynamicNoPIC; else RM = Reloc::Static; } if (CM == CodeModel::Default) { - Triple T(TT); - if (!T.isOSDarwin() && - (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le)) + if (!TT.isOSDarwin() && + (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) CM = CodeModel::Medium; } X->initMCCodeGenInfo(RM, CM, OL); @@ -231,7 +226,7 @@ static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, static MCTargetStreamer * createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); - if (TT.getObjectFormat() == Triple::ELF) + if (TT.isOSBinFormatELF()) return new PPCTargetELFStreamer(S); return new PPCTargetMachOStreamer(S); } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 87a5236e711f..199a0debf88b 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -197,7 +197,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, // External or weakly linked global variables need non-lazily-resolved stubs if (TM.getRelocationModel() != Reloc::Static && - (GV->isDeclaration() || GV->isWeakForLinker())) { + !GV->isStrongDefinitionForLinker()) { if (!GV->hasHiddenVisibility()) { SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = @@ -369,28 +369,70 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 6*4; + EncodedBytes = 0; // Materialize the jump address: EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(32).addImm(16)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF)); + ++EncodedBytes; + // Save the current TOC pointer before the remote call. + int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; + + + // If we're on ELFv1, then we need to load the actual function pointer from + // the function descriptor.
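Background for the loads that follow (the struct below is an illustration of the ABI layout, not a type used by this patch): an ELFv1 function pointer addresses a three-doubleword descriptor rather than code, so an indirect call has to dereference it first.

#include <cstdint>

// PPC64 ELFv1 function descriptor. The sequence below loads the TOC word
// (offset 8) into X2 and the entry word (offset 0) into the scratch register;
// the environment word is deliberately not loaded so that r11 stays free for
// a 'nest' argument.
struct FunctionDescriptor {
  std::uint64_t EntryPoint; // offset 0: address of the first instruction
  std::uint64_t TOCBase;    // offset 8: TOC pointer the callee expects in r2
  std::uint64_t EnvPointer; // offset 16: environment pointer (r11)
};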
+ if (!Subtarget->isELFv2ABI()) { + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter). + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(8) + .addReg(ScratchReg)); + ++EncodedBytes; + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(ScratchReg) + .addImm(0) + .addReg(ScratchReg)); + ++EncodedBytes; + } + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8).addReg(ScratchReg)); + ++EncodedBytes; EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + ++EncodedBytes; + + // Restore the TOC pointer after the call. + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addImm(TOCSaveOffset) + .addReg(PPC::X1)); + ++EncodedBytes; } + // Each instruction is 4 bytes. + EncodedBytes *= 4; + // Emit padding. unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && @@ -624,7 +666,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { IsExternal = GV->isDeclaration(); IsCommon = GV->hasCommonLinkage(); IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker()); + !GV->isStrongDefinitionForLinker(); IsAvailExt = GV->hasAvailableExternallyLinkage(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); @@ -706,7 +748,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = getSymbol(GV); IsExternal = GV->isDeclaration(); IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker()); + !GV->isStrongDefinitionForLinker(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 416131745806..baadf081a64c 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -351,8 +351,9 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { Opcode = ISD::FTRUNC; break; } - MVT VTy = - TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true); + auto &DL = CI->getModule()->getDataLayout(); + MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(), + true); if (VTy == MVT::Other) return true; diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index 874a6fce0656..5bc9124f8085 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -133,6 +133,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ // register having an odd register number. CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>, + // The 'nest' parameter, if any, is passed in R11. + CCIfNest<CCAssignToReg<[R11]>>, + // The first 8 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index fafcd76f9d18..5f236f744fc4 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -262,7 +262,7 @@ static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) { // fast-isel, and return its equivalent machine type in VT. // FIXME: Copied directly from ARM -- factor into base class?
bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT Evt = TLI.getValueType(Ty, true); + EVT Evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. if (Evt == MVT::Other || !Evt.isSimple()) return false; @@ -324,12 +324,13 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { return PPCComputeAddress(U->getOperand(0), Addr); case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return PPCComputeAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return PPCComputeAddress(U->getOperand(0), Addr); break; case Instruction::GetElementPtr: { @@ -799,7 +800,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, bool IsZExt, unsigned DestReg) { Type *Ty = SrcValue1->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); + EVT SrcEVT = TLI.getValueType(DL, Ty, true); if (!SrcEVT.isSimple()) return false; MVT SrcVT = SrcEVT.getSimpleVT(); @@ -893,8 +894,8 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, // Attempt to fast-select a floating-point extend instruction. bool PPCFastISel::SelectFPExt(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f32 || DestVT != MVT::f64) return false; @@ -911,8 +912,8 @@ bool PPCFastISel::SelectFPExt(const Instruction *I) { // Attempt to fast-select a floating-point truncate instruction. bool PPCFastISel::SelectFPTrunc(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::f64 || DestVT != MVT::f32) return false; @@ -992,7 +993,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { return false; Value *Src = I->getOperand(0); - EVT SrcEVT = TLI.getValueType(Src->getType(), true); + EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true); if (!SrcEVT.isSimple()) return false; @@ -1157,7 +1158,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { // Attempt to fast-select a binary integer operation that isn't already // handled automatically. bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); // We can get here in the case when we have a binary operation on a non-legal // type and the target independent selector doesn't know how to handle it. @@ -1594,7 +1595,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; @@ -1641,7 +1642,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); unsigned SrcReg = Reg + VA.getValNo(); - EVT RVEVT = TLI.getValueType(RV->getType()); + EVT RVEVT = TLI.getValueType(DL, RV->getType()); if (!RVEVT.isSimple()) return false; MVT RVVT = RVEVT.getSimpleVT(); @@ -1769,8 +1770,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) { // Attempt to fast-select an integer truncate instruction. bool PPCFastISel::SelectTrunc(const Instruction *I) { Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); - EVT DestVT = TLI.getValueType(I->getType(), true); + EVT SrcVT = TLI.getValueType(DL, Src->getType(), true); + EVT DestVT = TLI.getValueType(DL, I->getType(), true); if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16) return false; @@ -1806,8 +1807,8 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) { if (!SrcReg) return false; EVT SrcEVT, DestEVT; - SrcEVT = TLI.getValueType(SrcTy, true); - DestEVT = TLI.getValueType(DestTy, true); + SrcEVT = TLI.getValueType(DL, SrcTy, true); + DestEVT = TLI.getValueType(DL, DestTy, true); if (!SrcEVT.isSimple()) return false; if (!DestEVT.isSimple()) @@ -1979,7 +1980,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // on the "if" path here. if (CModel == CodeModel::Large || (GV->getType()->getElementType()->isFunctionTy() && - (GV->isDeclaration() || GV->isWeakForLinker())) || + !GV->isStrongDefinitionForLinker()) || GV->isDeclaration() || GV->hasCommonLinkage() || GV->hasAvailableExternallyLinkage()) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), @@ -2127,7 +2128,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, // Materialize a constant into a register, and return the register // number (or zero if we failed to handle it). unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. 
if (!CEVT.isSimple()) return 0; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b4008e4a886a..87229d80d9c1 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + if (MRI.isPhysRegModified(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother @@ -1158,9 +1159,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, } } -void -PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *) const { +void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); @@ -1168,8 +1171,7 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); unsigned LR = RegInfo->getRARegister(); FI->setMustSaveLR(MustSaveLR(MF, LR)); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.setPhysRegUnused(LR); + SavedRegs.reset(LR); // Save R31 if necessary int FPSI = FI->getFramePointerSaveIndex(); @@ -1214,9 +1216,9 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. if (!isPPC64 && !isDarwinABI && - (MRI.isPhysRegUsed(PPC::CR2) || - MRI.isPhysRegUsed(PPC::CR3) || - MRI.isPhysRegUsed(PPC::CR4))) { + (SavedRegs.test(PPC::CR2) || + SavedRegs.test(PPC::CR3) || + SavedRegs.test(PPC::CR4))) { int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true); FI->setCRSpillFrameIndex(FrameIdx); } diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 28d074ecd79d..d6a389bfbf0d 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -45,8 +45,8 @@ public: bool needsFP(const MachineFunction &MF) const; void replaceFPWithRealFP(MachineFunction &MF) const; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index c85c2610d2f5..01a3acb742e6 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -102,7 +102,8 @@ namespace { /// getSmallIPtrImm - Return a target constant of pointer type. 
inline SDValue getSmallIPtrImm(unsigned Imm, SDLoc dl) { - return CurDAG->getTargetConstant(Imm, dl, PPCLowering->getPointerTy()); + return CurDAG->getTargetConstant( + Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout())); } /// isRotateAndMask - Returns true if Mask and Shift can be folded into a @@ -313,7 +314,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { const Module *M = MF->getFunction()->getParent(); DebugLoc dl; - if (PPCLowering->getPointerTy() == MVT::i32) { + if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { if (PPCSubTarget->isTargetELF()) { GlobalBaseReg = PPC::R30; if (M->getPICLevel() == PICLevel::Small) { @@ -342,7 +343,8 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } } return CurDAG->getRegister(GlobalBaseReg, - PPCLowering->getPointerTy()).getNode(); + PPCLowering->getPointerTy(CurDAG->getDataLayout())) + .getNode(); } /// isIntS16Immediate - This method tests to see if the node is either a 32-bit @@ -2205,7 +2207,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { SDLoc dl(N); unsigned Imm; ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); if (!PPCSubTarget->useCRBits() && @@ -2468,10 +2471,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; - return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl, - LD->getValueType(0), - PPCLowering->getPointerTy(), - MVT::Other, Ops)); + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); } else { unsigned Opcode; bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; @@ -2506,10 +2510,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Base, Offset, Chain }; - return transferMemOperands(N, CurDAG->getMachineNode(Opcode, dl, - LD->getValueType(0), - PPCLowering->getPointerTy(), - MVT::Other, Ops)); + return transferMemOperands( + N, CurDAG->getMachineNode( + Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, + Ops)); } } @@ -2662,7 +2667,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::SELECT_CC: { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); // If this is a select of i1 operands, we'll pattern match it. @@ -2901,7 +2907,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { const GlobalValue *GValue = G->getGlobal(); if ((GValue->getType()->getElementType()->isFunctionTy() && - (GValue->isDeclaration() || GValue->isWeakForLinker())) || + !GValue->isStrongDefinitionForLinker()) || GValue->isDeclaration() || GValue->hasCommonLinkage() || GValue->hasAvailableExternallyLinkage()) return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, @@ -2915,7 +2921,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { // Generate a PIC-safe GOT reference. 
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); - return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(), MVT::i32); + return CurDAG->SelectNodeTo( + N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(CurDAG->getDataLayout()), + MVT::i32); } case PPCISD::VADD_SPLAT: { // This expands into one of three sequences, depending on whether @@ -3398,9 +3406,8 @@ void PPCDAGToDAGISel::PeepholeCROps() { bool IsModified; do { IsModified = false; - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I); + for (SDNode &Node : CurDAG->allnodes()) { + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); if (!MachineNode || MachineNode->use_empty()) continue; SDNode *ResNode = MachineNode; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 594472bbb47b..0ed9b051ffed 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -952,7 +952,8 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. -unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { +unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { // Darwin passes everything on 4 byte boundary. if (Subtarget.isDarwin()) return 4; @@ -1055,7 +1056,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT PPCTargetLowering::getSetCCResultType(LLVMContext &C, EVT VT) const { +EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, + EVT VT) const { if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; @@ -1101,7 +1103,7 @@ static bool isConstantOrUndef(int Op, int Val) { /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1132,7 +1134,7 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1174,7 +1176,7 @@ bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, if (!Subtarget.hasP8Vector()) return false; - bool IsLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool IsLE = DAG.getDataLayout().isLittleEndian(); if (ShuffleKind == 0) { if (IsLE) return false; @@ -1237,7 +1239,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). 
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 0, 0); else if (ShuffleKind == 2) // swapped @@ -1262,7 +1264,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { if (ShuffleKind == 1) // unary return isVMerge(N, UnitSize, 8, 8); else if (ShuffleKind == 2) // swapped @@ -1352,7 +1354,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { - if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (DAG.getDataLayout().isLittleEndian()) { unsigned indexOffset = CheckEven ? 4 : 0; if (ShuffleKind == 1) // Unary return isVMerge(N, indexOffset, 0); @@ -1399,7 +1401,7 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, if (ShiftAmt < i) return -1; ShiftAmt -= i; - bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + bool isLE = DAG.getDataLayout().isLittleEndian(); if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { // Check the rest of the elements to see if they are consecutive. @@ -1456,7 +1458,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); - if (DAG.getTarget().getDataLayout()->isLittleEndian()) + if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else return SVOp->getMaskElt(0) / EltSize; @@ -1796,7 +1798,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } } - Disp = DAG.getTargetConstant(0, dl, getPointerTy()); + Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); @@ -2084,7 +2086,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); const Module *M = DAG.getMachineFunction().getFunction()->getParent(); PICLevel::Level picLevel = M->getPICLevel(); @@ -2270,7 +2272,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); @@ -2399,11 +2401,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = 
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); - Type *IntPtrTy = - DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType( - *DAG.getContext()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2440,7 +2440,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2476,8 +2476,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); - - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT); @@ -2797,7 +2796,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); @@ -3023,7 +3022,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( assert(!(CallConv == CallingConv::Fast && isVarArg) && "fastcc not supported on varargs functions"); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && (CallConv == CallingConv::Fast)); @@ -3059,12 +3058,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned NumBytes = LinkageSize; unsigned AvailableFPRs = Num_FPR_Regs; unsigned AvailableVRs = Num_VR_Regs; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (Ins[i].Flags.isNest()) + continue; + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, NumBytes, AvailableFPRs, AvailableVRs, Subtarget.hasQPX())) HasParameterArea = true; + } // Add DAG nodes to load the arguments or copy them out of registers. On // entry to a function on PPC, the arguments start after the linkage area, @@ -3216,6 +3219,17 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::i1: case MVT::i32: case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. 
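Background, not part of the diff: 'nest' is the attribute LLVM places on the static-chain argument of trampoline-based nested functions, roughly declare void @f(i8* nest %chain, i32 %x) in IR. This import assigns it to r11 on PowerPC (R11 in the 32-bit calling convention above, the X11 live-in below). On 64-bit ELFv1 the same register carries the environment pointer, which is why the call-lowering changes further down thread a hasNest flag to keep the two uses apart.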
+ unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + + break; + } + // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. @@ -3425,7 +3439,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( MachineFrameInfo *MFI = MF.getFrameInfo(); PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; // Potential tail calls could cause overwriting of argument stack slots. bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && @@ -3845,7 +3859,8 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { return nullptr; // Top 6 bits have to be sext of immediate. return DAG.getConstant((int)C->getZExtValue() >> 2, SDLoc(Op), - DAG.getTargetLoweringInfo().getPointerTy()).getNode(); + DAG.getTargetLoweringInfo().getPointerTy( + DAG.getDataLayout())).getNode(); } namespace { @@ -3991,7 +4006,7 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, bool isVector, SmallVectorImpl<SDValue> &MemOpChains, SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, SDLoc dl) { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); if (!isTailCall) { if (isVector) { SDValue StackPtr; @@ -4053,7 +4068,7 @@ static bool isFunctionGlobalAddress(SDValue Callee) { static unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, SDValue CallSeqStart, SDLoc dl, int SPDiff, - bool isTailCall, bool IsPatchPoint, + bool isTailCall, bool IsPatchPoint, bool hasNest, SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { @@ -4062,7 +4077,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); NodeTys.push_back(MVT::Other); // Returns a chain NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. @@ -4084,8 +4099,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, if ((DAG.getTarget().getRelocationModel() != Reloc::Static && (Subtarget.getTargetTriple().isMacOSX() && Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && - (G->getGlobal()->isDeclaration() || - G->getGlobal()->isWeakForLinker())) || + !G->getGlobal()->isStrongDefinitionForLinker()) || (Subtarget.isTargetELF() && !isPPC64 && !G->getGlobal()->hasLocalLinkage() && DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { @@ -4196,11 +4210,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, Chain = TOCVal.getValue(0); InFlag = TOCVal.getValue(1); - SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, - InFlag); + // If the function call has an explicit 'nest' parameter, it takes the + // place of the environment pointer. 
+ if (!hasNest) { + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, + InFlag); - Chain = EnvVal.getValue(0); - InFlag = EnvVal.getValue(1); + Chain = EnvVal.getValue(0); + InFlag = EnvVal.getValue(1); + } MTCTROps[0] = Chain; MTCTROps[1] = LoadFuncPtr; @@ -4218,7 +4236,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, CallOpc = PPCISD::BCTRL; Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) - if (isSVR4ABI && isPPC64 && !isELFv2ABI) + if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); // Add CTR register as callee so a bctr can be emitted later. if (isTailCall) @@ -4254,8 +4272,7 @@ static bool isLocalCall(const SDValue &Callee) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - return !G->getGlobal()->isDeclaration() && - !G->getGlobal()->isWeakForLinker(); + return G->getGlobal()->isStrongDefinitionForLinker(); return false; } @@ -4308,7 +4325,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, bool isVarArg, bool IsPatchPoint, - SelectionDAG &DAG, + bool hasNest, SelectionDAG &DAG, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, SDValue Chain, @@ -4321,8 +4338,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, std::vector<EVT> NodeTys; SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, - SPDiff, isTailCall, IsPatchPoint, RegsToPass, - Ops, NodeTys, CS, Subtarget); + SPDiff, isTailCall, IsPatchPoint, hasNest, + RegsToPass, Ops, NodeTys, CS, Subtarget); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) @@ -4381,7 +4398,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, // allocated and an unnecessary move instruction being generated. CallOpc = PPCISD::BCTRL_LOAD_TOC; - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); @@ -4586,7 +4603,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, unsigned LocMemOffset = ByValVA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), + StackPtr, PtrOff); // Create a copy of the argument in the local area of the current // stack frame. 
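A note on the environment-pointer juggling in the PrepareCall hunk above: under the 64-bit ELFv1 ABI, an indirect callee is reached through a three-doubleword function descriptor, and the third doubleword is what normally lands in X11. A rough sketch of that layout, with illustrative field names only (this struct is not an LLVM type):

#include <cstdint>
// ELFv1 function descriptor as consumed by the indirect-call sequence:
// the entry point is branched to via CTR, the TOC pointer is loaded into
// X2, and the environment pointer into X11. A 'nest' argument also
// travels in X11, which is why hasNest suppresses the EnvVal copy and
// the extra X11 operand above.
struct FunctionDescriptor {
  uint64_t EntryPoint; // -> CTR (mtctr/bctrl)
  uint64_t TOCBase;    // -> X2
  uint64_t EnvPointer; // -> X11, unless a 'nest' argument claimed it
};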
@@ -4623,7 +4641,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, if (!isTailCall) { SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), + StackPtr, PtrOff); MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(), @@ -4664,7 +4683,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } @@ -4703,8 +4723,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); + bool hasNest = false; - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned PtrByteSize = 8; MachineFunction &MF = DAG.getMachineFunction(); @@ -4758,6 +4779,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, EVT ArgVT = Outs[i].VT; EVT OrigVT = Outs[i].ArgVT; + if (Flags.isNest()) + continue; + if (CallConv == CallingConv::Fast) { if (Flags.isByVal()) NumGPRsUsed += (Flags.getByValSize()+7)/8; @@ -5021,6 +5045,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, case MVT::i1: case MVT::i32: case MVT::i64: + if (Flags.isNest()) { + // The 'nest' parameter, if any, is passed in R11. + RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); + hasNest = true; + break; + } + // These can be scalar arguments or elements of an integer array type // passed directly. Clang may use those instead of "byval" aggregate // types to avoid forcing arguments to memory unnecessarily. @@ -5302,9 +5333,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, - RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, - NumBytes, Ins, InVals, CS); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + Callee, SPDiff, NumBytes, Ins, InVals, CS); } SDValue @@ -5320,7 +5351,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, unsigned NumOps = Outs.size(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); bool isPPC64 = PtrVT == MVT::i64; unsigned PtrByteSize = isPPC64 ? 
8 : 4; @@ -5693,7 +5724,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, DAG, + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, + /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CS); } @@ -5764,7 +5796,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); // Get the correct type for pointers. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. bool isPPC64 = Subtarget.isPPC64(); @@ -5794,7 +5826,7 @@ SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. @@ -5817,7 +5849,7 @@ SDValue PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Get current frame pointer save index. The users of this index will be // primarily DYNALLOC instructions. @@ -5845,7 +5877,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDLoc dl(Op); // Get the correct type for pointers. - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); // Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, DAG.getConstant(0, dl, PtrVT), Size); @@ -5888,8 +5920,9 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BasePtr = LD->getBasePtr(); MachineMemOperand *MMO = LD->getMemOperand(); - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain, - BasePtr, MVT::i8, MMO); + SDValue NewLD = + DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, + BasePtr, MVT::i8, MMO); SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; @@ -5913,7 +5946,8 @@ SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue Value = ST->getValue(); MachineMemOperand *MMO = ST->getMemOperand(); - Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value); + Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), + Value); return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); } @@ -6374,7 +6408,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SINT.getOpcode() == ISD::ZERO_EXTEND)) && SINT.getOperand(0).getValueType() == MVT::i32) { MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -6419,7 +6453,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // then lfd it and fcfid it. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); SDValue Ld; if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { @@ -6506,7 +6540,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); // Save FP Control Word to register EVT NodeTys[] = { @@ -6727,7 +6761,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); assert(BVN->getNumOperands() == 4 && @@ -6760,9 +6794,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(), - 16 /* alignment */); - + SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), + 16 /* alignment */); + SmallVector<SDValue, 2> Ops; Ops.push_back(DAG.getEntryNode()); Ops.push_back(CPIdx); @@ -7453,7 +7487,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, // Create a stack slot that is 16-byte aligned. MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); // Store the input value into Value#0 of the stack slot. 
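The i1 load/store lowerings above widen the value through i8 memory operations; a C-level analogue of the node sequence, offered as a sketch of the scheme rather than the lowering code itself:

#include <cstdint>
// load i1: EXTLOAD an i8, then TRUNCATE the result down to i1.
bool load_i1(const uint8_t *p) {
  uint8_t wide = *p; // extending i8 load
  return wide & 1;   // truncate to i1
}
// store i1: ZERO_EXTEND to the pointer-width type, then truncstore as i8.
void store_i1(uint8_t *p, bool v) {
  uint64_t wide = v;  // zero extension
  *p = (uint8_t)wide; // truncating i8 store
}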
@@ -7499,7 +7533,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SDValue StoreChain = DAG.getEntryNode(); @@ -7651,9 +7685,9 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SmallVector<SDValue, 8> Stores; for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getConstant(Idx, dl, getVectorIdxTy())); + SDValue Ex = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, + DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); SDValue Store; if (ScalarVT != ScalarMemVT) Store = @@ -7715,7 +7749,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); SmallVector<SDValue, 2> Ops; @@ -7920,7 +7954,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, assert(N->getValueType(0) == MVT::i1 && "Unexpected result type for CTR decrement intrinsic"); - EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0)); + EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), N->getOperand(1)); @@ -8248,7 +8283,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate @@ -8386,7 +8421,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -9032,6 +9067,19 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Target Optimization Hooks //===----------------------------------------------------------------------===// +static std::string getRecipOp(const char *Base, EVT VT) { + std::string RecipOp(Base); + if (VT.getScalarType() == MVT::f64) + RecipOp += "d"; + else + RecipOp += "f"; + + if (VT.isVector()) + RecipOp = "vec-" + RecipOp; + + return RecipOp; +} + SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps, @@ -9043,13 +9091,12 @@ SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand, (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { - // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. For both FRE and FRSQRTE, the minimum - // architected relative accuracy is 2^-5. 
When hasRecipPrec(), this is - // 2^-14. IEEE float has 23 digits and double has 52 digits. - RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; - if (VT.getScalarType() == MVT::f64) - ++RefinementSteps; + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + std::string RecipOp = getRecipOp("sqrt", VT); + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); UseOneConstNR = true; return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); } @@ -9066,13 +9113,12 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, (VT == MVT::v2f64 && Subtarget.hasVSX()) || (VT == MVT::v4f32 && Subtarget.hasQPX()) || (VT == MVT::v4f64 && Subtarget.hasQPX())) { - // Convergence is quadratic, so we essentially double the number of digits - // correct after every iteration. For both FRE and FRSQRTE, the minimum - // architected relative accuracy is 2^-5. When hasRecipPrec(), this is - // 2^-14. IEEE float has 23 digits and double has 52 digits. - RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; - if (VT.getScalarType() == MVT::f64) - ++RefinementSteps; + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + std::string RecipOp = getRecipOp("div", VT); + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); } return SDValue(); } @@ -9854,7 +9900,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, assert(N->getOpcode() == ISD::SIGN_EXTEND && "Invalid extension type"); - EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0)); + EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); return DAG.getNode(ISD::SRA, dl, N->getValueType(0), @@ -10145,9 +10191,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - unsigned ScalarABIAlignment = getDataLayout()->getABITypeAlignment(STy); + unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. @@ -10219,7 +10265,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, 2*MemVT.getStoreSize()-1); // Create the new base load.
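The step counts that getRsqrtEstimate and getRecipEstimate now pull from Options.Reciprocals preserve the arithmetic of the comments deleted above; a standalone check of that arithmetic (plain C++, not LLVM code):

// Newton-Raphson refinement converges quadratically, doubling the number
// of correct bits per step. Starting from the architected 2^-5 estimate
// (2^-14 with hasRecipPrec()), count the steps needed to cover float's 23
// and double's 52 fraction bits.
constexpr unsigned stepsNeeded(unsigned haveBits, unsigned wantBits) {
  return haveBits >= wantBits ? 0 : 1 + stepsNeeded(2 * haveBits, wantBits);
}
static_assert(stepsNeeded(5, 23) == 3, "old RefinementSteps = 3");
static_assert(stepsNeeded(5, 52) == 4, "old RefinementSteps + 1 for f64");
static_assert(stepsNeeded(14, 23) == 1, "old RefinementSteps = 1");
static_assert(stepsNeeded(14, 52) == 2, "old RefinementSteps + 1 for f64");

These are exactly the defaults installed via setDefaults in the PPCTargetMachine.cpp hunk further down.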
- SDValue LDXIntID = DAG.getTargetConstant(IntrLD, dl, getPointerTy()); + SDValue LDXIntID = + DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; SDValue BaseLoad = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, @@ -10243,7 +10290,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (!findConsecutiveLoad(LD, DAG)) --IncValue; - SDValue Increment = DAG.getConstant(IncValue, dl, getPointerTy()); + SDValue Increment = + DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); MachineMemOperand *ExtraMMO = @@ -10691,7 +10739,7 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { /// getConstraintType - Given a constraint, return the type of /// constraint it is for this target. PPCTargetLowering::ConstraintType -PPCTargetLowering::getConstraintType(const std::string &Constraint) const { +PPCTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; @@ -10776,7 +10824,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight( std::pair<unsigned, const TargetRegisterClass *> PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC RS6000 Constraint Letters @@ -10923,8 +10971,8 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) @@ -10977,22 +11025,22 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); FuncInfo->setLRStoreRequired(); bool isPPC64 = Subtarget.isPPC64(); + auto PtrVT = getPointerTy(MF.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, isPPC64 ? MVT::i64 : MVT::i32); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), - FrameAddr, Offset), + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address off the stack. 
SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, false, 0); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, + MachinePointerInfo(), false, false, false, 0); } SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, @@ -11000,13 +11048,13 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc dl(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - bool isPPC64 = PtrVT == MVT::i64; - MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); + bool isPPC64 = PtrVT == MVT::i64; + // Naked functions never have a frame pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned FrameReg; @@ -11026,8 +11074,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned PPCTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { bool isPPC64 = Subtarget.isPPC64(); bool isDarwinABI = Subtarget.isDarwinABI(); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 02242b512a4f..6e13533cfdb3 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -423,7 +423,9 @@ namespace llvm { /// DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } bool isCheapToSpeculateCttz() const override { return true; @@ -434,7 +436,8 @@ } /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; /// Return true if target always benefits from combining into FMA for a /// given value type. This must typically return false on targets where FMA @@ -487,7 +490,8 @@ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, @@ -519,8 +523,7 @@ namespace llvm { MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const; - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type.
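For the Depth > 0 path of LowerRETURNADDR above, an informal C analogue may help; the 16-byte offset stands in for getReturnSaveOffset() on 64-bit ELF and is illustrative, not authoritative:

// Roughly what __builtin_return_address(1) lowers to: walk one frame up
// the back chain, then load the saved LR from the caller's linkage area.
// (__builtin_frame_address with a nonzero argument is itself only
// best-effort, mirroring the limits of deep unwinding here.)
void *return_address_depth1() {
  char *callerFrame = (char *)__builtin_frame_address(1);
  return *(void **)(callerFrame + 16); // LR save slot
}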
@@ -529,13 +532,13 @@ namespace llvm { std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual /// alignment, not its logarithm. - unsigned getByValTypeAlignment(Type *Ty) const override; + unsigned getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. @@ -544,8 +547,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "es") return InlineAsm::Constraint_es; else if (ConstraintCode == "o") @@ -561,8 +564,8 @@ namespace llvm { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -745,7 +748,7 @@ namespace llvm { SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, - bool isVarArg, bool IsPatchPoint, + bool isVarArg, bool IsPatchPoint, bool hasNest, SelectionDAG &DAG, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 696a83860e53..bf6e40296405 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -57,6 +57,10 @@ static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy", cl::desc("Causes the backend to crash instead of generating a nop VSX copy"), cl::Hidden); +static cl::opt<bool> +UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, + cl::desc("Use the old (incorrect) instruction latency calculation")); + // Pin the vtable to this file. void PPCInstrInfo::anchor() {} @@ -103,6 +107,35 @@ PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, return new ScoreboardHazardRecognizer(II, DAG); } +unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (!ItinData || UseOldLatencyCalc) + return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost); + + // The default implementation of getInstrLatency calls getStageLatency, but + // getStageLatency does not do the right thing for us. While we have + // itinerary, most cores are fully pipelined, and so the itineraries only + // express the first part of the pipeline, not every stage. Instead, we need + // to use the listed output operand cycle number (using operand 0 here, which + // is an output). 
+ + unsigned Latency = 1; + unsigned DefClass = MI->getDesc().getSchedClass(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef() || MO.isImplicit()) + continue; + + int Cycle = ItinData->getOperandCycle(DefClass, i); + if (Cycle < 0) + continue; + + Latency = std::max(Latency, (unsigned) Cycle); + } + + return Latency; +} int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index e2d6346aa532..40badae644d6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -95,6 +95,10 @@ public: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + int getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43ba4994fde6..20c95fe888e0 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -989,6 +989,18 @@ def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B), def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B), (XVDIVDP $A, $B)>; +// Reciprocal estimate +def : Pat<(int_ppc_vsx_xvresp v4f32:$A), + (XVRESP $A)>; +def : Pat<(int_ppc_vsx_xvredp v2f64:$A), + (XVREDP $A)>; + +// Recip. square root estimate +def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A), + (XVRSQRTESP $A)>; +def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A), + (XVRSQRTEDP $A)>; + } // AddedComplexity } // HasVSX @@ -1013,6 +1025,9 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. v4i32:$XB)))]>; } // isCommutable + def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B), + (XXLEQV $A, $B)>; + def XXLORC : XX3Form<60, 170, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xxlorc $XT, $XA, $XB", IIC_VecGeneral, diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 656376c641aa..2b09b2f625de 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -165,8 +165,7 @@ void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const PPCFrameLowering *PPCFI = - static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); + const PPCFrameLowering *TFI = getFrameLowering(MF); // The ZERO register is not really a register, but the representation of r0 // when used in instructions that treat r0 as the constant 0. 
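To see what the new getInstrLatency above computes, a toy rendering of the operand-cycle scan with assumed numbers (not a real itinerary):

#include <algorithm>
#include <vector>
// Latency is the maximum listed cycle across def operands. A fully
// pipelined 5-cycle FPU op whose itinerary only describes its first
// stage still reports 5 here, where a stage-latency sum would claim 1.
unsigned latencyFromDefCycles(const std::vector<int> &DefCycles) {
  unsigned Latency = 1;
  for (int Cycle : DefCycles)
    if (Cycle >= 0)
      Latency = std::max(Latency, (unsigned)Cycle);
  return Latency;
}
// latencyFromDefCycles({5}) == 5, matching the [5, 1, 1] operand cycles
// in the P7/P8 itinerary entries added further down.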
@@ -209,7 +208,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::X1); Reserved.set(PPC::X13); - if (PPCFI->needsFP(MF)) + if (TFI->needsFP(MF)) Reserved.set(PPC::X31); if (hasBasePointer(MF)) @@ -230,7 +229,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } - if (PPCFI->needsFP(MF)) + if (TFI->needsFP(MF)) Reserved.set(PPC::R31); if (hasBasePointer(MF)) { @@ -256,8 +255,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const PPCFrameLowering *TFI = getFrameLowering(MF); const unsigned DefaultSafety = 1; switch (RC->getID()) { @@ -341,7 +339,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned FrameSize = MFI->getStackSize(); // Get stack alignments. - unsigned TargetAlign = Subtarget.getFrameLowering()->getStackAlignment(); + const PPCFrameLowering *TFI = getFrameLowering(MF); + unsigned TargetAlign = TFI->getStackAlignment(); unsigned MaxAlign = MFI->getMaxAlignment(); assert((maxCallFrameSize & (MaxAlign-1)) == 0 && "Maximum call-frame size not sufficiently aligned"); @@ -864,8 +863,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + const PPCFrameLowering *TFI = getFrameLowering(MF); if (!TM.isPPC64()) return TFI->hasFP(MF) ? PPC::R31 : PPC::R1; @@ -908,10 +906,10 @@ bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + const PPCFrameLowering *TFI = getFrameLowering(MF); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = Subtarget.getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -946,11 +944,8 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { MachineBasicBlock &MBB = *MI->getParent(); MachineFunction &MF = *MBB.getParent(); - const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); - const PPCFrameLowering *PPCFI = - static_cast<const PPCFrameLowering *>(Subtarget.getFrameLowering()); - unsigned StackEst = - PPCFI->determineFrameLayout(MF, false, true); + const PPCFrameLowering *TFI = getFrameLowering(MF); + unsigned StackEst = TFI->determineFrameLayout(MF, false, true); // If we likely don't need a stack frame, then we probably don't need a // virtual base register either. 
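As a concrete trigger for the needsStackRealignment logic above (a sketch assuming the usual 16-byte PPC stack alignment):

// MaxAlignment = 64 exceeds StackAlign = 16, so this function requires
// realignment, subject to the canRealignStack and attribute checks above.
void overAligned() {
  alignas(64) char buf[128]; // drives MFI->getMaxAlignment() to 64
  asm volatile("" : : "r"(buf)); // keep buf observable
}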
@@ -1034,4 +1029,3 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, MI->getOpcode() == TargetOpcode::PATCHPOINT || (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); } - diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index 635d154d10bf..267f56726180 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -315,6 +315,10 @@ def P7Itineraries : ProcessorItineraries< P7_DU3, P7_DU4], 0>, InstrStage<1, [P7_VS1, P7_VS2]>], [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>, InstrStage<1, [P7_VS1, P7_VS2]>], diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td index 020739baec3a..69e6d05c6604 100644 --- a/lib/Target/PowerPC/PPCScheduleP8.td +++ b/lib/Target/PowerPC/PPCScheduleP8.td @@ -323,6 +323,10 @@ def P8Itineraries : ProcessorItineraries< P8_DU4, P8_DU5, P8_DU6], 0>, InstrStage<1, [P8_FPU1, P8_FPU2]>], [5, 1, 1]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, + P8_DU4, P8_DU5, P8_DU6], 0>, + InstrStage<1, [P8_FPU1, P8_FPU2]>], + [5, 1, 1]>, InstrItinData<IIC_FPCompare , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6], 0>, InstrStage<1, [P8_FPU1, P8_FPU2]>], diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp deleted file mode 100644 index dc1674214769..000000000000 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the PPCSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "PPCTargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "powerpc-selectiondag-info" - -PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - -PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {} diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h deleted file mode 100644 index 2c1378d5670d..000000000000 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the PowerPC subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class PPCTargetMachine; - -class PPCSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit PPCSelectionDAGInfo(const DataLayout *DL); - ~PPCSelectionDAGInfo(); -}; - -} - -#endif diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index cf603fe17723..58daccae90f2 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -53,7 +53,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, IsPPC64(TargetTriple.getArch() == Triple::ppc64 || TargetTriple.getArch() == Triple::ppc64le), TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), - InstrInfo(*this), TLInfo(TM, *this), TSInfo(TM.getDataLayout()) {} + InstrInfo(*this), TLInfo(TM, *this) {} void PPCSubtarget::initializeEnvironment() { StackAlignment = 16; diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index e9cc3d4bd5bc..0616c1f65604 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -17,10 +17,10 @@ #include "PPCFrameLowering.h" #include "PPCISelLowering.h" #include "PPCInstrInfo.h" -#include "PPCSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -129,7 +129,7 @@ protected: PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; PPCTargetLowering TLInfo; - PPCSelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; public: /// This constructor initializes the data members to match that @@ -164,7 +164,7 @@ public: const PPCTargetLowering *getTargetLowering() const override { return &TLInfo; } - const PPCSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } const PPCRegisterInfo *getRegisterInfo() const override { diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 074bc870751a..1daf244fed44 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -172,7 +172,26 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM, CM, OL), TLOF(createTLOF(getTargetTriple())), - TargetABI(computeTargetABI(TT, Options)) { + TargetABI(computeTargetABI(TT, Options)), + Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) { + + // For the estimates, convergence is quadratic, so we essentially double the + // number of digits correct after every iteration. For both FRE and FRSQRTE, + // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), + // this is 2^-14. IEEE float has 23 digits and double has 52 digits. + unsigned RefinementSteps = Subtarget.hasRecipPrec() ? 
1 : 3, + RefinementSteps64 = RefinementSteps + 1; + + this->Options.Reciprocals.setDefaults("sqrtf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("vec-sqrtf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("divf", true, RefinementSteps); + this->Options.Reciprocals.setDefaults("vec-divf", true, RefinementSteps); + + this->Options.Reciprocals.setDefaults("sqrtd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("vec-sqrtd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("divd", true, RefinementSteps64); + this->Options.Reciprocals.setDefaults("vec-divd", true, RefinementSteps64); + initAsmInfo(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 5c0f7e629a69..6496339519a1 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -29,6 +29,8 @@ public: private: std::unique_ptr<TargetLoweringObjectFile> TLOF; PPCABI TargetABI; + PPCSubtarget Subtarget; + mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap; public: diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 25d563a7d975..e21c2b77f4d7 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -317,7 +317,7 @@ unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 35e7a1497c83..368bef93f0dd 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -38,7 +38,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { public: explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. PPCTTIImpl(const PPCTTIImpl &Arg) @@ -46,18 +47,6 @@ public: PPCTTIImpl(PPCTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - PPCTTIImpl &operator=(const PPCTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - PPCTTIImpl &operator=(PPCTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index f352fa647ace..58d3c3d3fa2e 100644 --- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -136,6 +136,16 @@ protected: // source of the copy, it must still be live here. We can't use // interval testing for a physical register, so as long as we're // walking the MIs we may as well test liveness here. + // + // FIXME: There is a case that occurs in practice, like this: + // %vreg9<def> = COPY %F1; VSSRC:%vreg9 + // ... 
+ // %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9 + // %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9 + // %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC: + // %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC: + // %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC: + // which prevents an otherwise-profitable transformation. bool OtherUsers = false, KillsAddendSrc = false; for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI); J != JE; --J) { diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index e7ab71ac2106..3fb1dcc3d4af 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -80,6 +80,7 @@ struct PPCVSXSwapEntry { unsigned int IsSwap : 1; unsigned int MentionsPhysVR : 1; unsigned int IsSwappable : 1; + unsigned int MentionsPartialVR : 1; unsigned int SpecialHandling : 3; unsigned int WebRejected : 1; unsigned int WillRemove : 1; @@ -91,7 +92,9 @@ enum SHValues { SH_INSERT, SH_NOSWAP_LD, SH_NOSWAP_ST, - SH_SPLAT + SH_SPLAT, + SH_XXPERMDI, + SH_COPYSCALAR }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -167,6 +170,21 @@ private: isRegInClass(Reg, &PPC::VRRCRegClass)); } + // Return true iff the given register is a partial vector register. + bool isScalarVecReg(unsigned Reg) { + return (isRegInClass(Reg, &PPC::VSFRCRegClass) || + isRegInClass(Reg, &PPC::VSSRCRegClass)); + } + + // Return true iff the given register mentions all or part of a + // vector register. Also sets Partial to true if the mention + // is for just the floating-point register overlap of the register. + bool isAnyVecReg(unsigned Reg, bool &Partial) { + if (isScalarVecReg(Reg)) + Partial = true; + return isScalarVecReg(Reg) || isVecReg(Reg); + } + public: // Main entry point for this pass. bool runOnMachineFunction(MachineFunction &MF) override { @@ -223,12 +241,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { for (MachineInstr &MI : MBB) { bool RelevantInstr = false; + bool Partial = false; for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); - if (isVecReg(Reg)) { + if (isAnyVecReg(Reg, Partial)) { RelevantInstr = true; break; } @@ -250,8 +269,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { // Unless noted otherwise, an instruction is considered // safe for the optimization. There are a large number of // such true-SIMD instructions (all vector math, logical, - // select, compare, etc.). - SwapVector[VecIdx].IsSwappable = 1; + // select, compare, etc.). However, if the instruction + // mentions a partial vector register and does not have + // special handling defined, it is not swappable. + if (Partial) + SwapVector[VecIdx].MentionsPartialVR = 1; + else + SwapVector[VecIdx].IsSwappable = 1; break; case PPC::XXPERMDI: { // This is a swap if it is of the form XXPERMDI t, s, s, 2. @@ -269,25 +293,37 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { VecIdx); if (trueReg1 == trueReg2) SwapVector[VecIdx].IsSwap = 1; - } + else { + // We can still handle these if the two registers are not + // identical, by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + } // This is a doubleword splat if it is of the form // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we // must look through chains of copy-likes to find the source // register. 
We turn off the marking for mention of a physical - // register, because splatting it is safe; the optimization - // will not swap the value in the physical register. - else if (immed == 0 || immed == 3) { + // will not swap the value in the physical register. Whether + // or not the two input registers are identical, we can handle + // these by adjusting the form of the XXPERMDI. + } else if (immed == 0 || immed == 3) { + + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; + unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(), VecIdx); unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(), VecIdx); - if (trueReg1 == trueReg2) { - SwapVector[VecIdx].IsSwappable = 1; + if (trueReg1 == trueReg2) SwapVector[VecIdx].MentionsPhysVR = 0; - } + + } else { + // We can still handle these by adjusting the form of the XXPERMDI. + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI; } - // Any other form of XXPERMDI is lane-sensitive and unsafe - // for the optimization. break; } case PPC::LVX: @@ -324,7 +360,32 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { if (isVecReg(MI.getOperand(0).getReg()) && isVecReg(MI.getOperand(1).getReg())) SwapVector[VecIdx].IsSwappable = 1; + // If we have a copy from one scalar floating-point register + // to another, we can accept this even if it is a physical + // register. The only way this gets involved is if it feeds + // a SUBREG_TO_REG, which is handled by introducing a swap. + else if (isScalarVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(1).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + break; + case PPC::SUBREG_TO_REG: { + // These are fine provided they are moving between full vector + // register classes. If they are moving from a scalar + // floating-point class to a vector class, we can handle those + // as well, provided we introduce a swap. It is generally the + // case that we will introduce fewer swaps than we remove, but + // (FIXME) a cost model could be used. However, introduced + // swaps could potentially be CSEd, so this is not trivial. + if (isVecReg(MI.getOperand(0).getReg()) && + isVecReg(MI.getOperand(2).getReg())) + SwapVector[VecIdx].IsSwappable = 1; + else if (isVecReg(MI.getOperand(0).getReg()) && + isScalarVecReg(MI.getOperand(2).getReg())) { + SwapVector[VecIdx].IsSwappable = 1; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + } break; + } case PPC::VSPLTB: case PPC::VSPLTH: case PPC::VSPLTW: @@ -425,6 +486,10 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::VUPKLSW: case PPC::XXMRGHW: case PPC::XXMRGLW: + // XXSLDWI could be replaced by a general permute with one of three + // permute control vectors (for shift values 1, 2, 3). However, + // VPERM has a more restrictive register class. + case PPC::XXSLDWI: case PPC::XXSPLTW: break; } @@ -501,18 +566,20 @@ void PPCVSXSwapRemoval::formWebs() { DEBUG(MI->dump()); // It's sufficient to walk vector uses and join them to their unique - // definitions. In addition, check *all* vector register operands - // for physical regs. + // definitions. In addition, check full vector register operands + // for physical regs. We exclude partial-vector register operands + // because we can handle them if copied to a full vector.
for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); - if (!isVecReg(Reg)) + if (!isVecReg(Reg) && !isScalarVecReg(Reg)) continue; if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - SwapVector[EntryIdx].MentionsPhysVR = 1; + if (!(MI->isCopy() && isScalarVecReg(Reg))) + SwapVector[EntryIdx].MentionsPhysVR = 1; continue; } @@ -545,15 +612,21 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) { int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId); - // Reject webs containing mentions of physical registers, or containing - // operations that we don't know how to handle in a lane-permuted region. + // If representative is already rejected, don't waste further time. + if (SwapVector[Repr].WebRejected) + continue; + + // Reject webs containing mentions of physical or partial registers, or + // containing operations that we don't know how to handle in a lane- + // permuted region. if (SwapVector[EntryIdx].MentionsPhysVR || + SwapVector[EntryIdx].MentionsPartialVR || !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, subreg, or not swap[pable]\n", + format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); @@ -588,7 +661,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { } } - // Reject webs than contain swapping stores that are fed by something + // Reject webs that contain swapping stores that are fed by something // other than a swap instruction. } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; @@ -670,7 +743,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. -// FIXME: This code is to be phased in with subsequent patches. +// FIXME: Additional opportunities will be phased in with subsequent +// patches. void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (SwapVector[EntryIdx].SpecialHandling) { @@ -704,6 +778,91 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { break; } + // For an XXPERMDI that isn't handled otherwise, we need to + // reverse the order of the operands. If the selector operand + // has a value of 0 or 3, we need to change it to 3 or 0, + // respectively. Otherwise we should leave it alone. (This + // is equivalent to reversing the two bits of the selector + // operand and complementing the result.) + case SHValues::SH_XXPERMDI: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing XXPERMDI: "); + DEBUG(MI->dump()); + + unsigned Selector = MI->getOperand(3).getImm(); + if (Selector == 0 || Selector == 3) + Selector = 3 - Selector; + MI->getOperand(3).setImm(Selector); + + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + MI->getOperand(1).setReg(Reg2); + MI->getOperand(2).setReg(Reg1); + + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + break; + } + + // For a copy from a scalar floating-point register to a vector + // register, removing swaps will leave the copied value in the + // wrong lane. Insert a swap following the copy to fix this. 
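The parenthetical claim in the SH_XXPERMDI comment above, that the rewrite is equivalent to reversing the selector's two bits and complementing the result, is easy to verify in isolation; a standalone check:

#include <cassert>
// Exchanging the two XXPERMDI sources means rewriting the 2-bit selector
// by bit-reversal plus complement; that collapses to swapping 0 and 3
// while leaving 1 and 2 alone, exactly the "Selector = 3 - Selector"
// rewrite guarded by (Selector == 0 || Selector == 3) in the case above.
unsigned swappedSelector(unsigned Sel) {
  unsigned Reversed = ((Sel & 1) << 1) | ((Sel >> 1) & 1);
  return ~Reversed & 3u;
}
int main() {
  assert(swappedSelector(0) == 3 && swappedSelector(3) == 0);
  assert(swappedSelector(1) == 1 && swappedSelector(2) == 2);
  return 0;
}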
+ case SHValues::SH_COPYSCALAR: { + MachineInstr *MI = SwapVector[EntryIdx].VSEMI; + + DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); + DEBUG(MI->dump()); + + unsigned DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + unsigned NewVReg = MRI->createVirtualRegister(DstRC); + + MI->getOperand(0).setReg(NewVReg); + DEBUG(dbgs() << " Into: "); + DEBUG(MI->dump()); + + MachineBasicBlock::iterator InsertPoint = MI->getNextNode(); + + // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG + // is copying to a VRRC, we need to be careful to avoid a register + // assignment problem. In this case we must copy from VRRC to VSRC + // prior to the swap, and from VSRC to VRRC following the swap. + // Coalescing will usually remove all this mess. + + if (DstRC == &PPC::VRRCRegClass) { + unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), VSRCTmp1) + .addReg(NewVReg); + DEBUG(MI->getNextNode()->dump()); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), VSRCTmp2) + .addReg(VSRCTmp1) + .addReg(VSRCTmp1) + .addImm(2); + DEBUG(MI->getNextNode()->getNextNode()->dump()); + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::COPY), DstReg) + .addReg(VSRCTmp2); + DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); + + } else { + + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(NewVReg) + .addReg(NewVReg) + .addImm(2); + + DEBUG(MI->getNextNode()->dump()); + } + break; + } } } @@ -756,6 +915,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { DEBUG(dbgs() << "swap "); if (SwapVector[EntryIdx].MentionsPhysVR) DEBUG(dbgs() << "physreg "); + if (SwapVector[EntryIdx].MentionsPartialVR) + DEBUG(dbgs() << "partialreg "); if (SwapVector[EntryIdx].IsSwappable) { DEBUG(dbgs() << "swappable "); @@ -780,6 +941,12 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_SPLAT: DEBUG(dbgs() << "special:splat "); break; + case SH_XXPERMDI: + DEBUG(dbgs() << "special:xxpermdi "); + break; + case SH_COPYSCALAR: + DEBUG(dbgs() << "special:copyscalar "); + break; } } diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt index c486411f9a1e..5b7bfdd28020 100644 --- a/lib/Target/Sparc/CMakeLists.txt +++ b/lib/Target/Sparc/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_target(SparcCodeGen SparcRegisterInfo.cpp SparcSubtarget.cpp SparcTargetMachine.cpp - SparcSelectionDAGInfo.cpp SparcMCInstLower.cpp SparcTargetObjectFile.cpp ) diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 91d2eeef0cc0..9113e4a46b96 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -57,7 +57,7 @@ static MCInstrInfo *createSparcMCInstrInfo() { return X; } -static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitSparcMCRegisterInfo(X, SP::O7); return X; @@ -65,11 +65,9 @@ static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); if (CPU.empty()) CPU = (TT.getArch() == 
Triple::sparcv9) ? "v9" : "v8"; - InitSparcMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createSparcMCSubtargetInfoImpl(TT, CPU, FS); } // Code models. Some only make sense for 64-bit code. @@ -83,7 +81,8 @@ createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { // // All code models require that the text segment is smaller than 2GB. -static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSparcMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); @@ -100,7 +99,8 @@ static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } -static MCCodeGenInfo *createSparcV9MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSparcV9MCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index bccc6bdd53eb..8fa10dcae114 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -190,11 +190,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; return true; @@ -206,10 +206,10 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed + || !MRI.reg_nodbg_empty(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { @@ -218,16 +218,13 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MRI.isPhysRegUsed(reg)) + if (MRI.reg_nodbg_empty(reg)) continue; unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(!MRI.isPhysRegUsed(mapped_reg)); + assert(MRI.reg_nodbg_empty(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); - - // Mark the reg unused. - MRI.setPhysRegUnused(reg); } // Rewrite MBB's Live-ins. 
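The isPhysRegUsed to reg_nodbg_empty conversion in this file tightens what counts as a register use; restated as a helper (a sketch, not the MachineRegisterInfo implementation):

#include "llvm/CodeGen/MachineRegisterInfo.h"
// A physical register now counts as used iff some non-debug operand in
// the function references it; DBG_VALUE-only references no longer block
// leaf-procedure detection or the %i -> %o remapping above.
static bool physRegUsed(const llvm::MachineRegisterInfo &MRI, unsigned Reg) {
  return !MRI.reg_nodbg_empty(Reg);
}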
@@ -247,9 +244,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { #endif } -void SparcFrameLowering::processFunctionBeforeCalleeSavedScan - (MachineFunction &MF, RegScavenger *RS) const { - +void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); if (!DisableLeafProc && isLeafProc(MF)) { SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>(); MFI->setLeafProc(true); diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index bb3b78861cbd..29fc7b7ba036 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -36,8 +36,8 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; private: // Remap input registers to output registers for leaf procedure. diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 9c594a9f0f65..340b72e7940f 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -67,13 +67,16 @@ private: SDNode* SparcDAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, + TLI->getPointerTy(CurDAG->getDataLayout())) + .getNode(); } bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr, SDValue &Base, SDValue &Offset) { if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); return true; } @@ -88,8 +91,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr, if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) { // Constant offset from frame ref. - Base = - CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy()); + Base = CurDAG->getTargetFrameIndex( + FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); } else { Base = Addr.getOperand(0); } @@ -134,7 +137,7 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) { } R1 = Addr; - R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy()); + R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy(CurDAG->getDataLayout())); return true; } @@ -168,10 +171,9 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { } else { TopPart = CurDAG->getRegister(SP::G0, MVT::i32); } - TopPart = SDValue(CurDAG->getMachineNode(SP::WRASRrr, dl, MVT::i32, - TopPart, - CurDAG->getRegister(SP::G0, MVT::i32)), 0); - TopPart = CurDAG->getCopyToReg(TopPart, dl, SP::Y, TopPart, SDValue()).getValue(1); + TopPart = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SP::Y, TopPart, + SDValue()) + .getValue(1); // FIXME: Handle div by immediate. unsigned Opcode = N->getOpcode() == ISD::SDIV ? 
SP::SDIVrr : SP::UDIVrr; @@ -184,12 +186,11 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDValue MulLHS = N->getOperand(0); SDValue MulRHS = N->getOperand(1); unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr; - SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Glue, - MulLHS, MulRHS); - // The high part is in the Y register. - return CurDAG->SelectNodeTo(N, SP::RDASR, MVT::i32, - CurDAG->getRegister(SP::Y, MVT::i32), - SDValue(Mul, 1)); + SDNode *Mul = + CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MulLHS, MulRHS); + SDValue ResultHigh = SDValue(Mul, 1); + ReplaceUses(SDValue(N, 0), ResultHigh); + return nullptr; } } diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 0481676dc1ac..4879d4ee79e5 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -221,10 +221,11 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, unsigned Reg = SFI->getSRetReturnReg(); if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT); Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy())); + RetOps.push_back(DAG.getRegister(SP::I0, PtrVT)); RetAddrOffset = 12; // CallInst + Delay Slot + Unimp } @@ -418,6 +419,7 @@ LowerFormalArguments_32(SDValue Chain, assert(VA.isMemLoc()); unsigned Offset = VA.getLocMemOffset()+StackOffset; + auto PtrVT = getPointerTy(DAG.getDataLayout()); if (VA.needsCustom()) { assert(VA.getValVT() == MVT::f64); @@ -426,7 +428,7 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(8, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo(), false,false, false, 0); @@ -437,14 +439,14 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(4, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo(), false, false, false, 0); int FI2 = MF.getFrameInfo()->CreateFixedObject(4, Offset+4, true); - SDValue FIPtr2 = DAG.getFrameIndex(FI2, getPointerTy()); + SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT); SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo(), @@ -460,7 +462,7 @@ LowerFormalArguments_32(SDValue Chain, int FI = MF.getFrameInfo()->CreateFixedObject(4, Offset, true); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT); SDValue Load ; if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) { Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, @@ -607,10 +609,10 @@ LowerFormalArguments_64(SDValue Chain, if (VA.isExtInLoc()) Offset += 8 - ValSize; int FI = MF.getFrameInfo()->CreateFixedObject(ValSize, Offset, true); - InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, - DAG.getFrameIndex(FI, getPointerTy()), - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad( + VA.getValVT(), DL, Chain, + DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), + MachinePointerInfo::getFixedStack(FI), 
false, false, false, 0)); } if (!IsVarArg) @@ -637,10 +639,10 @@ LowerFormalArguments_64(SDValue Chain, unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass); SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true); - OutChains.push_back(DAG.getStore(Chain, DL, VArg, - DAG.getFrameIndex(FI, getPointerTy()), - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + auto PtrVT = getPointerTy(MF.getDataLayout()); + OutChains.push_back( + DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), + MachinePointerInfo::getFixedStack(FI), false, false, 0)); } if (!OutChains.empty()) @@ -722,7 +724,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, unsigned Align = Flags.getByValAlign(); int FI = MFI->CreateStackObject(Size, Align, false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32); Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align, @@ -993,7 +995,7 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); Type *ElementTy = Ty->getElementType(); - return getDataLayout()->getTypeAllocSize(ElementTy); + return DAG.getDataLayout().getTypeAllocSize(ElementTy); } @@ -1057,6 +1059,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; + auto PtrVT = getPointerTy(DAG.getDataLayout()); // Sparc target does not yet support tail call optimization. CLI.IsTailCall = false; @@ -1130,13 +1133,11 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Store and reload into the interger register reg and reg+1. unsigned Offset = 8 * (VA.getLocReg() - SP::I0); unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128; - SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy()); + SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT); SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL); - HiPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - HiPtrOff); + HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff); SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL); - LoPtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, - LoPtrOff); + LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff); // Store to %sp+BIAS+128+Offset SDValue Store = DAG.getStore(Chain, DL, Arg, HiPtrOff, @@ -1180,13 +1181,13 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, assert(VA.isMemLoc()); // Create a store off the stack pointer for this argument. - SDValue StackPtr = DAG.getRegister(SP::O6, getPointerTy()); + SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT); // The argument area starts at %fp+BIAS+128 in the callee frame, // %sp+BIAS+128 in ours. SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + Subtarget->getStackPointerBias() + 128, DL); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); MemOpChains.push_back(DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false, false, 0)); @@ -1215,10 +1216,9 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, unsigned TF = ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
SparcMCExpr::VK_Sparc_WPLT30 : 0); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0, - TF); + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF); else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) - Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy(), TF); + Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF); // Build the operands for the call instruction itself. SmallVector<SDValue, 8> Ops; @@ -1370,6 +1370,8 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + auto &DL = *TM.getDataLayout(); + // Set up the register classes. addRegisterClass(MVT::i32, &SP::IntRegsRegClass); addRegisterClass(MVT::f32, &SP::FPRegsRegClass); @@ -1394,10 +1396,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f64, Expand); // Custom legalize GlobalAddress nodes into LO/HI parts. - setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom); - setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom); - setOperationAction(ISD::ConstantPool, getPointerTy(), Custom); - setOperationAction(ISD::BlockAddress, getPointerTy(), Custom); + setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom); + setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom); // Sparc doesn't have sext_inreg, replace them with shl/sra setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -1704,7 +1706,8 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const { return nullptr; } -EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, + EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -1804,7 +1807,7 @@ SDValue SparcTargetLowering::makeHiLoPair(SDValue Op, // or ExternalSymbol SDNode. SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT VT = getPointerTy(); + EVT VT = getPointerTy(DAG.getDataLayout()); // Handle PIC mode first. if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { @@ -1871,7 +1874,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = getTargetMachine().getTLSModel(GV); @@ -1983,7 +1986,7 @@ SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args, if (ArgTy->isFP128Ty()) { // Create a stack object and pass the pointer to the library function. 
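// (Editor's sketch, not part of the patch: fp128 has no legal register
// class here, so a libcall such as _Q_cmp(a, b) takes its f128 arguments
// indirectly; each value is spilled to the fresh 16-byte, 8-aligned stack
// slot created below, and the slot's address is passed in its place.)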
int FI = MFI->CreateStackObject(16, 8, false); - SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Entry.Node, @@ -2008,8 +2011,9 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, ArgListTy Args; MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy()); + SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT); Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); Type *RetTyABI = RetTy; SDValue Chain = DAG.getEntryNode(); @@ -2019,7 +2023,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, // Create a Stack Object to receive the return value of type f128. ArgListEntry Entry; int RetFI = MFI->CreateStackObject(16, 8, false); - RetPtr = DAG.getFrameIndex(RetFI, getPointerTy()); + RetPtr = DAG.getFrameIndex(RetFI, PtrVT); Entry.Node = RetPtr; Entry.Ty = PointerType::getUnqual(RetTy); if (!Subtarget->is64Bit()) @@ -2082,7 +2086,8 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break; } - SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT); Type *RetTy = Type::getInt32Ty(*DAG.getContext()); ArgListTy Args; SDValue Chain = DAG.getEntryNode(); @@ -2362,6 +2367,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, const SparcTargetLowering &TLI) { MachineFunction &MF = DAG.getMachineFunction(); SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // Need frame address to find the address of VarArgsFrameIndex. MF.getFrameInfo()->setFrameAddressIsTaken(true); @@ -2370,9 +2376,8 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, // memory location argument. SDLoc DL(Op); SDValue Offset = - DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), - DAG.getRegister(SP::I6, TLI.getPointerTy()), - DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL)); + DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT), + DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL)); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); @@ -2497,8 +2502,8 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG, SDValue RetAddr; if (depth == 0) { - unsigned RetReg = MF.addLiveIn(SP::I7, - TLI.getRegClassFor(TLI.getPointerTy())); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT)); RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT); return RetAddr; } @@ -3065,7 +3070,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI, /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. 
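/// (Illustrative note on the signature change below: constraint strings
/// move from const std::string & to StringRef throughout this patch.
/// StringRef is a non-owning pointer+length view, so a call like
/// getConstraintType("r") needs no heap allocation at the call site,
/// while size() and operator[] keep working unchanged.)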
SparcTargetLowering::ConstraintType -SparcTargetLowering::getConstraintType(const std::string &Constraint) const { +SparcTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; @@ -3139,7 +3144,7 @@ LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass *> SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index b6bc3d255713..bbc91a493c9d 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -72,7 +72,7 @@ namespace llvm { const char *getTargetNodeName(unsigned Opcode) const override; - ConstraintType getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; @@ -82,14 +82,16 @@ namespace llvm { SelectionDAG &DAG) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i32; + } /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; SDValue LowerFormalArguments(SDValue Chain, diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td index 670e9e989c81..25cc652dbd9e 100644 --- a/lib/Target/Sparc/SparcInstrAliases.td +++ b/lib/Target/Sparc/SparcInstrAliases.td @@ -245,6 +245,7 @@ multiclass fp_cond_alias<string cond, int condVal> { } defm : int_cond_alias<"a", 0b1000>; +defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual defm : int_cond_alias<"n", 0b0000>; defm : int_cond_alias<"ne", 0b1001>; defm : int_cond_alias<"nz", 0b1001>; // same as ne @@ -266,6 +267,7 @@ defm : int_cond_alias<"vc", 0b1111>; defm : int_cond_alias<"vs", 0b0111>; defm : fp_cond_alias<"a", 0b0000>; +defm : fp_cond_alias<"", 0b0000>; // same as a; gnu asm, not in manual defm : fp_cond_alias<"n", 0b1000>; defm : fp_cond_alias<"u", 0b0111>; defm : fp_cond_alias<"g", 0b0110>; @@ -284,7 +286,16 @@ defm : fp_cond_alias<"le", 0b1101>; defm : fp_cond_alias<"ule", 0b1110>; defm : fp_cond_alias<"o", 0b1111>; -// Instruction aliases for JMPL. +// Section A.3 Synthetic Instructions + +// Most are marked as Emit=0, so that they are not used for disassembly. This is +// an aesthetic issue, but the chosen policy is to typically prefer using the +// non-alias form, except for the most obvious and clarifying aliases: cmp, jmp, +// call, tst, ret, retl. + +// Note: cmp is handled in SparcInstrInfo. 
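// (Illustrative expansions of the synthetic forms defined below; the
// register choices are ours:
//   tst %o1    ->  orcc %g0, %o1, %g0
//   inc %o2    ->  add  %o2, 1, %o2
//   clr [%o3]  ->  st   %g0, [%o3]
// )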
+// jmp/call/ret/retl have special case handling for output in +// SparcInstPrinter.cpp // jmp addr -> jmpl addr, %g0 def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>; @@ -294,25 +305,129 @@ def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>; def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>; def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>; -// retl -> RETL 8 -def : InstAlias<"retl", (RETL 8)>; +// tst reg -> orcc %g0, reg, %g0 +def : InstAlias<"tst $rs2", (ORCCrr G0, IntRegs:$rs2, G0)>; -// ret -> RET 8 +// ret -> jmpl %i7+8, %g0 (aka RET 8) def : InstAlias<"ret", (RET 8)>; -// mov reg, rd -> or %g0, reg, rd -def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>; +// retl -> jmpl %o7+8, %g0 (aka RETL 8) +def : InstAlias<"retl", (RETL 8)>; -// mov simm13, rd -> or %g0, simm13, rd -def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>; +// restore -> restore %g0, %g0, %g0 +def : InstAlias<"restore", (RESTORErr G0, G0, G0)>; + +// save -> restore %g0, %g0, %g0 +def : InstAlias<"save", (SAVErr G0, G0, G0)>; // set value, rd // (turns into a sequence of sethi+or, depending on the value) // def : InstAlias<"set $val, $rd", (ORri IntRegs:$rd, (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>; def SET : AsmPseudoInst<(outs IntRegs:$rd), (ins i32imm:$val), "set $val, $rd">; -// restore -> restore %g0, %g0, %g0 -def : InstAlias<"restore", (RESTORErr G0, G0, G0)>; +// not rd -> xnor rd, %g0, rd +def : InstAlias<"not $rd", (XNORrr IntRegs:$rd, IntRegs:$rd, G0), 0>; + +// not reg, rd -> xnor reg, %g0, rd +def : InstAlias<"not $rs1, $rd", (XNORrr IntRegs:$rd, IntRegs:$rs1, G0), 0>; + +// neg rd -> sub %g0, rd, rd +def : InstAlias<"neg $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rd), 0>; + +// neg reg, rd -> sub %g0, reg, rd +def : InstAlias<"neg $rs2, $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rs2), 0>; + +// inc rd -> add rd, 1, rd +def : InstAlias<"inc $rd", (ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// inc simm13, rd -> add rd, simm13, rd +def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// inccc rd -> addcc rd, 1, rd +def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// inccc simm13, rd -> addcc rd, simm13, rd +def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// dec rd -> sub rd, 1, rd +def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// dec simm13, rd -> sub rd, simm13, rd +def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// deccc rd -> subcc rd, 1, rd +def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>; + +// deccc simm13, rd -> subcc rd, simm13, rd +def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0 +def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>; +def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>; + +// bset reg_or_imm, rd -> or rd,reg_or_imm,rd +def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd +def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + +// btog reg_or_imm, rd 
-> xor rd,reg_or_imm,rd +def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>; +def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>; + + +// clr rd -> or %g0, %g0, rd +def : InstAlias<"clr $rd", (ORrr IntRegs:$rd, G0, G0), 0>; + +// clr{b,h,} [addr] -> st{b,h,} %g0, [addr] +def : InstAlias<"clrb [$addr]", (STBrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clrb [$addr]", (STBri MEMri:$addr, G0), 0>; +def : InstAlias<"clrh [$addr]", (STHrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clrh [$addr]", (STHri MEMri:$addr, G0), 0>; +def : InstAlias<"clr [$addr]", (STrr MEMrr:$addr, G0), 0>; +def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>; + + +// mov reg_or_imm, rd -> or %g0, reg_or_imm, rd +def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>; +def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>; + +// mov specialreg, rd -> rd specialreg, rd +def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>; +def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>; +def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>; +def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>; + +// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg +def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; + +// End of Section A.3 + +// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg +// (aka: omit the first arg when it's g0. 
This is not in the manual, but is +// supported by gnu and solaris as) +def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>; +def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>; +def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; + + +// flush -> flush %g0 +def : InstAlias<"flush", (FLUSH), 0>; + def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>; diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index f87cee43e319..6167c532db80 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -324,6 +324,15 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, numSubRegs = 4; movOpc = SP::FMOVS; } + } else if (SP::ASRRegsRegClass.contains(DestReg) && + SP::IntRegsRegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(SP::WRASRrr), DestReg) + .addReg(SP::G0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else if (SP::IntRegsRegClass.contains(DestReg) && + SP::ASRRegsRegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(SP::RDASR), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } else llvm_unreachable("Impossible reg-to-reg copy"); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index a02bae07a336..3b9e048ea8b3 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -536,6 +536,7 @@ let Defs = [ICC] in let Uses = [ICC] in defm SUBC : F3_12np <"subx", 0b001100>; +// cmp (from Section A.3) is a specialized alias for subcc let Defs = [ICC], rd = 0 in { def CMPrr : F3_1<2, 0b010100, (outs), (ins IntRegs:$rs1, IntRegs:$rs2), @@ -559,12 +560,12 @@ let Defs = [Y, ICC] in { } // Section B.19 - Divide Instructions, p. 115 -let Defs = [Y] in { +let Uses = [Y], Defs = [Y] in { defm UDIV : F3_12np<"udiv", 0b001110>; defm SDIV : F3_12np<"sdiv", 0b001111>; } -let Defs = [Y, ICC] in { +let Uses = [Y], Defs = [Y, ICC] in { defm UDIVCC : F3_12np<"udivcc", 0b011110>; defm SDIVCC : F3_12np<"sdivcc", 0b011111>; } @@ -828,6 +829,20 @@ let rd = 0 in def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22), "unimp $imm22", []>; +// Section B.32 - Flush Instruction Memory +let rd = 0 in { + def FLUSHrr : F3_1<2, 0b111011, (outs), (ins MEMrr:$addr), + "flush $addr", []>; + def FLUSHri : F3_2<2, 0b111011, (outs), (ins MEMri:$addr), + "flush $addr", []>; + + // The no-arg FLUSH is only here for the benefit of the InstAlias + // "flush", which cannot seem to use FLUSHrr, due to the inability + // to construct a MEMrr with fixed G0 registers. + let rs1 = 0, rs2 = 0 in + def FLUSH : F3_1<2, 0b111011, (outs), (ins), "flush %g0", []>; +} + // Section B.33 - Floating-point Operate (FPop) Instructions // Convert Integer to Floating-point Instructions, p. 
141 diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index e504da4d3b21..db8a7e86962d 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -249,4 +249,6 @@ def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; // Ancillary state registers def ASRRegs : RegisterClass<"SP", [i32], 32, - (add Y, (sequence "ASR%u", 1, 31))>; + (add Y, (sequence "ASR%u", 1, 31))> { + let isAllocatable = 0; +} diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp deleted file mode 100644 index a308fc5e739e..000000000000 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===-- SparcSelectionDAGInfo.cpp - Sparc SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SparcSelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "SparcSelectionDAGInfo.h" -using namespace llvm; - -#define DEBUG_TYPE "sparc-selectiondag-info" - -SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) { -} - -SparcSelectionDAGInfo::~SparcSelectionDAGInfo() { -} diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h deleted file mode 100644 index 6818291b30b4..000000000000 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- SparcSelectionDAGInfo.h - Sparc SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the Sparc subclass for TargetSelectionDAGInfo. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H -#define LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class SparcTargetMachine; - -class SparcSelectionDAGInfo : public TargetSelectionDAGInfo { -public: - explicit SparcSelectionDAGInfo(const DataLayout &DL); - ~SparcSelectionDAGInfo() override; -}; - -} - -#endif diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 479b25d2723f..d69da409e428 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -54,7 +54,7 @@ SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU, bool is64Bit) : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering(*this) {} + FrameLowering(*this) {} int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index 983b1193975d..9d21911d88f0 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -17,9 +17,9 @@ #include "SparcFrameLowering.h" #include "SparcInstrInfo.h" #include "SparcISelLowering.h" -#include "SparcSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -39,7 +39,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { bool UsePopc; SparcInstrInfo InstrInfo; SparcTargetLowering TLInfo; - SparcSelectionDAGInfo TSInfo; + TargetSelectionDAGInfo TSInfo; SparcFrameLowering FrameLowering; public: @@ -56,7 +56,7 @@ public: const SparcTargetLowering *getTargetLowering() const override { return &TLInfo; } - const SparcSelectionDAGInfo *getSelectionDAGInfo() const override { + const TargetSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 81882106fc46..5fefa315a4cf 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -148,7 +148,7 @@ static MCInstrInfo *createSystemZMCInstrInfo() { return X; } -static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitSystemZMCRegisterInfo(X, SystemZ::R14D); return X; @@ -156,12 +156,11 @@ static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitSystemZMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createSystemZMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createSystemZMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index a636b35635ce..397de472a6ee 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ 
b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -61,11 +61,12 @@ SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { return SpillOffsetTable; } -void SystemZFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + MachineFrameInfo *MFFrame = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); bool HasFP = hasFP(MF); SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>(); @@ -77,17 +78,17 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // argument register R6D. if (IsVarArg) for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I) - MRI.setPhysRegUsed(SystemZ::ArgGPRs[I]); + SavedRegs.set(SystemZ::ArgGPRs[I]); // If the function requires a frame pointer, record that the hard // frame pointer will be clobbered. if (HasFP) - MRI.setPhysRegUsed(SystemZ::R11D); + SavedRegs.set(SystemZ::R11D); // If the function calls other functions, record that the return // address register will be clobbered. if (MFFrame->hasCalls()) - MRI.setPhysRegUsed(SystemZ::R14D); + SavedRegs.set(SystemZ::R14D); // If we are saving GPRs other than the stack pointer, we might as well // save and restore the stack pointer at the same time, via STMG and LMG. @@ -96,8 +97,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); for (unsigned I = 0; CSRegs[I]; ++I) { unsigned Reg = CSRegs[I]; - if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) { - MRI.setPhysRegUsed(SystemZ::R15D); + if (SystemZ::GR64BitRegClass.contains(Reg) && SavedRegs.test(Reg)) { + SavedRegs.set(SystemZ::R15D); break; } } diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 60bad894ee44..5ade757f17f7 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -27,8 +27,8 @@ public: bool isFPCloseToIncomingSP() const override { return false; } const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 372f6fb3ea50..056ee02dcc21 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -81,10 +81,11 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { return Op; } -SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, +SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) - : TargetLowering(tm), Subtarget(STI) { - MVT PtrVT = getPointerTy(); + : TargetLowering(TM), Subtarget(STI) { + auto &DL = *TM.getDataLayout(); + MVT PtrVT = getPointerTy(DL); // Set up the register classes. 
if (Subtarget.hasHighWord()) @@ -455,7 +456,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, MaxStoresPerMemsetOptSize = 0; } -EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, + LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); @@ -507,8 +509,8 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return true; } -bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Punt on globals for now, although they can be used in limited // RELATIVE LONG cases. @@ -544,7 +546,7 @@ bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const { //===----------------------------------------------------------------------===// TargetLowering::ConstraintType -SystemZTargetLowering::getConstraintType(const std::string &Constraint) const { +SystemZTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'a': // Address register @@ -641,13 +643,14 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info, // has already been verified. MC is the class associated with "t" and // Map maps 0-based register numbers to LLVM register numbers. static std::pair<unsigned, const TargetRegisterClass *> -parseRegisterNumber(const std::string &Constraint, - const TargetRegisterClass *RC, const unsigned *Map) { +parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC, + const unsigned *Map) { assert(*(Constraint.end()-1) == '}' && "Missing '}'"); if (isdigit(Constraint[2])) { - std::string Suffix(Constraint.data() + 2, Constraint.size() - 2); - unsigned Index = atoi(Suffix.c_str()); - if (Index < 16 && Map[Index]) + unsigned Index; + bool Failed = + Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index); + if (!Failed && Index < 16 && Map[Index]) return std::make_pair(Map[Index], RC); } return std::make_pair(0U, nullptr); @@ -655,8 +658,7 @@ parseRegisterNumber(const std::string &Constraint, std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, const std::string &Constraint, - MVT VT) const { + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { @@ -687,7 +689,7 @@ SystemZTargetLowering::getRegForInlineAsmConstraint( return std::make_pair(0U, &SystemZ::FP32BitRegClass); } } - if (Constraint[0] == '{') { + if (Constraint.size() > 0 && Constraint[0] == '{') { // We need to override the default register parsing for GPRs and FPRs // because the interpretation depends on VT. The internal names of // the registers are also different from the external names @@ -931,7 +933,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, // Create the SelectionDAG nodes corresponding to a load // from this parameter. Unpromoted ints and floats are // passed as right-justified 8-byte values. 
- EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32) FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, @@ -969,7 +971,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) { unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]); int FI = MFI->CreateFixedObject(8, RegSaveOffset + Offset, true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I], &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); @@ -1019,7 +1021,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(MF.getDataLayout()); // Detect unsupported vector argument and return types. if (Subtarget.hasVector()) { @@ -2401,7 +2403,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); int64_t Offset = Node->getOffset(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RM = DAG.getTarget().getRelocationModel(); CodeModel::Model CM = DAG.getTarget().getCodeModel(); @@ -2440,7 +2442,7 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, unsigned Opcode, SDValue GOTOffset) const { SDLoc DL(Node); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDValue Glue; @@ -2486,7 +2488,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SelectionDAG &DAG) const { SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); // The high part of the thread pointer is in access register 0. @@ -2587,7 +2589,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node, SDLoc DL(Node); const BlockAddress *BA = Node->getBlockAddress(); int64_t Offset = Node->getOffset(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset); Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -2597,7 +2599,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node, SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const { SDLoc DL(JT); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); // Use LARL to load the address of the table. 
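The pattern repeated through these SystemZ (and the earlier Sparc) lowering hunks is mechanical: getPointerTy() loses its zero-argument form and takes the DataLayout, which the caller obtains from the SelectionDAG or MachineFunction. A minimal before/after sketch under that assumption (function and operands hypothetical, not from the patch):

    // Before: EVT PtrVT = getPointerTy();   // DataLayout was implicit.
    // After: the DataLayout is threaded explicitly by the caller.
    static SDValue lowerExample(SDValue Op, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
      SDLoc DL(Op);
      EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      // e.g. materialize Op + 8 in the target's pointer width:
      return DAG.getNode(ISD::ADD, DL, PtrVT, Op,
                         DAG.getConstant(8, DL, PtrVT));
    }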
@@ -2607,7 +2609,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT, SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const { SDLoc DL(CP); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (CP->isMachineConstantPoolEntry()) @@ -2671,7 +2673,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo<SystemZMachineFunctionInfo>(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 2f7617bbdac3..949b67f114ea 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -339,10 +339,10 @@ public: const SystemZSubtarget &STI); // Override TargetLowering. - MVT getScalarShiftAmountTy(EVT LHSTy) const override { + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } - MVT getVectorIdxTy() const override { + MVT getVectorIdxTy(const DataLayout &DL) const override { // Only the lower 12 bits of an element index are used, so we don't // want to clobber the upper 32 bits of a GPR unnecessarily. return MVT::i32; @@ -364,12 +364,13 @@ public: return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } - EVT getSetCCResultType(LLVMContext &, EVT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool isLegalICmpImmediate(int64_t Imm) const override; bool isLegalAddImmediate(int64_t Imm) const override; - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, @@ -379,10 +380,9 @@ public: const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; TargetLowering::ConstraintType - getConstraintType(const std::string &Constraint) const override; + getConstraintType(StringRef Constraint) const override; TargetLowering::ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; @@ -391,8 +391,7 @@ public: std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode.size() == 1) { switch(ConstraintCode[0]) { default: diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 7cabea962e91..dc7bd25d7ed5 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -36,7 +36,7 @@ SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF, BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = 
MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) { // R11D is the frame pointer. Reserve all aliases. @@ -64,7 +64,7 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineFunction &MF = *MBB.getParent(); auto *TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); DebugLoc DL = MI->getDebugLoc(); // Decompose the frame index into a base and offset. @@ -135,6 +135,6 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const SystemZFrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D; } diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index e7e0268dbb8a..178aa3817311 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -18,12 +18,6 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() { -} - // Decide whether it is best to use a loop or straight-line code for // a block operation of Size bytes with source address Src and destination // address Dest. Sequence is the opcode to use for straight-line code diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index a257d6b55494..246fa3e5e656 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -22,8 +22,7 @@ class SystemZTargetMachine; class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo { public: - explicit SystemZSelectionDAGInfo(const DataLayout &DL); - ~SystemZSelectionDAGInfo(); + explicit SystemZSelectionDAGInfo() = default; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain, SDValue Dst, SDValue Src, diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb5e5c0b9ff8..0b49fcdd8f78 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -42,7 +42,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasTransactionalExecution(false), HasProcessorAssist(false), HasVector(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - TSInfo(*TM.getDataLayout()), FrameLowering() {} + TSInfo(), FrameLowering() {} // Return true if GV binds locally under reloc model RM. 
static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) { diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index e9cabe968eea..4b80973ed879 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -29,7 +29,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. SystemZTTIImpl(const SystemZTTIImpl &Arg) @@ -37,18 +38,6 @@ public: SystemZTTIImpl(SystemZTTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - SystemZTTIImpl &operator=(const SystemZTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - SystemZTTIImpl &operator=(SystemZTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 0b05303f71bf..83174c20c8e9 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -150,8 +150,9 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &) { return TargetTransformInfo(getDataLayout()); }); + return TargetIRAnalysis([this](Function &F) { + return TargetTransformInfo(F.getParent()->getDataLayout()); + }); } static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index 87df7af84525..6a61fcdf0f86 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -19,7 +19,14 @@ using namespace llvm; //--------------------------------------------------------------------------- // TargetSubtargetInfo Class // -TargetSubtargetInfo::TargetSubtargetInfo() {} +TargetSubtargetInfo::TargetSubtargetInfo( + const Triple &TT, StringRef CPU, StringRef FS, + ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD, + const SubtargetInfoKV *ProcSched, const MCWriteProcResEntry *WPR, + const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA, + const InstrStage *IS, const unsigned *OC, const unsigned *FP) + : MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) { +} TargetSubtargetInfo::~TargetSubtargetInfo() {} diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index df04c2a3460b..25de9eee0831 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_TARGET_DEFINITIONS WebAssembly.td) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index d248556c62d7..224aa773a80e 100644 --- 
a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -29,6 +29,9 @@ using namespace llvm; #define GET_SUBTARGETINFO_MC_DESC #include "WebAssemblyGenSubtargetInfo.inc" +#define GET_REGINFO_MC_DESC +#include "WebAssemblyGenRegisterInfo.inc" + static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT) { MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 24893daec7ea..eebf5b72f62b 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -47,6 +47,9 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, // Defines symbolic names for WebAssembly registers. This defines a mapping from // register name to register number. // +#define GET_REGINFO_ENUM +#include "WebAssemblyGenRegisterInfo.inc" + #define GET_SUBTARGETINFO_ENUM #include "WebAssemblyGenSubtargetInfo.inc" diff --git a/lib/Target/WebAssembly/Makefile b/lib/Target/WebAssembly/Makefile index 35d835c6506c..f102d73f6e86 100644 --- a/lib/Target/WebAssembly/Makefile +++ b/lib/Target/WebAssembly/Makefile @@ -12,7 +12,8 @@ LIBRARYNAME = LLVMWebAssemblyCodeGen TARGET = WebAssembly # Make sure that tblgen is run, first thing. -BUILT_SOURCES = WebAssemblyGenSubtargetInfo.inc WebAssemblyGenMCCodeEmitter.inc +BUILT_SOURCES = WebAssemblyGenRegisterInfo.inc WebAssemblyGenSubtargetInfo.inc \ + WebAssemblyGenMCCodeEmitter.inc DIRS = InstPrinter TargetInfo MCTargetDesc diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index 7a71060a638f..63e02c455895 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -12,4 +12,15 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +Interesting work that remains to be done: +* Write a pass to restructurize irreducible control flow. This needs to be done + before register allocation to be efficient, because it may duplicate basic + blocks and WebAssembly performs register allocation at a whole-function + level. Note that LLVM's GPU code has such a pass, but it linearizes control + flow (e.g. both sides of branches execute and are masked) which is undesirable + for WebAssembly. +* Basic relooper to expose control flow as an AST. +* Figure out how to properly use MC for virtual ISAs. This may require some + refactoring of MC. + //===---------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4eec02efbd94..4184eb6dc5a6 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -38,6 +38,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // WebAssembly does not produce floating-point exceptions on normal floating // point operations. setHasFloatingPointExceptions(false); + // We don't know the microarchitecture here, so just reduce register pressure. 
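// (Editor's note, illustrative: Sched::RegPressure is one of the generic
// Sched::Preference values, alongside e.g. Sched::Source and Sched::ILP;
// it biases pre-RA scheduling toward shorter live ranges instead of
// latency hiding, a reasonable default for a virtual ISA with no
// published pipeline model.)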
+ setSchedulingPreference(Sched::RegPressure); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 35e88eec8573..64415658ed81 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly Atomic operand code-gen constructs. -// +/// +/// \file +/// \brief WebAssembly Atomic operand code-gen constructs. +/// //===----------------------------------------------------------------------===// // TODO: Implement atomic instructions. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td new file mode 100644 index 000000000000..6b5b6cd54173 --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -0,0 +1,21 @@ +//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Call operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. + * + * call_direct: call function directly + * call_indirect: call function indirectly + * addressof: obtain a function pointer value for a given function + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td new file mode 100644 index 000000000000..3fa29061b1de --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -0,0 +1,44 @@ +//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-= +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly datatype conversions, truncations, reinterpretations, +/// promotions, and demotions operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. 
+ * + * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer + * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer + * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer + * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer + * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer + * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer + * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer + * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer + * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer + * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer + * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer + * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer + * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer + * float32.demote[float64]: demote a 64-bit float to a 32-bit float + * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float + * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float + * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float + * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float + * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float + * float64.promote[float32]: promote a 32-bit float to a 64-bit float + * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float + * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float + * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float + * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float + * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td new file mode 100644 index 000000000000..30ef6339d65a --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -0,0 +1,44 @@ +// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Floating-point operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +defm FADD : BinaryFP<fadd>; +defm FSUB : BinaryFP<fsub>; +defm FMUL : BinaryFP<fmul>; +defm FDIV : BinaryFP<fdiv>; +defm FABS : UnaryFP<fabs>; +defm FNEG : UnaryFP<fneg>; +defm COPYSIGN : BinaryFP<fcopysign>; +defm CEIL : UnaryFP<fceil>; +defm FLOOR : UnaryFP<ffloor>; +defm TRUNC : UnaryFP<ftrunc>; +defm NEARESTINT : UnaryFP<fnearbyint>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. + * + * float32.eq: compare equal + * float32.lt: less than + * float32.le: less than or equal + * float32.gt: greater than + * float32.ge: greater than or equal + */ + +defm SQRT : UnaryFP<fsqrt>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. 
+ * + * float32.min: minimum (binary operator); if either operand is NaN, returns NaN + * float32.max: maximum (binary operator); if either operand is NaN, returns NaN + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 8bbf3e9ec87b..513c36fa2ec2 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly instruction format definitions. -// +/// +/// \file +/// \brief WebAssembly instruction format definitions. +/// //===----------------------------------------------------------------------===// // WebAssembly Instruction Format @@ -26,3 +27,29 @@ class I<dag oops, dag iops, list<dag> pattern, string cstr = ""> dag InOperandList = iops; let Pattern = pattern; } + +// Unary and binary instructions, for the local types that WebAssembly supports. +multiclass UnaryInt<SDNode node> { + def _I32 : I<(outs Int32:$dst), (ins Int32:$src), + [(set Int32:$dst, (node Int32:$src))]>; + def _I64 : I<(outs Int64:$dst), (ins Int64:$src), + [(set Int64:$dst, (node Int64:$src))]>; +} +multiclass BinaryInt<SDNode node> { + def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs), + [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>; + def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs), + [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>; +} +multiclass UnaryFP<SDNode node> { + def _F32 : I<(outs Float32:$dst), (ins Float32:$src), + [(set Float32:$dst, (node Float32:$src))]>; + def _F64 : I<(outs Float64:$dst), (ins Float64:$src), + [(set Float64:$dst, (node Float64:$src))]>; +} +multiclass BinaryFP<SDNode node> { + def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs), + [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>; + def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs), + [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>; +} diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 142eccfbcaa5..fe3ca76dc08a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly Instruction definitions. -// +/// +/// \file +/// \brief WebAssembly Instruction definitions. +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -32,6 +33,13 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// +/* + * TODO(jfb): Add the following. + * + * get_local: read the current value of a local variable + * set_local: set the current value of a local variable +*/ + //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. //===----------------------------------------------------------------------===// @@ -42,5 +50,10 @@ include "WebAssemblyInstrFormats.td" // Additional sets of instructions. 
//===----------------------------------------------------------------------===// +include "WebAssemblyInstrMemory.td" +include "WebAssemblyInstrCall.td" +include "WebAssemblyInstrInteger.td" +include "WebAssemblyInstrFloat.td" +include "WebAssemblyInstrConv.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td new file mode 100644 index 000000000000..5f60fe81b1a2 --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -0,0 +1,45 @@ +// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Integer operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +defm ADD : BinaryInt<add>; +defm SUB : BinaryInt<sub>; +defm MUL : BinaryInt<mul>; +defm SDIV : BinaryInt<sdiv>; +defm UDIV : BinaryInt<udiv>; +defm SREM : BinaryInt<srem>; +defm UREM : BinaryInt<urem>; +defm AND : BinaryInt<and>; +defm IOR : BinaryInt<or>; +defm XOR : BinaryInt<xor>; +defm SHL : BinaryInt<shl>; +defm SHR : BinaryInt<srl>; +defm SAR : BinaryInt<sra>; + +/* + * TODO(jfb): Add the following for 32-bit and 64-bit. + * + * int32.eq: signed-less compare equal + * int32.slt: signed less than + * int32.sle: signed less than or equal + * int32.ult: unsigned less than + * int32.ule: unsigned less than or equal + * int32.sgt: signed greater than + * int32.sge: signed greater than or equal + * int32.ugt: unsigned greater than + * int32.uge: unsigned greater than or equal + */ + +defm CLZ : UnaryInt<ctlz>; +defm CTZ : UnaryInt<cttz>; +defm POPCNT : UnaryInt<ctpop>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td new file mode 100644 index 000000000000..5ab40e826caa --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -0,0 +1,46 @@ +// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly Memory operand code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +/* + * TODO(jfb): Add the following. + * Each has optional alignment and immediate byte offset. 
+ * + * int32.load_sx[int8]: sign-extend to int32 + * int32.load_sx[int16]: sign-extend to int32 + * int32.load_zx[int8]: zero-extend to int32 + * int32.load_zx[int16]: zero-extend to int32 + * int32.load[int32]: (no conversion) + * int64.load_sx[int8]: sign-extend to int64 + * int64.load_sx[int16]: sign-extend to int64 + * int64.load_sx[int32]: sign-extend to int64 + * int64.load_zx[int8]: zero-extend to int64 + * int64.load_zx[int16]: zero-extend to int64 + * int64.load_zx[int32]: zero-extend to int64 + * int64.load[int64]: (no conversion) + * float32.load[float32]: (no conversion) + * float64.load[float64]: (no conversion) + * + * int32.store[int8]: wrap int32 to int8 + * int32.store[int16]: wrap int32 to int16 + * int32.store[int32]: (no conversion) + * int64.store[int8]: wrap int64 to int8 + * int64.store[int16]: wrap int64 to int16 + * int64.store[int32]: wrap int64 to int32 + * int64.store[int64]: (no conversion) + * float32.store[float32]: (no conversion) + * float64.store[float64]: (no conversion) + * + * load_global: load the value of a given global variable + * store_global: store a given value to a given global variable + */ diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index e25483ad3f7a..3e29906219d2 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -6,9 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// WebAssembly SIMD operand code-gen constructs. -// +/// +/// \file +/// \brief WebAssembly SIMD operand code-gen constructs. +/// //===----------------------------------------------------------------------===// // TODO: Implement SIMD instructions. 
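The load flavors listed in the WebAssemblyInstrMemory.td TODO above map one-to-one onto SelectionDAG's load extension kinds, which is presumably how they would eventually be pattern-matched. A minimal C++ sketch of that correspondence; the helper name is illustrative, not part of this patch:

#include "llvm/CodeGen/ISDOpcodes.h"

// Classify a WebAssembly-style load by how it widens its result:
// int32.load[int32] and friends are plain loads, while the _sx/_zx
// variants are sign- and zero-extending loads respectively.
static llvm::ISD::LoadExtType classifyLoad(bool Extending, bool SignExtending) {
  if (!Extending)
    return llvm::ISD::NON_EXTLOAD;            // e.g. int32.load[int32]
  return SignExtending ? llvm::ISD::SEXTLOAD  // e.g. int32.load_sx[int8]
                       : llvm::ISD::ZEXTLOAD; // e.g. int32.load_zx[int8]
}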
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index ad24c90af6a2..385c40bf6693 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -30,4 +30,58 @@ using namespace llvm; #define DEBUG_TYPE "wasm-reg-info" -WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT) : TT(TT) {} +#define GET_REGINFO_TARGET_DESC +#include "WebAssemblyGenRegisterInfo.inc" + +WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT) + : WebAssemblyGenRegisterInfo(0), TT(TT) {} + +const MCPhysReg * +WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { + static const MCPhysReg CalleeSavedRegs[] = {0}; + return CalleeSavedRegs; +} + +BitVector +WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32, + WebAssembly::FP64}) + Reserved.set(Reg); + return Reserved; +} + +void WebAssemblyRegisterInfo::eliminateFrameIndex( + MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME +} + +unsigned +WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + static const unsigned Regs[2][2] = { + /* !isArch64Bit isArch64Bit */ + /* !hasFP */ {WebAssembly::SP32, WebAssembly::SP64}, + /* hasFP */ {WebAssembly::FP32, WebAssembly::FP64}}; + const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); + return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; +} + +bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const { + return !MF.getFunction()->hasFnAttribute("no-realign-stack"); +} + +// FIXME: share this with other backends with identical implementation? +bool WebAssemblyRegisterInfo::needsStackRealignment( + const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); + const Function *F = MF.getFunction(); + unsigned StackAlign = TFI->getStackAlignment(); + bool requiresRealignment = + ((MFI->getMaxAlignment() > StackAlign) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackAlignment)); + + return requiresRealignment && canRealignStack(MF); +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index 55300287a51e..dbdb9d0457af 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -16,6 +16,9 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H +#define GET_REGINFO_HEADER +#include "WebAssemblyGenRegisterInfo.inc" + namespace llvm { class MachineFunction; @@ -23,11 +26,25 @@ class RegScavenger; class TargetRegisterClass; class Triple; -class WebAssemblyRegisterInfo final { +class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo { const Triple &TT; public: explicit WebAssemblyRegisterInfo(const Triple &TT); + + // Code Generation virtual methods. 
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + BitVector getReservedRegs(const MachineFunction &MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + // Base pointer (stack realignment) support. + bool canRealignStack(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 7b3d636a2605..2ba42eb94a40 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the WebAssembly register classes and some nominal -// physical registers. -// +/// +/// \file +/// \brief This file describes the WebAssembly register classes and some nominal +/// physical registers. +/// //===----------------------------------------------------------------------===// class WebAssemblyReg<string n> : Register<n> { @@ -23,6 +24,31 @@ class WebAssemblyRegClass<list<ValueType> regTypes, int alignment, dag regList> // Registers //===----------------------------------------------------------------------===// +// Special registers used as the frame and stack pointer. +// +// WebAssembly may someday support mixed 32-bit and 64-bit heaps in the same +// application, which requires separate-width FP and SP. +def FP32 : WebAssemblyReg<"%FP32">; +def FP64 : WebAssemblyReg<"%FP64">; +def SP32 : WebAssemblyReg<"%SP32">; +def SP64 : WebAssemblyReg<"%SP64">; + +// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do +// away with it? Try deleting once the backend works. +// WebAssembly uses virtual registers, but the backend defines a few physical +// registers here to keep SDAG and the MachineInstr layers happy.
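+// (The foreach below expands to twenty defs: I0-I4 for i32, L0-L4 for i64,
+// F0-F4 for f32, and D0-D4 for f64.)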
+foreach i = 0-4 in { + def I#i : WebAssemblyReg<"%i."#i>; // i32 + def L#i : WebAssemblyReg<"%l."#i>; // i64 + def F#i : WebAssemblyReg<"%f."#i>; // f32 + def D#i : WebAssemblyReg<"%d."#i>; // f64 +} + //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// + +def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>; +def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>; +def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; +def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>; diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index cfd1bafff236..fae9c6100510 100644 --- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -17,7 +17,4 @@ using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" -WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo(const DataLayout *DL) - : TargetSelectionDAGInfo(DL) {} - WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {} diff --git a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 03e8d393558d..13d96671276d 100644 --- a/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -22,7 +22,6 @@ namespace llvm { class WebAssemblySelectionDAGInfo final : public TargetSelectionDAGInfo { public: - explicit WebAssemblySelectionDAGInfo(const DataLayout *DL); ~WebAssemblySelectionDAGInfo() override; }; diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index addea8e3cc36..3d9e7aacbfbf 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -42,7 +42,7 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, const TargetMachine &TM) : WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false), CPUString(CPU), TargetTriple(TT), FrameLowering(), - InstrInfo(initializeSubtargetDependencies(FS)), - TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} + InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), + TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 08bd88c06985..7ffb6047b963 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -31,7 +31,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { typedef TargetTransformInfo TTI; friend BaseT; - const WebAssemblyTargetMachine *TM; const WebAssemblySubtarget *ST; const WebAssemblyTargetLowering *TLI; @@ -40,30 +39,15 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { public: WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F) - : BaseT(TM), TM(TM), ST(TM->getSubtargetImpl(F)), + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
WebAssemblyTTIImpl(const WebAssemblyTTIImpl &Arg) - : BaseT(static_cast<const BaseT &>(Arg)), TM(Arg.TM), ST(Arg.ST), - TLI(Arg.TLI) {} + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} WebAssemblyTTIImpl(WebAssemblyTTIImpl &&Arg) - : BaseT(std::move(static_cast<BaseT &>(Arg))), TM(std::move(Arg.TM)), - ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - WebAssemblyTTIImpl &operator=(const WebAssemblyTTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - TM = RHS.TM; - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - WebAssemblyTTIImpl &operator=(WebAssemblyTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - TM = std::move(RHS.TM); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 3cad9fa1e2ae..91b144a44824 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -878,6 +878,29 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + case X86::PMOVZXBWrr: case X86::PMOVZXBDrr: case X86::PMOVZXBQrr: diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3e0dc1424609..629802f5dc5e 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -220,7 +220,6 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::PUSH32i8: return X86::PUSHi32; case X86::PUSH16i8: return X86::PUSHi16; case X86::PUSH64i8: return X86::PUSH64i32; - case X86::PUSH64i16: return X86::PUSH64i32; } } diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 89f394582631..ddb764facdbf 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -34,14 +34,16 @@ public: report_fatal_error(EC.message()); StringRef SymName = *SymNameOrErr; - uint64_t SymAddr; SymI->getAddress(SymAddr); + ErrorOr<uint64_t> SymAddr = SymI->getAddress(); + if (std::error_code EC = SymAddr.getError()) + report_fatal_error(EC.message()); uint64_t SymSize = SymI->getSize(); int64_t Addend = *ELFRelocationRef(Rel).getAddend(); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. if (!Sym->isVariable()) - Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx)); + Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx)); const MCExpr *Expr = nullptr; // If hasAddend is true, then we need to add Addend (r_addend) to Expr. 
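The X86ELFRelocationInfo.cpp hunk above switches SymbolRef::getAddress() from an out-parameter to ErrorOr. A minimal sketch of the consumption idiom it relies on, using a stand-in accessor rather than the real SymbolRef API:

#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include <cstdint>
#include <system_error>

// Stand-in for an ErrorOr-returning accessor such as SymbolRef::getAddress().
static llvm::ErrorOr<uint64_t> getAddressOrError() { return UINT64_C(0x1000); }

static uint64_t getAddressOrDie() {
  llvm::ErrorOr<uint64_t> AddrOrErr = getAddressOrError();
  if (std::error_code EC = AddrOrErr.getError())
    llvm::report_fatal_error(EC.message()); // no usable value on error
  return *AddrOrErr; // operator* yields the wrapped value
}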
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 431010d4cbc2..83b4091d7665 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -88,9 +88,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (CPUName.empty()) CPUName = "generic"; - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS); - return X; + return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS); } static MCInstrInfo *createX86MCInstrInfo() { @@ -99,17 +97,14 @@ static MCInstrInfo *createX86MCInstrInfo() { return X; } -static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { - Triple TheTriple(TT); - unsigned RA = (TheTriple.getArch() == Triple::x86_64) - ? X86::RIP // Should have dwarf #16. - : X86::EIP; // Should have dwarf #8. +static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { + unsigned RA = (TT.getArch() == Triple::x86_64) + ? X86::RIP // Should have dwarf #16. + : X86::EIP; // Should have dwarf #8. MCRegisterInfo *X = new MCRegisterInfo(); - InitX86MCRegisterInfo(X, RA, - X86_MC::getDwarfRegFlavour(TheTriple, false), - X86_MC::getDwarfRegFlavour(TheTriple, true), - RA); + InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), RA); X86_MC::InitLLVM2SEHRegisterMapping(X); return X; } @@ -156,24 +151,23 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - Triple T(TT); - bool is64Bit = T.getArch() == Triple::x86_64; + bool is64Bit = TT.getArch() == Triple::x86_64; if (RM == Reloc::Default) { // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we // use static relocation model by default. - if (T.isOSDarwin()) { + if (TT.isOSDarwin()) { if (is64Bit) RM = Reloc::PIC_; else RM = Reloc::DynamicNoPIC; - } else if (T.isOSWindows() && is64Bit) + } else if (TT.isOSWindows() && is64Bit) RM = Reloc::PIC_; else RM = Reloc::Static; @@ -186,13 +180,13 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, if (RM == Reloc::DynamicNoPIC) { if (is64Bit) RM = Reloc::PIC_; - else if (!T.isOSDarwin()) + else if (!TT.isOSDarwin()) RM = Reloc::Static; } // If we are on Darwin, disallow static relocation model in X86-64 mode, since // the Mach-O file format doesn't support it. - if (RM == Reloc::Static && T.isOSDarwin() && is64Bit) + if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit) RM = Reloc::PIC_; // For static codegen, if we're not already set, use Small codegen. 
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index c9479b62f7b6..9bfe999424fa 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -34,7 +34,7 @@ public: if (std::error_code EC = SymNameOrErr.getError()) report_fatal_error(EC.message()); StringRef SymName = *SymNameOrErr; - uint64_t SymAddr; SymI->getAddress(SymAddr); + uint64_t SymAddr = SymI->getValue(); any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); bool isPCRel = Obj->getAnyRelocationPCRel(RE); @@ -90,8 +90,7 @@ public: const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx); symbol_iterator RSymI = Rel.getSymbol(); - uint64_t RSymAddr; - RSymI->getAddress(RSymAddr); + uint64_t RSymAddr = RSymI->getValue(); ErrorOr<StringRef> RSymName = RSymI->getName(); if (std::error_code EC = RSymName.getError()) report_fatal_error(EC.message()); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index ef3318ba7580..cae865a40819 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -255,15 +255,13 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { - if (Imm & 0x88) - return; // Not a shuffle unsigned HalfSize = VT.getVectorNumElements() / 2; for (unsigned l = 0; l != 2; ++l) { - unsigned HalfBegin = ((Imm >> (l * 4)) & 0x3) * HalfSize; + unsigned HalfMask = Imm >> (l * 4); + unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) - ShuffleMask.push_back(i); + ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i); } } @@ -431,4 +429,78 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64 bits. The upper 64 bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64.
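+  // (Worked example with hypothetical immediates: Len = 16 bits and Idx = 8
+  // bits decode to the v16i8 mask [0, 16, 17, 3, 4, 5, 6, 7, u, u, u, u, u,
+  // u, u, u]; two bytes of the second source land at byte offset 1 of the
+  // first.)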
+ if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64 bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 14b69434806e..3d10d18e860e 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,14 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A INSERTQ instruction as a v16i8 shuffle mask. +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 02645460b6a2..b4319c8bb04f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -317,7 +317,7 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { - EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); + EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; @@ -608,7 +608,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Prepare for inserting code in the local-value area. SavePoint SaveInsertPt = enterLocalValueArea(); - if (TLI.getPointerTy() == MVT::i64) { + if (TLI.getPointerTy(DL) == MVT::i64) { Opc = X86::MOV64rm; RC = &X86::GR64RegClass; @@ -690,13 +690,14 @@ redo_gep: case Instruction::IntToPtr: // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectAddress(U->getOperand(0), AM); break; @@ -866,14 +867,14 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && - TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB.
- if (InMBB && - TLI.getValueType(U->getType()) == TLI.getPointerTy()) + if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return X86SelectCallAddress(U->getOperand(0), AM); break; } @@ -1000,7 +1001,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -1031,7 +1032,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - EVT SrcVT = TLI.getValueType(RV->getType()); + EVT SrcVT = TLI.getValueType(DL, RV->getType()); EVT DstVT = VA.getValVT(); // Special handling for extended integers. if (SrcVT != DstVT) { @@ -1300,7 +1301,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } bool X86FastISel::X86SelectZExt(const Instruction *I) { - EVT DstVT = TLI.getValueType(I->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; @@ -1309,7 +1310,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { return false; // Handle zero-extension from i1 to i8, which is common. - MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); + MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT.SimpleTy == MVT::i1) { // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); @@ -1362,7 +1363,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { X86::CondCode CC; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && CI->getParent() == I->getParent()) { - EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType()); // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); @@ -1802,7 +1803,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; @@ -2004,7 +2005,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { @@ -2166,8 +2167,8 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { } bool X86FastISel::X86SelectTrunc(const Instruction *I) { - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(I->getType()); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) @@ -2416,7 +2417,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. - EVT PtrTy = TLI.getPointerTy(); + EVT PtrTy = TLI.getPointerTy(DL); const Value *Op1 = II->getArgOperand(0); // The guard's value. 
const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); @@ -2735,7 +2736,7 @@ bool X86FastISel::fastLowerArguments() { if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; - EVT ArgVT = TLI.getValueType(ArgTy); + EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { default: return false; @@ -2772,7 +2773,7 @@ bool X86FastISel::fastLowerArguments() { unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { - MVT VT = TLI.getSimpleValueType(Arg.getType()); + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned SrcReg; switch (VT.SimpleTy) { @@ -3108,7 +3109,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - (GV->isDeclaration() || GV->isWeakForLinker()) && + !GV->isStrongDefinitionForLinker() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, @@ -3240,8 +3241,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectSIToFP(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(I->getType()); + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); if (DstVT.bitsGT(SrcVT)) return X86SelectZExt(I); if (DstVT.bitsLT(SrcVT)) @@ -3384,7 +3385,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - TM.getDataLayout()->getPointerSize(), Align); + DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } @@ -3411,17 +3412,17 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (TM.getRelocationModel() == Reloc::Static && - TLI.getPointerTy() == MVT::i64) { + TLI.getPointerTy(DL) == MVT::i64) { // The displacement code could be more than 32 bits away so we need to use // an instruction with a 64 bit immediate BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), ResultReg) .addGlobalAddress(GV); } else { - unsigned Opc = TLI.getPointerTy() == MVT::i32 - ? (Subtarget->isTarget64BitILP32() - ? X86::LEA64_32r : X86::LEA32r) - : X86::LEA64r; + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } @@ -3431,7 +3432,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { } unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) @@ -3463,11 +3464,11 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = TLI.getPointerTy() == MVT::i32 - ? (Subtarget->isTarget64BitILP32() - ? 
X86::LEA64_32r : X86::LEA32r) - : X86::LEA64r; - const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a863a3..36a8cdbab55b 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -301,8 +301,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 85c5b6499131..2a35c4cf31f3 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -90,7 +90,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || + MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint()); @@ -967,13 +967,26 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { - // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + // Stash value of base pointer. Saving RSP instead of EBP shortens + // dependence chain. Used by SjLj EH. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } + + if (X86FI->getHasSEHFramePtrSave()) { + // Stash the value of the frame pointer relative to the base pointer for + // Win32 EH. This supports Win32 EH, which does the inverse of the above: + // it recovers the frame pointer from the base pointer rather than the + // other way around. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, + getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + .addReg(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { @@ -1412,9 +1425,11 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } -void -X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1436,7 +1451,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Spill the BasePtr if it's used. if (TRI->hasBasePointer(MF)) - MF.getRegInfo().setPhysRegUsed(TRI->getBaseRegister()); + SavedRegs.set(TRI->getBaseRegister()); } static bool @@ -1667,8 +1682,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index c274c8820149..495cfcd1c3f7 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -68,8 +68,8 @@ public: void adjustForHiPEPrologue(MachineFunction &MF, MachineBasicBlock &PrologueMBB) const override; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = nullptr) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; bool assignCalleeSavedSpillSlots(MachineFunction &MF, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 6b23e62a2d35..d5351d25d6ed 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -246,8 +246,9 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) - ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, - TLI->getPointerTy()) + ? 
CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, + TLI->getPointerTy(CurDAG->getDataLayout())) : AM.Base_Reg; Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; @@ -581,11 +582,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { void X86DAGToDAGISel::EmitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; + auto &DL = CurDAG->getDataLayout(); TargetLowering::CallLoweringInfo CLI(*CurDAG); CLI.setChain(CurDAG->getRoot()) .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), - CurDAG->getExternalSymbol("__main", TLI->getPointerTy()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), std::move(Args), 0); const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); @@ -1025,7 +1027,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, switch (N.getOpcode()) { default: break; - case ISD::FRAME_ALLOC_RECOVER: { + case ISD::LOCAL_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { // Use the symbol and don't prefix it. @@ -1638,7 +1640,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); + auto &DL = MF->getDataLayout(); + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } /// Atomic opcode table diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b16bd18aefaa..6e22ab30057c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -76,7 +76,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = getDataLayout(); + TD = TM.getDataLayout(); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -505,7 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -825,6 +825,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::SMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); @@ -944,6 +949,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1018,6 +1032,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } @@ -1141,6 +1156,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v32i8, Legal); + setOperationAction(ISD::SMAX, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v8i32, Legal); + setOperationAction(ISD::UMAX, MVT::v32i8, Legal); + setOperationAction(ISD::UMAX, MVT::v16i16, Legal); + setOperationAction(ISD::UMAX, MVT::v8i32, Legal); + setOperationAction(ISD::SMIN, MVT::v32i8, Legal); + setOperationAction(ISD::SMIN, MVT::v16i16, Legal); + setOperationAction(ISD::SMIN, MVT::v8i32, Legal); + setOperationAction(ISD::UMIN, MVT::v32i8, Legal); + setOperationAction(ISD::UMIN, MVT::v16i16, Legal); + setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); @@ -1184,6 +1212,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. 
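+  // (Marking ISD::SMAX/SMIN/UMAX/UMIN Legal above lets min/max-shaped
+  // selects, e.g. "select (setgt a, b), a, b", select directly to the
+  // pmaxs*/pmins* and pmaxu*/pminu* instruction families rather than a
+  // compare-plus-blend sequence.)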
@@ -1376,6 +1405,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
+
setOperationAction(ISD::ADD, MVT::v8i64, Legal);
setOperationAction(ISD::ADD, MVT::v16i32, Legal);
@@ -1473,6 +1511,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
+ setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
@@ -1492,6 +1532,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
+ setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
+ setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
@@ -1531,6 +1580,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::v4i32, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Custom);
setOperationAction(ISD::SRA, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i64, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Legal);
}
// We want to custom lower some of our intrinsics.
@@ -1611,6 +1669,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
@@ -1652,7 +1711,8 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
-EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1724,10 +1784,11 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
@@ -1840,7 +1901,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
if (!Subtarget->is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
- return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
+ return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()));
return Table;
}
@@ -2032,7 +2094,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
- SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg,
+ getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2041,7 +2104,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
- RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+ RetOps.push_back(
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
RetOps[0] = Chain; // Update chain.
@@ -2288,11 +2352,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
- return DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
false, false, false, 0);
@@ -2471,7 +2535,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (Ins[i].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
- MVT PtrTy = getPointerTy();
+ MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
@@ -2499,7 +2563,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MachineModuleInfo &MMI = MF.getMMI();
const Function *WinEHParent = nullptr;
- if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+ if (MMI.hasWinEHFuncInfo(Fn))
WinEHParent = MMI.getWinEHParent(Fn);
bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
@@ -2561,11 +2625,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset, dl));
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
@@ -2592,7 +2656,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- } else if (IsWinEHOutlined) {
+ } else if (IsWin64 && IsWinEHOutlined) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2605,8 +2669,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Store the second integer parameter (rdx) into rsp+16 relative to the
// stack pointer at the entry of the function.
- SDValue RSFIN =
- DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy(DAG.getDataLayout()));
unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
Chain = DAG.getStore(
@@ -2680,14 +2744,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setArgumentStackSize(StackSize);
if (IsWinEHParent) {
- int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
- SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
- MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
- SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
- Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
- MachinePointerInfo::getFixedStack(UnwindHelpFI),
- /*isVolatile=*/true,
- /*isNonTemporal=*/false, /*Alignment=*/0);
+ if (Is64Bit) {
+ int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+ MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+ SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
+ Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+ MachinePointerInfo::getFixedStack(UnwindHelpFI),
+ /*isVolatile=*/true,
+ /*isNonTemporal=*/false, /*Alignment=*/0);
+ } else {
+ // Functions using Win32 EH are considered to have opaque SP adjustments
+ // to force local variables to be addressed from the frame or base
+ // pointers.
+ MFI->setHasOpaqueSPAdjustment(true);
+ }
}
return Chain;
@@ -2701,7 +2772,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
- PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
@@ -2718,7 +2790,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const {
// Adjust the Return address stack slot.
- EVT VT = getPointerTy();
+ EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
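The pattern repeated through the hunks above and below is mechanical: getPointerTy() lost its zero-argument form and now takes the module's DataLayout, so every call site threads one through from the SelectionDAG or MachineFunction. A minimal sketch of the call-site change, assuming the post-patch overload getPointerTy(const DataLayout &); the three lines below simply restate the idiom the hunks themselves use:

  // Before: MVT PtrVT = getPointerTy();
  // After (sketch): fetch the layout from whatever is at hand.
  const DataLayout &Layout = DAG.getDataLayout();  // or MF.getDataLayout()
  MVT PtrVT = getPointerTy(Layout);                // e.g. MVT::i64 on x86-64
  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);      // as in the hunks above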
@@ -2942,7 +3014,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
@@ -2955,8 +3027,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
+ RegsToPass.push_back(std::make_pair(
+ unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -3036,16 +3109,16 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- FIN = DAG.getFrameIndex(FI, getPointerTy());
+ FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
- StackPtr = DAG.getCopyFromReg(Chain, dl,
- RegInfo->getStackRegister(),
- getPointerTy());
- Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy(DAG.getDataLayout()));
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+ StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
@@ -3064,8 +3137,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
- getPointerTy(), RegInfo->getSlotSize(),
- FPDiff, dl);
+ getPointerTy(DAG.getDataLayout()),
+ RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3106,7 +3179,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
- (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ !GV->isStrongDefinitionForLinker() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
@@ -3123,17 +3196,18 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ExtraLoad = true;
}
- Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
- G->getOffset(), OpFlags);
+ Callee = DAG.getTargetGlobalAddress(
+ GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
// Add a wrapper if needed.
if (WrapperKind != ISD::DELETED_NODE)
- Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+ Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+ getPointerTy(DAG.getDataLayout()), Callee);
// Add extra indirection if needed.
if (ExtraLoad)
- Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
- MachinePointerInfo::getGOT(),
- false, false, false, 0);
+ Callee = DAG.getLoad(
+ getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned char OpFlags = 0;
@@ -3152,8 +3226,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
}
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
- OpFlags);
+ Callee = DAG.getTargetExternalSymbol(
+ S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
} else if (Subtarget->isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
@@ -3184,9 +3258,24 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
+
+ // If this is an invoke in a 32-bit function using an MSVC personality, assume
+ // the function clobbers all registers. If an exception is thrown, the runtime
+ // will not restore CSRs.
+ // FIXME: Model this more precisely so that we can register allocate across
+ // the normal edge and spill and fill across the exceptional edge.
+ if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+ const Function *CallerFn = MF.getFunction();
+ EHPersonality Pers =
+ CallerFn->hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ : EHPersonality::Unknown;
+ if (isMSVCEHPersonality(Pers))
+ Mask = RegInfo->getNoPreservedMask();
+ }
+
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
@@ -3650,7 +3739,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
FuncInfo->setRAIndex(ReturnAddrIndex);
}
- return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
@@ -3881,6 +3970,15 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget->hasLZCNT();
}
+/// isUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+ if (0 <= Mask[i])
+ return false;
+ return true;
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -4322,6 +4420,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// IsUnary to true if only uses one source. Note that this will set IsUnary for
/// shuffles which use a single input multiple times, and in those cases it will
/// adjust the mask to only have indices within that single input.
+/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero.
static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
@@ -4451,6 +4550,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
if (Mask.empty()) return false;
+ // Mask only contains negative index if an element is zero.
+ if (std::any_of(Mask.begin(), Mask.end(),
+ [](int M){ return M == SM_SentinelZero; }))
+ return false;
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
@@ -4764,7 +4867,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
- MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
@@ -5082,7 +5185,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CP =
+ DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(),
@@ -6857,6 +6961,136 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
return SDValue();
}
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len >= 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // All mask elements must be in the lower half.
+ if (M > HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0)
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+ /* EMPTY */
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+ Base = V1;
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
+ Base = V2;
+ } else {
+ continue;
+ }
+
+ // We may not have a base (first source) - this can safely be undefined.
+ if (!Base)
+ Base = DAG.getUNDEF(VT);
+
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ }
+ }
+
+ return SDValue();
+ };
+
+ if (SDValue InsertQ = LowerAsInsertQ())
+ return InsertQ;
+
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
@@ -6864,7 +7098,7 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();
@@ -6901,6 +7135,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
}
+ // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+ // to 64-bits.
+ if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
+ assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+ assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+
+ SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(0, DL, MVT::i8)));
+ if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+ return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
+
+ SDValue Hi =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(EltBits, DL, MVT::i8)));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+ }
+
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
@@ -6991,7 +7247,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7166,9 +7422,9 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2 = DAG.getBitcast(MVT::v2i64, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
- DAG.getConstant(
- V2Index * EltVT.getSizeInBits()/8, DL,
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+ DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+ DAG.getDataLayout(), VT)));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -8518,6 +8774,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return V;
+
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
@@ -8670,6 +8931,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
+ // See if we can use SSE4A Extraction / Insertion.
+ if (Subtarget->hasSSE4A())
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return V;
+
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
@@ -10613,12 +10879,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MaskEltVT.getSizeInBits());
Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl),
- Idx, DAG.getConstant(0, dl, getPointerTy()));
+ getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+ DAG.getConstant(0, dl, PtrVT));
SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
- Perm, DAG.getConstant(0, dl, getPointerTy()));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+ DAG.getConstant(0, dl, PtrVT));
}
return SDValue();
}
@@ -11009,17 +11276,16 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
- CP->getAlignment(),
- CP->getOffset(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetConstantPool(
+ CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
@@ -11042,17 +11308,16 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
else if (Subtarget->isPICStyleStubPIC())
OpFlag = X86II::MO_PIC_BASE_OFFSET;
- SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
- OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
@@ -11080,24 +11345,24 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
OpFlag = X86II::MO_DARWIN_NONLAZY;
}
- SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
SDLoc DL(Op);
- Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
- Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
- Result);
+ Result =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
// For symbols that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlag))
- Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
return Result;
@@ -11112,20 +11377,19 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
- SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
- OpFlags);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
@@ -11139,40 +11403,40 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
CodeModel::Model M = DAG.getTarget().getCodeModel();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
// A direct static reference to a global.
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
Offset = 0;
} else {
- Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+ Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
else
- Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+ Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
- Result);
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlags))
- Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(), false, false, false, 0);
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
- Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
- DAG.getConstant(Offset, dl, getPointerTy()));
+ Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+ DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
@@ -11336,22 +11600,23 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Subtarget->isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget->is64Bit())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
- return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(
- GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
- DAG.getTarget().getRelocationModel() == Reloc::PIC_);
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() ==
+ Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -11374,13 +11639,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
- SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
- Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- DAG.getNode(X86ISD::GlobalBaseReg,
- SDLoc(), getPointerTy()),
+ Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
@@ -11397,8 +11661,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
- Chain.getValue(1));
+ return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget->isTargetKnownWindowsMSVC() ||
@@ -11426,50 +11689,50 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
: Type::getInt32PtrTy(*DAG.getContext(),
257));
- SDValue TlsArray =
- Subtarget->is64Bit()
- ? DAG.getIntPtrConstant(0x58, dl)
- : (Subtarget->isTargetWindowsGNU()
- ? DAG.getIntPtrConstant(0x2C, dl)
- : DAG.getExternalSymbol("_tls_array", getPointerTy()));
+ SDValue TlsArray = Subtarget->is64Bit()
+ ? DAG.getIntPtrConstant(0x58, dl)
+ : (Subtarget->isTargetWindowsGNU()
+ ? DAG.getIntPtrConstant(0x2C, dl)
+ : DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
- DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
- MachinePointerInfo(Ptr), false, false, false, 0);
+ DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
+ false, false, 0);
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
- SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget->is64Bit())
- IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32, false, false,
false, 0);
else
- IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
- false, false, false, 0);
+ IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
+ false, false, 0);
- SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
- getPointerTy());
- IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+ auto &DL = DAG.getDataLayout();
+ SDValue Scale =
+ DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+ IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
- res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
- res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
- false, false, false, 0);
+ res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
+ false, 0);
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
- SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+ SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
- return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+ return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
@@ -11564,8 +11827,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot,
MachinePointerInfo::getFixedStack(SSFI),
@@ -11614,7 +11878,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
@@ -11656,7 +11921,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
- SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -11666,7 +11932,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
@@ -11882,6 +12148,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (Op.getValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -11904,9 +12171,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
- SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
- getPointerTy(), StackSlot, WordOff);
+ SDValue WordOff = DAG.getConstant(4, dl, PtrVT);
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo(),
false, false, 0);
@@ -11940,22 +12206,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
- SDValue SignSet = DAG.getSetCC(dl,
- getSetCCResultType(*DAG.getContext(), MVT::i64),
- Op.getOperand(0),
- DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ SDValue SignSet = DAG.getSetCC(
+ dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)),
- getPointerTy());
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
Zero, Four);
- FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
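The magic constants in LowerUINT_TO_FP_i64 above deserve a gloss: 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84, so ORing the low and high 32-bit halves of the u64 into their mantissas produces 2^52 + lo and 2^84 + hi*2^32, and subtracting the exponent terms recovers the value with a single rounding. A self-contained scalar illustration of the same trick (my own sketch, not the patch's vectorized code; the function name is hypothetical):

  #include <cstdint>
  #include <cstring>

  double u64_to_f64(uint64_t x) {
    // View each 32-bit half as the mantissa of a double with a fixed exponent.
    uint64_t lo = (x & 0xffffffffu) | 0x4330000000000000ull; // == 2^52 + lo
    uint64_t hi = (x >> 32)         | 0x4530000000000000ull; // == 2^84 + hi*2^32
    double dlo, dhi;
    std::memcpy(&dlo, &lo, sizeof dlo);
    std::memcpy(&dhi, &hi, sizeof dhi);
    // Both subtractions are exact; only the final add rounds, so the result
    // is the correctly rounded conversion of x.
    return (dhi - 0x1p84) + (dlo - 0x1p52);
  }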
@@ -11974,6 +12238,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
@@ -11998,7 +12263,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
if (!IsSigned && isIntegerTypeFTOL(DstTy))
@@ -12032,7 +12297,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
- StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
MachineMemOperand *MMO =
@@ -12403,7 +12668,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
Constant *C = ConstantInt::get(*Context, MaskElt);
C = ConstantVector::getSplat(NumElts, C);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
+ SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
@@ -12462,7 +12727,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
CV[0] = ConstantFP::get(*Context,
APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
Constant *C = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -12483,7 +12749,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
}
C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
+ CPIdx = DAG.getConstantPool(C, PtrVT, 16);
SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
@@ -13352,8 +13618,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (hasMinMax) {
switch (SetCCOpcode) {
default: break;
- case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+ case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+ case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
@@ -14172,8 +14438,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment =
- DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -14613,7 +14879,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
- EVT SPTy = getPointerTy();
+ MVT SPTy = getPointerTy(DAG.getDataLayout());
if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -14630,8 +14896,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
"have nested arguments.");
}
- const TargetRegisterClass *AddrRegClass =
- getRegClassFor(getPointerTy());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
@@ -14666,6 +14931,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+ auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -14674,8 +14940,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
- SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
}
@@ -14695,8 +14960,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
Store = DAG.getStore(Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
MVT::i32),
@@ -14704,20 +14968,16 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4, DL));
- SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
MachinePointerInfo(SV, 8),
false, false, 0);
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(8, DL));
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, 16),
false, false, 0);
MemOps.push_back(Store);
@@ -14739,7 +14999,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
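For reference, the four stores LowerVASTART emits above (at byte offsets 0, 4, 8 and 16 from the va_list pointer) materialize the SysV x86-64 va_list record. The equivalent C view, with field names taken from the psABI (the struct name is mine, and this is an illustration, not code from this patch):

  #include <cstdint>

  struct VaList64 {                // sizeof == 24 on x86-64
    uint32_t gp_offset;            // FuncInfo->getVarArgsGPOffset()
    uint32_t fp_offset;            // FuncInfo->getVarArgsFPOffset(), at FIN+4
    void    *overflow_arg_area;    // VarArgsFrameIndex slot, stored at FIN+8
    void    *reg_save_area;        // RegSaveFrameIndex slot, stored at FIN+16
  };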
@@ -14768,7 +15028,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
- SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+ SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
@@ -14995,6 +15255,20 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+ if (!Fn->hasPersonalityFn())
+ report_fatal_error(
+ "querying registration node size for function without personality");
+ // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+ // WinEHStatePass for the full struct definition.
+ switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+ case EHPersonality::MSVC_X86SEH: return 24;
+ case EHPersonality::MSVC_CXX: return 16;
+ default: break;
+ }
+ report_fatal_error("can only recover FP for MSVC EH personality functions");
+}
+
/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
@@ -15009,7 +15283,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT PtrVT = TLI.getPointerTy();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
@@ -15017,15 +15291,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
if (!Fn->hasPersonalityFn())
return EntryEBP;
- // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
- // WinEHStatePass for the full struct definition.
- int RegNodeSize;
- switch (classifyEHPersonality(Fn->getPersonalityFn())) {
- default:
- report_fatal_error("can only recover FP for MSVC EH personality functions");
- case EHPersonality::MSVC_X86SEH: RegNodeSize = 24; break;
- case EHPersonality::MSVC_CXX: RegNodeSize = 16; break;
- }
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration.
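getSEHRegistrationNodeSize above centralizes the 24-byte (SEH) versus 16-byte (C++ EH) registration-node sizes; recoverFramePointer then reduces the recovery to two subtractions, spelled out in the next hunk's comments. A plain-C sketch of that arithmetic (illustrative only; the function name is mine):

  #include <cstdint>

  // RegNodeSize is 24 (6 words) for the SEH personality and 16 (4 words) for
  // C++ EH; RegNodeFrameOffset comes from the frame-offset label emitted by
  // WinEHStatePass.
  uint8_t *recoverParentFP(uint8_t *EntryEBP, int RegNodeSize,
                           int RegNodeFrameOffset) {
    uint8_t *RegNodeBase = EntryEBP - RegNodeSize;   // EH registration node
    return RegNodeBase - RegNodeFrameOffset;         // parent frame pointer
  }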
@@ -15034,7 +15300,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
GlobalValue::getRealLinkageName(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue RegNodeFrameOffset =
- DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSymVal);
+ DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - RegNodeFrameOffset
@@ -15059,6 +15325,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_4OP:
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -15143,7 +15412,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue Rnd;
if (Op.getNumOperands() == 6)
Rnd = Op.getOperand(5);
- else
+ else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
Rnd),
@@ -15173,7 +15442,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
- case VPERM_3OP_MASKZ:
+ case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
@@ -15499,6 +15768,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
"llvm.x86.seh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
+
+ case Intrinsic::localaddress: {
+ // Returns one of the stack, base, or frame pointer registers, depending on
+ // which is used to reference local variables.
+ MachineFunction &MF = DAG.getMachineFunction();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned Reg;
+ if (RegInfo->hasBasePointer(MF))
+ Reg = RegInfo->getBaseRegister();
+ else // This function handles the SP or FP case.
+ Reg = RegInfo->getPtrSizedFrameRegister(MF);
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+ }
}
}
@@ -15712,34 +15994,60 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
+ const Function *Fn = MF.getFunction();
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
+ assert(Subtarget->getFrameLowering()->hasFP(MF) &&
+ "using llvm.x86.seh.restoreframe requires a frame pointer");
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = TLI.getPointerTy();
+ MVT VT = TLI.getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
unsigned SPReg = RegInfo->getStackRegister();
+ unsigned SlotSize = RegInfo->getSlotSize();
// Get incoming EBP.
SDValue IncomingEBP = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
- // Load [EBP-24] into SP.
- SDValue SPAddr =
- DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, DAG.getConstant(-24, dl, VT));
+ // SP is saved in the first field of every registration node, so load
+ // [EBP-RegNodeSize] into SP.
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+ SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
+ DAG.getConstant(-RegNodeSize, dl, VT));
SDValue NewSP =
DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
false, VT.getScalarSizeInBits() / 8);
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
- // FIXME: Restore the base pointer in case of stack realignment!
+ if (!RegInfo->needsStackRealignment(MF)) {
+ // Adjust EBP to point back to the original frame position.
+ SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
+ } else {
+ assert(RegInfo->hasBasePointer(MF) &&
+ "functions with Win32 EH must use frame or base pointer register");
+
+ // Reload the base pointer (ESI) with the adjusted incoming EBP.
+ SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
+ Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
+
+ // Reload the spilled EBP value, now that the stack and base pointers are
+ // set up.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setHasSEHFramePtrSave(true);
+ int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
+ X86FI->setSEHFramePtrSaveIndex(FI);
+ SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
+ MachinePointerInfo(), false, false, false,
+ VT.getScalarSizeInBits() / 8);
+ Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
+ }
- // Adjust EBP to point back to the original frame position.
- SDValue NewFP = recoverFramePointer(DAG, MF.getFunction(), IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
return Chain;
}
@@ -15910,7 +16218,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -15969,14 +16277,36 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ const MachineFunction &MF = DAG.getMachineFunction();
+
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
+ .Case("ebp", X86::EBP)
+ .Case("rbp", X86::RBP)
.Default(0);
+
+ if (Reg == X86::EBP || Reg == X86::RBP) {
+ if (!TFI.hasFP(MF))
+ report_fatal_error("register " + StringRef(RegName) +
+ " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+ else {
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+ "Invalid Frame Register!");
+ }
+#endif
+ }
+
if (Reg)
return Reg;
+
report_fatal_error("Invalid register name global variable");
}
@@ -15992,7 +16322,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
@@ -16211,7 +16541,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
@@ -16572,7 +16903,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
- getPointerTy());
+ getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
@@ -16642,9 +16973,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
if (IsSigned && !Subtarget->hasSSE41()) {
- SDValue ShAmt =
- DAG.getConstant(31, dl,
- DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ SDValue ShAmt = DAG.getConstant(
+ 31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
@@ -16717,6 +17048,38 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+ auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+ MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue Ex = DAG.getBitcast(ExVT, R);
+
+ if (ShiftAmt >= 32) {
+ // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+ SDValue Upper = + getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); + SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt - 32, DAG); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {9, 1, 11, 3, 13, 5, 15, 7}); + } else { + // SRA upper i32, SHL whole i64 and select lower i32. + SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt, DAG); + SDValue Lower = + getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); + Lower = DAG.getBitcast(ExVT, Lower); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {8, 1, 10, 3, 12, 5, 14, 7}); + } + return DAG.getBitcast(VT, Ex); + }; + // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { @@ -16725,6 +17088,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + // i64 SRA needs to be performed as partial shifts. + if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); + if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -16808,7 +17176,12 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (ShAmt != ShiftAmt) return SDValue(); } - return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); } return SDValue(); @@ -16890,7 +17263,9 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } - return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } @@ -17042,6 +17417,53 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } } + // v4i32 Non Uniform Shifts. + // If the shift amount is constant we can shift each lane using the SSE2 + // immediate shifts, else we need to zero-extend each lane to the lower i64 + // and shift using the SSE2 variable shifts. + // The separate results can then be blended together. + if (VT == MVT::v4i32) { + unsigned Opc = Op.getOpcode(); + SDValue Amt0, Amt1, Amt2, Amt3; + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); + } else { + // ISD::SHL is handled above but we include it here for completeness. 
+ switch (Opc) { + default: + llvm_unreachable("Unknown target vector shift node"); + case ISD::SHL: + Opc = X86ISD::VSHL; + break; + case ISD::SRL: + Opc = X86ISD::VSRL; + break; + case ISD::SRA: + Opc = X86ISD::VSRA; + break; + } + // The SSE2 shifts use the lower i64 as the same shift amount for + // all lanes and the upper i64 is ignored. These shuffle masks + // optimally zero-extend each lane on SSE2/SSE41/AVX targets. + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); + } + + SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); + SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); + SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); + SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); + SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); + return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17944,7 +18366,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ?
(Type*)StructType::get(ArgTy, ArgTy, nullptr) @@ -18443,10 +18866,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::UMAX: return "X86ISD::UMAX"; - case X86ISD::UMIN: return "X86ISD::UMIN"; - case X86ISD::SMAX: return "X86ISD::SMAX"; - case X86ISD::SMIN: return "X86ISD::SMIN"; case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; @@ -18456,6 +18875,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; @@ -18478,6 +18899,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; + case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -18594,16 +19016,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; + case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; + case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; } return nullptr; } // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. -bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, +bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // X86 supports extremely general addressing modes. 
CodeModel::Model M = getTargetMachine().getCodeModel(); @@ -19555,7 +19980,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(getPointerTy()); + getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), @@ -19750,7 +20175,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MemOpndSlot = CurOp; - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -19882,7 +20307,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); - MVT PVT = getPointerTy(); + MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -21377,7 +21802,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) @@ -21513,14 +21938,15 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); - EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + auto &DL = DAG.getDataLayout(); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); - SDValue ShAmt = DAG.getConstant(32, dl, - DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + SDValue ShAmt = DAG.getConstant( + 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); @@ -21539,10 +21965,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; - SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy()); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + SDValue ScalarAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, @@ -21622,16 +22049,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; + Opc = hasUnsigned ? ISD::UMIN : 0; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; + Opc = hasUnsigned ? ISD::UMAX : 0; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? 
X86ISD::SMIN : 0u; break; + Opc = hasSigned ? ISD::SMIN : 0; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; + Opc = hasSigned ? ISD::SMAX : 0; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && @@ -21640,16 +22067,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; + Opc = hasUnsigned ? ISD::UMAX : 0; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; + Opc = hasUnsigned ? ISD::UMIN : 0; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMAX : 0u; break; + Opc = hasSigned ? ISD::SMAX : 0; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMIN : 0u; break; + Opc = hasSigned ? ISD::SMIN : 0; break; } } @@ -22106,7 +22533,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Check if the selector will be produced by CMPP*/PCMP* Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -22826,7 +23254,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->getZExtValue() == 1) + if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } @@ -23478,7 +23906,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Increment = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); @@ -23687,7 +24116,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); - SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy()); + SDValue Stride = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); @@ -23760,8 +24190,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl, - TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
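The matchIntegerMINMAX switches above now return the generic ISD::UMIN/UMAX/SMIN/SMAX opcodes instead of the retired X86ISD variants. A scalar sketch of the pattern being matched (illustrative names, not LLVM API): a select whose condition compares the same two values folds to a min or max, with min and max swapping when the select arms are reversed relative to the comparison.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  enum class CC { ULT, UGT, SLT, SGT }; // subset of ISD condition codes

  // Model of "(a cc b) ? a : b" after the combine picks a min/max node.
  uint32_t foldSelectToMinMax(CC cc, uint32_t a, uint32_t b) {
    switch (cc) {
    case CC::ULT: return std::min(a, b); // ISD::UMIN
    case CC::UGT: return std::max(a, b); // ISD::UMAX
    case CC::SLT: // ISD::SMIN: compare the lanes as signed values
      return (uint32_t)std::min((int32_t)a, (int32_t)b);
    case CC::SGT: // ISD::SMAX
      return (uint32_t)std::max((int32_t)a, (int32_t)b);
    }
    return 0;
  }

  int main() {
    assert(foldSelectToMinMax(CC::ULT, 1u, 0xFFFFFFFFu) == 1u);
    // Signed view: 0xFFFFFFFF is -1, so SMIN picks it.
    assert(foldSelectToMinMax(CC::SLT, 0xFFFFFFFFu, 1u) == 0xFFFFFFFFu);
    return 0;
  }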
@@ -24659,6 +25089,31 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) + // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + + if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's @@ -24913,6 +25368,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -25135,7 +25591,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -25149,7 +25605,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); + StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) @@ -25176,7 +25632,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. X86TargetLowering::ConstraintType -X86TargetLowering::getConstraintType(const std::string &Constraint) const { +X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': @@ -25508,7 +25964,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, + StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. 
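The PerformUINT_TO_FPCombine added above rewrites UINT_TO_FP of vXi8/vXi16 as a zero-extension to i32 lanes followed by SINT_TO_FP when the unsigned form is not legal. Why this is safe, as a scalar model (an illustration, not the DAG code): a zero-extended i8/i16 value is at most 65535, comfortably non-negative as an i32, so signed and unsigned conversion produce identical results.

  #include <cassert>
  #include <cstdint>

  // Scalar model: UINT_TO_FP(i16) == SINT_TO_FP(ZEXT(i16 -> i32)).
  float uintToFloatViaSigned(uint16_t x) {
    int32_t widened = (int32_t)(uint32_t)x; // ZEXT; always >= 0
    return (float)widened;                  // signed convert == unsigned here
  }

  int main() {
    assert(uintToFloatViaSigned(0) == 0.0f);
    assert(uintToFloatViaSigned(65535) == 65535.0f);
    return 0;
  }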
@@ -25717,8 +26173,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return Res; } -int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty, +int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), @@ -25738,7 +26194,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. - if (isLegalAddressingMode(AM, Ty, AS)) + if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 17660891635c..723d5304495c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -233,12 +233,6 @@ namespace llvm { /// Floating point horizontal sub. FHSUB, - /// Unsigned integer max and min. - UMAX, UMIN, - - /// Signed integer max and min. - SMAX, SMIN, - // Integer absolute value ABS, @@ -298,8 +292,8 @@ namespace llvm { // Vector FP round. VFPROUND, - // Vector signed integer to double. - CVTDQ2PD, + // Vector signed/unsigned integer to double. + CVTDQ2PD, CVTUDQ2PD, // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -400,10 +394,15 @@ namespace llvm { VINSERT, VEXTRACT, + /// SSE4A Extraction and Insertion. + EXTRQI, INSERTQI, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, + // Vector multiply packed unsigned integers with round and scale + MULHRS, // FMA nodes FMADD, @@ -429,6 +428,9 @@ namespace llvm { // with rounding mode SINT_TO_FP_RND, UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + FP_TO_SINT_RND, FP_TO_UINT_RND, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, @@ -599,7 +601,9 @@ namespace llvm { unsigned getJumpTableEncoding() const override; bool useSoftFloat() const override; - MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, @@ -617,7 +621,8 @@ namespace llvm { /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. - unsigned getByValTypeAlignment(Type *Ty) const override; + unsigned getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const override; /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove @@ -685,7 +690,8 @@ namespace llvm { bool isCheapToSpeculateCtlz() const override; /// Return the value type to use for ISD::SETCC. - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; /// Determine which of the bits specified in Mask are known to be either /// zero or one and return them in the KnownZero/KnownOne bitsets.
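The X86ISelLowering.h hunk above declares X86ISD::EXTRQI and X86ISD::INSERTQI for the SSE4A bit-field instructions. A rough scalar model of their immediate forms, operating on the low 64 bits of an XMM register (a simplification: real EXTRQ/INSERTQ leave some out-of-field bits undefined, while this sketch zeroes or preserves them):

  #include <cassert>
  #include <cstdint>

  // Simplified EXTRQ: extract `len` bits starting at bit `idx`; len == 0
  // selects the full 64 bits. The result is right-justified and zero-filled.
  uint64_t extrq(uint64_t src, unsigned len, unsigned idx) {
    uint64_t mask = (len == 0 || len >= 64) ? ~0ULL : ((1ULL << len) - 1);
    return (src >> idx) & mask;
  }

  // Simplified INSERTQ: write the low `len` bits of `src` into `dst` at `idx`.
  // (Modeled as preserving the other bits; hardware leaves some undefined.)
  uint64_t insertq(uint64_t dst, uint64_t src, unsigned len, unsigned idx) {
    uint64_t mask = (len == 0 || len >= 64) ? ~0ULL : ((1ULL << len) - 1);
    return (dst & ~(mask << idx)) | ((src & mask) << idx);
  }

  int main() {
    assert(extrq(0xABCD1234u, 16, 16) == 0xABCDu);
    assert(insertq(0x0, 0xFF, 8, 8) == 0xFF00u);
    return 0;
  }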
@@ -707,8 +713,7 @@ namespace llvm { bool ExpandInlineAsm(CallInst *CI) const override; - ConstraintType - getConstraintType(const std::string &Constraint) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. @@ -726,8 +731,8 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - unsigned getInlineAsmMemConstraint( - const std::string &ConstraintCode) const override { + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "i") return InlineAsm::Constraint_i; else if (ConstraintCode == "o") @@ -745,13 +750,12 @@ namespace llvm { /// error, this returns a register number of 0. std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const override; + StringRef Constraint, MVT VT) const override; /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty, - unsigned AS) const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -770,7 +774,7 @@ namespace llvm { /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty, + int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool isVectorShiftByScalarCheap(Type *Ty) const override; @@ -872,7 +876,8 @@ namespace llvm { return nullptr; // nothing to do, move along. } - unsigned getRegisterByName(const char* RegName, EVT VT) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. 
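The X86InstrAVX512.td diff that follows wires vpmulhw/vpmulhuw/vpmulhrsw to DAG nodes, including the new X86ISD::MULHRS. For reference, a scalar sketch of the pmulhrsw "multiply high with round and scale" operation on one i16 lane; this mirrors the architectural definition, and the code is an illustration rather than LLVM source:

  #include <cassert>
  #include <cstdint>

  // One lane of pmulhrsw: widen to 32 bits, multiply, shift right 14,
  // round by adding 1, then shift right 1 and truncate to 16 bits.
  int16_t mulhrs(int16_t a, int16_t b) {
    int32_t prod = (int32_t)a * (int32_t)b;
    return (int16_t)(((prod >> 14) + 1) >> 1);
  }

  int main() {
    assert(mulhrs(16384, 16384) == 8192);   // 0.5 * 0.5 = 0.25 in Q15
    assert(mulhrs(-16384, 16384) == -8192); // -0.5 * 0.5 = -0.25
    return 0;
  }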
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b309b8210851..faa91500b181 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -3136,6 +3136,12 @@ defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, + HasBWI, 1>; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, SSE_INTALU_ITINS_P, HasBWI, 1>; @@ -3230,32 +3236,32 @@ let Predicates = [HasBWI] in { defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; //===----------------------------------------------------------------------===// @@ -4035,7 +4041,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, AVX512FMA3Base, EVEX_B; } } @@ -4394,16 +4400,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : 
avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR32, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32, v4f32x_info, i32mem, loadi32, "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86SuintToFpRnd, GR32, v2f64x_info, +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info, i32mem, loadi32, "cvtusi2sd{l}">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86SuintToFpRnd, GR64, +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -4604,117 +4610,389 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT, - Domain d> { -let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; - def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [], d>, EVEX, EVEX_B, EVEX_RC; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; -} // hasSideEffects = 0 +//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from signed/unsigned integer to float/double +// and from float/double to signed/unsigned integer +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + string Broadcast = _.BroadcastStr, + string Alias = ""> { + + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (_.VT (OpNode (_Src.VT + (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, + "${src}"##Broadcast, "${src}"##Broadcast, + (_.VT (OpNode (_Src.VT + (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + ))>, EVEX, EVEX_B; +} +// Conversion with SAE - suppress all exceptions +multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), + (i32 FROUND_NO_EXC)))>, + EVEX, EVEX_B; } -multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT, - Domain d> {
-let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; -} // hasSideEffects = 0 +// Conversion with rounding control (RC) +multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, + "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; } -defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, - loadv8f64, f512mem, v8f32, v8f64, - SSEPackedSingle>, EVEX_V512, VEX_W, PD, - EVEX_CD8<64, CD8VF>; +// Extend Float to Double +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, + X86vfpextRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + X86vfpext, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>, + EVEX_V256; + } +} + +// Truncate Double to Float +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86vfproundRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, + X86vfpround, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">, + VEX_W, PD, EVEX_CD8<64, CD8VF>; +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">, + PS, EVEX_CD8<32, CD8VH>; -defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, - loadv4f64, f256mem, v8f64, v8f32, - SSEPackedDouble>, EVEX_V512, PS, - EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; -def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), - (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))), - (VCVTPD2PSZrr VR512:$src)>; +let Predicates = [HasVLX] in { + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDZ256rm addr:$src)>; +} -def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), - (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), imm:$rc)), - (VCVTPD2PSZrrb VR512:$src, imm:$rc)>; +// Convert Signed/Unsigned Doubleword to Double +multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128> { + // No rounding in this op + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>, + EVEX_V512; -//===----------------------------------------------------------------------===// -// AVX-512 Vector convert from sign integer to float/double -//===----------------------------------------------------------------------===// + let 
Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, + OpNode128, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>, + EVEX_V256; + } +} -defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, - loadv8i64, i512mem, v16f32, v16i32, - SSEPackedSingle>, EVEX_V512, PS, - EVEX_CD8<32, CD8VF>; +// Convert Signed/Unsigned Doubleword to Float +multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, + OpNodeRnd>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword +multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. 
They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} -defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, - loadv4i64, i256mem, v8f64, v8i32, - SSEPackedDouble>, EVEX_V512, XS, +// Convert Double to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Double +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Float +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, - loadv16f32, f512mem, v16i32, v16f32, - SSEPackedSingle>, EVEX_V512, XS, +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, + X86VSintToFpRnd>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, + X86VFpToSintRnd>, + XS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, + X86VFpToSintRnd>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, + X86VFpToUintRnd>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - loadv8f64, f512mem, v8i32, v8f64, - SSEPackedDouble>, EVEX_V512, PD, VEX_W, +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, + X86VFpToUintRnd>, PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, - loadv16f32, f512mem, v16i32, v16f32, - SSEPackedSingle>, EVEX_V512, PS, +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>, + XS, EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, + X86VUintToFpRnd>, XD, EVEX_CD8<32, CD8VF>; -// cvttps2udq (src, 0, mask-all-ones, sae-current) -def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)), - (VCVTTPS2UDQZrr VR512:$src)>; +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, - loadv8f64, f512mem, v8i32, v8f64, - SSEPackedDouble>, EVEX_V512, PS, VEX_W, +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int, + X86cvtpd2IntRnd>, XD, VEX_W, EVEX_CD8<64, CD8VF>; -// cvttpd2udq (src, 0, mask-all-ones, sae-current) -def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), - (VCVTTPD2UDQZrr VR512:$src)>; +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt, + X86cvtps2UIntRnd>, + PS, EVEX_CD8<32, CD8VF>; +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PS, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, - loadv4i64, f256mem, v8f64, v8i32, -
SSEPackedDouble>, EVEX_V512, XS, - EVEX_CD8<32, CD8VH>; +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int, + X86cvtpd2IntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, - loadv16i32, f512mem, v16f32, v16i32, - SSEPackedSingle>, EVEX_V512, XD, - EVEX_CD8<32, CD8VF>; +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt, + X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, + X86VFpToSlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, + X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, + X86VFpToUlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, + X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; + +let Predicates = [NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -4734,67 +5012,8 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; - -def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), - (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; -def : Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VCVTDQ2PDZrr VR256X:$src)>; -def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), - (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>; -def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VCVTUDQ2PDZrr VR256X:$src)>; - -multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, PatFrag mem_frag, - X86MemOperand x86memop, Domain d> { -let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [], d>, EVEX; - def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [], d>, EVEX, EVEX_B, EVEX_RC; - let mayLoad = 1 in - def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [], d>, EVEX; -} // hasSideEffects = 0 } -defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512, - loadv16f32, f512mem, 
SSEPackedSingle>, PD, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X, - loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, - EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), imm:$rc)), - (VCVTPS2DQZrrb VR512:$src, imm:$rc)>; - -def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), imm:$rc)), - (VCVTPD2DQZrrb VR512:$src, imm:$rc)>; - -defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512, - loadv16f32, f512mem, SSEPackedSingle>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X, - loadv8f64, f512mem, SSEPackedDouble>, VEX_W, - PS, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1), imm:$rc)), - (VCVTPS2UDQZrrb VR512:$src, imm:$rc)>; - -def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2udq_512 (v8f64 VR512:$src), - (v8i32 immAllZerosV), (i8 -1), imm:$rc)), - (VCVTPD2UDQZrrb VR512:$src, imm:$rc)>; - let Predicates = [HasAVX512] in { def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), (VCVTPD2PSZrm addr:$src)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 6ab961f04ecf..4cd5563ce727 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -105,14 +105,16 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in // jecxz. let Uses = [CX] in def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jcxz\t$dst", [], IIC_JCXZ>, AdSize16; + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16, + Requires<[Not64BitMode]>; let Uses = [ECX] in def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; let Uses = [RCX] in def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64; + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64, + Requires<[In64BitMode]>; } // Indirect branches diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index fe245c3a7e38..1f61ffa84e9a 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -39,11 +39,6 @@ def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; -def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>; -def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>; -def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>; -def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>; - def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -75,6 +70,9 @@ def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; +def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -187,6 +185,7 @@ def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>; def X86avg : SDNode<"X86ISD::AVG" , 
SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; @@ -208,6 +207,14 @@ def X86pmuldq : SDNode<"X86ISD::PMULDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. // Note: this is a work in progress... @@ -357,8 +364,70 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; -def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; -def X86SuintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i32>, + SDTCisInt<2>]>; +def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i64>, + SDTCisInt<2>]>; + +def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i32>, + SDTCisInt<2>]>; +def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i64>, + SDTCisInt<2>]>; + +// Scalar +def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; + +// Vector with rounding mode + +// cvtt fp-to-int stuff +def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>; +def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>; +def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>; +def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>; + +def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>; +def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>; +def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>; + +// cvt fp-to-int stuff +def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>; +def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>; +def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>; +def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>; + +// Vector without rounding mode +def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>; +def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; +def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; +def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; + +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<2>]>>; +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisInt<2>]>>; //===----------------------------------------------------------------------===// // SSE Complex Patterns diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b92ba99fb100..786150760b93 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -269,14 +269,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8mr, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) { - unsigned RegOp = MemoryFoldTable2Addr[i].RegOp; - unsigned MemOp = MemoryFoldTable2Addr[i].MemOp; - unsigned Flags = MemoryFoldTable2Addr[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 0, folded load and store, no alignment requirement. - Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { @@ -424,12 +421,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) { - unsigned RegOp = MemoryFoldTable0[i].RegOp; - unsigned MemOp = MemoryFoldTable0[i].MemOp; - unsigned Flags = MemoryFoldTable0[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, - RegOp, MemOp, TB_INDEX_0 | Flags); + Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); } static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { @@ -862,14 +856,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) { - unsigned RegOp = MemoryFoldTable1[i].RegOp; - unsigned MemOp = MemoryFoldTable1[i].MemOp; - unsigned Flags = MemoryFoldTable1[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 1, folded load - Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { @@ -1116,6 +1107,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, @@ -1412,6 +1405,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, @@ -1733,14 +1728,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SHA256RNDS2rr, 
X86::SHA256RNDS2rm, TB_ALIGN_16 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) { - unsigned RegOp = MemoryFoldTable2[i].RegOp; - unsigned MemOp = MemoryFoldTable2[i].MemOp; - unsigned Flags = MemoryFoldTable2[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) { AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 2, folded load - Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { @@ -1949,14 +1941,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) { - unsigned RegOp = MemoryFoldTable3[i].RegOp; - unsigned MemOp = MemoryFoldTable3[i].MemOp; - unsigned Flags = MemoryFoldTable3[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 3, folded load - Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { @@ -2001,14 +1990,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } }; - for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) { - unsigned RegOp = MemoryFoldTable4[i].RegOp; - unsigned MemOp = MemoryFoldTable4[i].MemOp; - unsigned Flags = MemoryFoldTable4[i].Flags; + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - RegOp, MemOp, + Entry.RegOp, Entry.MemOp, // Index 4, folded load - Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } } @@ -3820,7 +3806,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, X86::MOVPQIto64rr); if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. - return X86::MOVSDto64rr; + return X86::MMX_MOVD64from64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. if (X86::VR128XRegClass.contains(DestReg)) @@ -3828,7 +3814,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, X86::MOV64toPQIrr); // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) - return X86::MOV64toSDrr; + return X86::MMX_MOVD64to64rr; } // SrcReg(FR32) -> DestReg(GR32) @@ -6413,22 +6399,40 @@ static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { hasVirtualRegDefsInBasicBlock(*MI1, MBB) && MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) return true; - + return false; } +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (and, or) +static bool isAssociativeAndCommutative(unsigned Opcode) { + switch (Opcode) { + case X86::ADDSDrr: + case X86::ADDSSrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::MULSDrr: + case X86::MULSSrr: + case X86::VMULSDrr: + case X86::VMULSSrr: + return true; + default: + return false; + } +} + /// Return true if the input instruction is part of a chain of dependent ops /// that are suitable for reassociation, otherwise return false. /// If the instruction's operands must be commuted to have a previous /// instruction of the same type define the first source operand, Commuted will /// be set to true. 
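//
// [Editor's note] A minimal sketch of the rewrite the machine combiner performs
// once isAssociativeAndCommutative() approves the opcode (virtual register
// names are illustrative, not from the patch):
//
//   %vr2 = ADDSSrr %vr0, %vr1      // Prev: B = A op X
//   %vr4 = ADDSSrr %vr2, %vr3      // Root: C = B op Y
//
// becomes
//
//   %vr5 = ADDSSrr %vr1, %vr3      // X op Y no longer waits on the def of A
//   %vr4 = ADDSSrr %vr0, %vr5      // C = A op (X op Y)
//
// which is what reassociateOps() below emits (MIB1, then MIB2); associativity
// and commutativity of the opcode are what make the two forms equivalent.
//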
-static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode, - bool &Commuted) { - // 1. The instruction must have the correct type. +static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { + // 1. The operation must be associative and commutative. // 2. The instruction must have virtual register definitions for its // operands in the same basic block. - // 3. The instruction must have a reassociatable sibling. - if (Inst.getOpcode() == AssocOpcode && + // 3. The instruction must have a reassociable sibling. + if (isAssociativeAndCommutative(Inst.getOpcode()) && hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && hasReassocSibling(Inst, Commuted)) return true; @@ -6455,14 +6459,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, // B = A op X (Prev) // C = B op Y (Root) - // TODO: There are many more associative instruction types to match: - // 1. Other forms of scalar FP add (non-AVX) - // 2. Other data types (double, integer, vectors) - // 3. Other math / logic operations (mul, and, or) - unsigned AssocOpcode = X86::VADDSSrr; - - bool Commute = false; - if (isReassocCandidate(Root, AssocOpcode, Commute)) { + bool Commute; + if (isReassocCandidate(Root, Commute)) { // We found a sequence of instructions that may be suitable for a // reassociation of operands to increase ILP. Specify each commutation // possibility for the Prev instruction in the sequence and let the @@ -6512,7 +6510,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); MachineOperand &OpC = Root.getOperand(0); - + unsigned RegA = OpA.getReg(); unsigned RegB = OpB.getReg(); unsigned RegX = OpX.getReg(); @@ -6547,7 +6545,7 @@ static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, .addReg(RegX, getKillRegState(KillX)) .addReg(RegY, getKillRegState(KillY)); InsInstrs.push_back(MIB1); - + MachineInstrBuilder MIB2 = BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) .addReg(RegA, getKillRegState(KillA)) @@ -6579,7 +6577,7 @@ void X86InstrInfo::genAlternativeCodeSequence( Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); } assert(Prev && "Unknown pattern for machine combiner"); - + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); return; } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 6f38cb8eaf33..52bab9c79b45 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -194,7 +194,7 @@ def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; -def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER", +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; @@ -1028,14 +1028,13 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[Not64BitMode]>; + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; + def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; -def PUSHi16 
: Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[Not64BitMode]>; def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; @@ -1081,9 +1080,6 @@ let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; -def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), - "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16, - Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2a896dfe8aa8..a5ff9edf05a3 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4035,13 +4035,13 @@ defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, SSE_INTALU_ITINS_P, 0>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, SSE_INTALU_ITINS_P, 0>; -defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8, +defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, SSE_INTALU_ITINS_P, 1>; -defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16, +defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, SSE_INTALU_ITINS_P, 1>; -defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8, +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, SSE_INTALU_ITINS_P, 1>; -defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16, +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SSE_INTALU_ITINS_P, 1>; // Intrinsic forms @@ -6834,29 +6834,28 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } let Predicates = [HasAVX, NoVLX] in { - let isCommutable = 0 in - defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; defm VPMULDQ : 
SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, @@ -6865,29 +6864,28 @@ let Predicates = [HasAVX, NoVLX] in { } let Predicates = [HasAVX2, NoVLX] in { - let isCommutable = 0 in - defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, @@ -6896,22 +6894,21 @@ let Predicates = [HasAVX2, NoVLX] in { } let Constraints = "$src1 = $dst" in { - let isCommutable = 0 in - defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, + defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128, + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128, + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128, + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128, + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128, + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, VR128, 
memopv2i64, i128mem, @@ -7773,7 +7770,7 @@ let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, imm:$idx))]>, PD; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -7784,8 +7781,8 @@ def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, - VR128:$src2, imm:$len, imm:$idx))]>, XD; + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, XD; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "insertq\t{$mask, $src|$src, $mask}", diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 61a33484b8bf..2c8b95bcba22 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -19,7 +19,7 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, @@ -213,18 +213,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), 
X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), @@ -596,60 +596,69 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, X86ISD::SMIN, 0), - 
X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), @@ -1008,10 +1017,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, 
INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), @@ -1049,14 +1058,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), @@ -1070,6 +1079,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index d598b55aae3e..e6db9708b677 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -30,59 +30,67 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// pointer for reasons other than it containing dynamic allocation or /// that FP eliminatation is turned off. For example, Cygwin main function /// contains stack pointer re-alignment code which requires FP. - bool ForceFramePointer; + bool ForceFramePointer = false; /// RestoreBasePointerOffset - Non-zero if the function has base pointer /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a /// displacement from the frame pointer to a slot where the base pointer /// is stashed. - signed char RestoreBasePointerOffset; + signed char RestoreBasePointerOffset = 0; /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. - unsigned CalleeSavedFrameSize; + unsigned CalleeSavedFrameSize = 0; /// BytesToPopOnReturn - Number of bytes function pops on return (in addition /// to the space used by the return address). 
/// Used on windows platform for stdcall & fastcall name decoration - unsigned BytesToPopOnReturn; + unsigned BytesToPopOnReturn = 0; /// ReturnAddrIndex - FrameIndex for return slot. - int ReturnAddrIndex; + int ReturnAddrIndex = 0; /// \brief FrameIndex for return slot. - int FrameAddrIndex; + int FrameAddrIndex = 0; /// TailCallReturnAddrDelta - The number of bytes by which return address /// stack slot is moved as the result of tail call optimization. - int TailCallReturnAddrDelta; + int TailCallReturnAddrDelta = 0; /// SRetReturnReg - Some subtargets require that sret lowering includes /// returning the value of the returned struct in a register. This field /// holds the virtual register into which the sret argument is passed. - unsigned SRetReturnReg; + unsigned SRetReturnReg = 0; /// GlobalBaseReg - keeps track of the virtual register initialized for /// use as the global base register. This is used for PIC in some PIC /// relocation models. - unsigned GlobalBaseReg; + unsigned GlobalBaseReg = 0; /// VarArgsFrameIndex - FrameIndex for start of varargs area. - int VarArgsFrameIndex; + int VarArgsFrameIndex = 0; /// RegSaveFrameIndex - X86-64 vararg func register save area. - int RegSaveFrameIndex; + int RegSaveFrameIndex = 0; /// VarArgsGPOffset - X86-64 vararg func int reg offset. - unsigned VarArgsGPOffset; + unsigned VarArgsGPOffset = 0; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. - unsigned VarArgsFPOffset; + unsigned VarArgsFPOffset = 0; /// ArgumentStackSize - The number of bytes on stack consumed by the arguments /// being passed on the stack. - unsigned ArgumentStackSize; + unsigned ArgumentStackSize = 0; /// NumLocalDynamics - Number of local-dynamic TLS accesses. - unsigned NumLocalDynamics; + unsigned NumLocalDynamics = 0; /// HasPushSequences - Keeps track of whether this function uses sequences /// of pushes to pass function parameters. - bool HasPushSequences; + bool HasPushSequences = false; + + /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill + /// slot for the frame pointer. + bool HasSEHFramePtrSave = false; + + /// The frame index of a stack object containing the original frame pointer + /// used to address arguments in a function using a base pointer. 
+ int SEHFramePtrSaveIndex = 0; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers @@ -90,40 +98,9 @@ private: SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; public: - X86MachineFunctionInfo() : ForceFramePointer(false), - RestoreBasePointerOffset(0), - CalleeSavedFrameSize(0), - BytesToPopOnReturn(0), - ReturnAddrIndex(0), - FrameAddrIndex(0), - TailCallReturnAddrDelta(0), - SRetReturnReg(0), - GlobalBaseReg(0), - VarArgsFrameIndex(0), - RegSaveFrameIndex(0), - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} - - explicit X86MachineFunctionInfo(MachineFunction &MF) - : ForceFramePointer(false), - RestoreBasePointerOffset(0), - CalleeSavedFrameSize(0), - BytesToPopOnReturn(0), - ReturnAddrIndex(0), - FrameAddrIndex(0), - TailCallReturnAddrDelta(0), - SRetReturnReg(0), - GlobalBaseReg(0), - VarArgsFrameIndex(0), - RegSaveFrameIndex(0), - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} + X86MachineFunctionInfo() = default; + + explicit X86MachineFunctionInfo(MachineFunction &MF) {}; bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -174,6 +151,12 @@ public: unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; } + void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; } + + int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } + void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 0033b5058187..d8495e53e0e3 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -202,7 +202,7 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { @@ -343,7 +343,7 @@ X86RegisterInfo::getNoPreservedMask() const { BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); @@ -452,7 +452,7 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // use both the SP and the FP, we need a separate base pointer register. 
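//
// [Editor's note] The X86MachineFunctionInfo hunk above trades two hand-written
// constructor init-lists for C++11 in-class default member initializers. A
// minimal sketch of the idiom, using a hypothetical type rather than the real
// class:
//
//   struct FuncInfoSketch {
//     bool ForceFramePointer = false;    // every constructor starts from these
//     unsigned CalleeSavedFrameSize = 0; // defaults, so a new field cannot be
//     int ReturnAddrIndex = 0;           // forgotten in one of the init-lists
//     FuncInfoSketch() = default;        // nothing left to keep in sync
//   };
//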
bool CantUseFP = needsStackRealignment(MF); bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust(); + MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); return CantUseFP && CantUseSP; } @@ -477,9 +477,9 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86FrameLowering *TFI = getFrameLowering(MF); const Function *F = MF.getFunction(); - unsigned StackAlign = - MF.getSubtarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->hasFnAttribute(Attribute::StackAlignment)); @@ -503,7 +503,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; @@ -519,18 +519,17 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); - // FRAME_ALLOC uses a single offset, with no register. It only works in the + // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. - if (Opc == TargetOpcode::FRAME_ALLOC) { + if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; if (IsWinEH) - Offset = static_cast<const X86FrameLowering *>(TFI) - ->getFrameIndexOffsetFromSP(MF, FrameIndex); + Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); else Offset = TFI->getFrameIndexOffset(MF, FrameIndex); FI.ChangeToImmediate(Offset); @@ -584,7 +583,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const X86FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? FramePtr : StackPtr; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 5ca40bc0091b..ce79fcf9ad81 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -24,11 +24,6 @@ using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) - : TargetSelectionDAGInfo(&DL) {} - -X86SelectionDAGInfo::~X86SelectionDAGInfo() {} - bool X86SelectionDAGInfo::isBaseRegConflictPossible( SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { // We cannot use TRI->hasBasePointer() until *after* we select all basic @@ -37,7 +32,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( // dynamic stack adjustments (hopefully rare) and the base pointer would // conflict if we had to use it. 
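//
// [Editor's note] The same predicate rename lands at both call sites in this
// section (X86RegisterInfo::hasBasePointer above and isBaseRegConflictPossible
// here); the test itself is unchanged:
//
//   before: MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust()
//   after:  MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment()
//
// The broader name presumably reflects that inline asm is only one source of
// stack-pointer adjustments the frame lowering cannot see through.
//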
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (!MFI->hasVarSizedObjects() && !MFI->hasInlineAsmWithSPAdjust()) + if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment()) return false; const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( @@ -81,8 +76,9 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + EVT IntPtr = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h index eb7e0ed9de6c..961bd8c8d5ef 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.h +++ b/lib/Target/X86/X86SelectionDAGInfo.h @@ -29,8 +29,7 @@ class X86SelectionDAGInfo : public TargetSelectionDAGInfo { ArrayRef<unsigned> ClobberSet) const; public: - explicit X86SelectionDAGInfo(const DataLayout &DL); - ~X86SelectionDAGInfo(); + explicit X86SelectionDAGInfo() = default; SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 3b25d30dc221..dff3624b7efe 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -68,7 +68,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; - bool isDecl = GV->isDeclarationForLinker(); + bool isDef = GV->isStrongDefinitionForLinker(); // X86-64 in PIC mode. if (isPICStyleRIPRel()) { @@ -80,8 +80,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If symbol visibility is hidden, the extra load is not needed if // target is x86-64 or the symbol is definitely defined in the current // translation unit. - if (GV->hasDefaultVisibility() && - (isDecl || GV->isWeakForLinker())) + if (GV->hasDefaultVisibility() && !isDef) return X86II::MO_GOTPCREL; } else if (!isTargetWin64()) { assert(isTargetELF() && "Unknown rip-relative target"); @@ -107,7 +106,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If this is a strong reference to a definition, it is definitely not // through a stub. - if (!isDecl && !GV->isWeakForLinker()) + if (isDef) return X86II::MO_PIC_BASE_OFFSET; // Unless we have a symbol with hidden visibility, we have to go through a @@ -117,7 +116,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If symbol visibility is hidden, we have a stub for common symbol // references and external declarations. - if (isDecl || GV->hasCommonLinkage()) { + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) { // Hidden $non_lazy_ptr reference. return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE; } @@ -131,7 +130,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // If this is a strong reference to a definition, it is definitely not // through a stub. 
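//
// [Editor's note] The isDecl -> isDef rewrite in ClassifyGlobalReference leans
// on the GlobalValue helper it switches to; assuming that helper's definition
// at the time,
//
//   GV->isStrongDefinitionForLinker()
//       == !(GV->isDeclarationForLinker() || GV->isWeakForLinker())
//
// the new "if (isDef)" below is exactly the old
// "if (!isDecl && !GV->isWeakForLinker())", and the earlier
// "(isDecl || GV->isWeakForLinker())" guard becomes "!isDef" by De Morgan.
//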
- if (!isDecl && !GV->isWeakForLinker()) + if (isDef) return X86II::MO_NO_FLAG; // Unless we have a symbol with hidden visibility, we have to go through a @@ -193,12 +192,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } - // If feature string is not empty, parse features string. + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); - // Make sure the right MCSchedModel is used. - InitCPUSchedModel(CPUName); - InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -298,9 +294,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU, TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - TSInfo(*TM.getDataLayout()), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering(*this, getStackAlignment()) { + TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (TM.getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d420abbe1433..f026d4295f71 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -447,8 +447,26 @@ public: } bool isCallingConvWin64(CallingConv::ID CC) const { - return (isTargetWin64() && CC != CallingConv::X86_64_SysV) || - CC == CallingConv::X86_64_Win64; + switch (CC) { + // On Win64, all these conventions just use the default convention. + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::X86_FastCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case CallingConv::Intel_OCL_BI: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::X86_64_Win64: + return true; + // This convention allows using the SysV convention on Windows targets. + case CallingConv::X86_64_SysV: + return false; + // Otherwise, who knows what this is. + default: + return false; + } } /// ClassifyGlobalReference - Classify a global variable reference for the diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 0c82a700952b..7df726091843 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -89,7 +89,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -117,6 +117,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry<MVT::SimpleValueType> AVX2UniformConstCostTable[] = { + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence @@ -211,6 +213,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. 
{ ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -261,12 +264,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. // It is not a good idea to vectorize division. We have to scalarize it and @@ -352,7 +355,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); unsigned Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -364,7 +367,7 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. @@ -464,8 +467,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst); + std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); static const TypeConversionCostTblEntry<MVT::SimpleValueType> SSE2ConvTbl[] = { @@ -537,8 +540,8 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Idx != -1) return AVX512ConversionTbl[Idx].Cost; } - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) @@ -667,7 +670,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -740,7 +743,7 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. 
if (!LT.second.isVector()) @@ -803,7 +806,7 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); @@ -850,9 +853,9 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); unsigned Cost = 0; - if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() && + if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + @@ -887,7 +890,7 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -1117,11 +1120,11 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { int DataWidth = DataTy->getPrimitiveSizeInBits(); - + // Todo: AVX512 allows gather/scatter, works with strided and random as well if ((DataWidth < 32) || (Consecutive == 0)) return false; - if (ST->hasAVX512() || ST->hasAVX2()) + if (ST->hasAVX512() || ST->hasAVX2()) return true; return false; } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index a83158440193..da3f36c2e27e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -40,7 +40,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { public: explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) - : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. 
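//
// [Editor's note] Just below, the patch deletes the hand-written copy/move
// assignment operators of X86TTIImpl while keeping the copy/move constructors.
// A plausible reading (an assumption; the patch does not say): BaseT is now
// constructed with the module's DataLayout, per the constructor change above,
//
//   X86TTIImpl(const X86TargetMachine *TM, Function &F)
//       : BaseT(TM, F.getParent()->getDataLayout()), ... {}
//
// and base-class state tied to a DataLayout can be copy-constructed but not
// usefully reassigned, so the assignment operators are dropped rather than
// kept in sync by hand.
//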
X86TTIImpl(const X86TTIImpl &Arg) @@ -48,18 +49,6 @@ public: X86TTIImpl(X86TTIImpl &&Arg) : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} - X86TTIImpl &operator=(const X86TTIImpl &RHS) { - BaseT::operator=(static_cast<const BaseT &>(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - X86TTIImpl &operator=(X86TTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } /// \name Scalar TTI Implementations /// @{ diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 90357257b9ef..9190d0be9e4d 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -113,8 +113,8 @@ char WinEHStatePass::ID = 0; bool WinEHStatePass::doInitialization(Module &M) { TheModule = &M; - FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::frameescape); - FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::framerecover); + FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); + FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover); FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress); return false; } @@ -133,7 +133,7 @@ bool WinEHStatePass::doFinalization(Module &M) { void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { // This pass should only insert a stack allocation, memory accesses, and - // framerecovers. + // localrecovers. AU.setPreservesCFG(); } @@ -336,9 +336,11 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { FunctionType *TargetFuncTy = FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5), /*isVarArg=*/false); - Function *Trampoline = Function::Create( - TrampolineTy, GlobalValue::InternalLinkage, - Twine("__ehhandler$") + ParentFunc->getName(), TheModule); + Function *Trampoline = + Function::Create(TrampolineTy, GlobalValue::InternalLinkage, + Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + ParentFunc->getName()), + TheModule); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); IRBuilder<> Builder(EntryBB); Value *LSDA = emitEHLSDA(Builder, ParentFunc); @@ -419,14 +421,14 @@ void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { } /// Escape RegNode so that we can access it from child handlers. Find the call -/// to frameescape, if any, in the entry block and append RegNode to the list +/// to localescape, if any, in the entry block and append RegNode to the list /// of arguments. int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to frameescape and extract its arguments. + // Find the call to localescape and extract its arguments. IntrinsicInst *EscapeCall = nullptr; for (Instruction &I : F.getEntryBlock()) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::frameescape) { + if (II && II->getIntrinsicID() == Intrinsic::localescape) { EscapeCall = II; break; } @@ -440,8 +442,10 @@ int WinEHStatePass::escapeRegNode(Function &F) { // Replace the call (if it exists) with new one. Otherwise, insert at the end // of the entry block. - IRBuilder<> Builder(&F.getEntryBlock(), - EscapeCall ? 
EscapeCall : F.getEntryBlock().end()); + Instruction *InsertPt = EscapeCall; + if (!EscapeCall) + InsertPt = F.getEntryBlock().getTerminator(); + IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); Builder.CreateCall(FrameEscape, Args); if (EscapeCall) EscapeCall->eraseFromParent(); @@ -520,6 +524,11 @@ void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { for (auto &Handler : ActionList) { if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); +#ifndef NDEBUG + for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) + assert(Pred->isLandingPad() && + "WinEHPrepare failed to split block"); +#endif ExceptBlocks.insert(BA->getBasicBlock()); } } diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index ac954d0a8fa4..b4085835f285 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -40,7 +40,7 @@ static MCInstrInfo *createXCoreMCInstrInfo() { return X; } -static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) { +static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); InitXCoreMCRegisterInfo(X, XCore::LR); return X; @@ -48,9 +48,7 @@ static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) { static MCSubtargetInfo * createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitXCoreMCSubtargetInfo(X, TT, CPU, FS); - return X; + return createXCoreMCSubtargetInfoImpl(TT, CPU, FS); } static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI, @@ -64,7 +62,8 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM, +static MCCodeGenInfo *createXCoreMCCodeGenInfo(const Triple &TT, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index bd834cc5be4b..76c3d8130e75 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -525,12 +525,15 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -void XCoreFrameLowering:: -processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { +void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); - bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool LRUsed = MRI.isPhysRegModified(XCore::LR); if (!LRUsed && !MF.getFunction()->isVarArg() && MF.getFrameInfo()->estimateStackSize(MF)) @@ -550,7 +553,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (LRUsed) { // We will handle the LR in the prologue/epilogue // and allocate space on the stack ourselves. 
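//
// [Editor's note] A before/after sketch of the callee-saved-register hook
// migration this XCore hunk performs (signatures as they appear in the patch):
//
//   before: void processFunctionBeforeCalleeSavedScan(MF, RS);
//           MF.getRegInfo().setPhysRegUnused(XCore::LR);  // mutate global state
//   after:  void determineCalleeSaves(MF, SavedRegs, RS); // base class first
//           SavedRegs.reset(XCore::LR);                   // clear a mask bit
//
// The query side moves the same way: isPhysRegUsed(XCore::LR) becomes
// MRI.isPhysRegModified(XCore::LR), asking whether LR is ever written instead
// of consulting a usage flag kept elsewhere.
//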
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 607c77248952..69c71adc8d3f 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -47,8 +47,8 @@ namespace llvm {
 
     bool hasFP(const MachineFunction &MF) const override;
 
-    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                              RegScavenger *RS = nullptr) const override;
+    void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                              RegScavenger *RS = nullptr) const override;
 
     void processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                              RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index f5b180b1ac0d..9d4a966dfba4 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -144,10 +144,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
                                      MVT::i32, MskSize);
       }
       else if (!isUInt<16>(Val)) {
-        SDValue CPIdx =
-          CurDAG->getTargetConstantPool(ConstantInt::get(
-                                          Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                        getTargetLowering()->getPointerTy());
+        SDValue CPIdx = CurDAG->getTargetConstantPool(
+            ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+            getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
         SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
                                               MVT::Other, CPIdx,
                                               CurDAG->getEntryNode());
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index aa71241102ff..d62e7428299d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -281,7 +281,8 @@ static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL)
   if (!ObjType->isSized())
     return false;
 
-  unsigned ObjSize = XTL.getDataLayout()->getTypeAllocSize(ObjType);
+  auto &DL = GV->getParent()->getDataLayout();
+  unsigned ObjSize = DL.getTypeAllocSize(ObjType);
   return ObjSize < CodeModelLargeSize && ObjSize != 0;
 }
 
@@ -312,8 +313,9 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
     Constant *GAI = ConstantExpr::getGetElementPtr(
         Type::getInt8Ty(*DAG.getContext()), GA, Idx);
     SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
-    return DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), CP,
-                       MachinePointerInfo(), false, false, false, 0);
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
+                       DAG.getEntryNode(), CP, MachinePointerInfo(), false,
+                       false, false, 0);
   }
 }
 
@@ -321,11 +323,11 @@ SDValue XCoreTargetLowering::
 LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
 {
   SDLoc DL(Op);
-
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
 
-  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, PtrVT, Result);
 }
 
 SDValue XCoreTargetLowering::
@@ -378,9 +380,10 @@ SDValue XCoreTargetLowering::
 lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
                                        int64_t Offset, SelectionDAG &DAG) const
 {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
   if ((Offset & 0x3) == 0) {
-    return DAG.getLoad(getPointerTy(), DL, Chain, Base, MachinePointerInfo(),
-                       false, false, false, 0);
+    return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo(), false,
+                       false, false, 0);
   }
   // Lower to pair of consecutive word aligned loads plus some bit shifting.
   int32_t HighOffset = RoundUpToAlignment(Offset, 4);
@@ -401,11 +404,9 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
   SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, DL, MVT::i32);
   SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, DL, MVT::i32);
 
-  SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
-                            LowAddr, MachinePointerInfo(),
+  SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo(),
                             false, false, false, 0);
-  SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
-                             HighAddr, MachinePointerInfo(),
+  SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo(),
                              false, false, false, 0);
   SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
   SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
@@ -435,8 +436,9 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
                                     LD->getAlignment()))
     return SDValue();
 
-  unsigned ABIAlignment = getDataLayout()->
-    getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  auto &TD = DAG.getDataLayout();
+  unsigned ABIAlignment = TD.getABITypeAlignment(
+      LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
   // Leave aligned load alone.
   if (LD->getAlignment() >= ABIAlignment)
     return SDValue();
@@ -486,7 +488,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Lower to a call to __misaligned_load(BasePtr).
-  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -495,10 +497,11 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL).setChain(Chain)
-    .setCallee(CallingConv::C, IntPtrTy,
-               DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
-               std::move(Args), 0);
+  CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+      CallingConv::C, IntPtrTy,
+      DAG.getExternalSymbol("__misaligned_load",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args), 0);
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   SDValue Ops[] = { CallResult.first, CallResult.second };
@@ -516,8 +519,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
                                     ST->getAlignment())) {
     return SDValue();
   }
-  unsigned ABIAlignment = getDataLayout()->
-    getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+      ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
   // Leave aligned store alone.
   if (ST->getAlignment() >= ABIAlignment) {
     return SDValue();
   }
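Most of the churn in this file is one mechanical migration: TargetLowering::getPointerTy() and the alignment queries no longer reach for a cached DataLayout but take the one the SelectionDAG (and ultimately the module) already owns. A condensed before/after sketch, with `TLI`, `DAG`, and `Ty` standing for the usual lowering objects:

  // Before (3.6-era API): the DataLayout was fetched implicitly.
  //   EVT PtrVT = TLI.getPointerTy();
  //   unsigned Align = TLI.getDataLayout()->getABITypeAlignment(Ty);
  // After this import: the DataLayout is passed in explicitly.
  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  unsigned Align = DAG.getDataLayout().getABITypeAlignment(Ty);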
@@ -545,7 +548,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   }
 
   // Lower to a call to __misaligned_store(BasePtr, Value).
-  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 
@@ -557,10 +560,11 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-               DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
-               std::move(Args), 0);
+  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__misaligned_store",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args), 0);
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.second;
@@ -833,9 +837,9 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   int FI = XFI->createLRSpillSlot(MF);
   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
-  return DAG.getLoad(getPointerTy(), SDLoc(Op), DAG.getEntryNode(), FIN,
-                     MachinePointerInfo::getFixedStack(FI), false, false,
-                     false, 0);
+  return DAG.getLoad(
+      getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN,
+      MachinePointerInfo::getFixedStack(FI), false, false, false, 0);
 }
 
 SDValue XCoreTargetLowering::
@@ -979,11 +983,10 @@ LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
   if (N->getMemoryVT() == MVT::i32) {
     if (N->getAlignment() < 4)
       report_fatal_error("atomic load must be aligned");
-    return DAG.getLoad(getPointerTy(), SDLoc(Op), N->getChain(),
-                       N->getBasePtr(), N->getPointerInfo(),
-                       N->isVolatile(), N->isNonTemporal(),
-                       N->isInvariant(), N->getAlignment(),
-                       N->getAAInfo(), N->getRanges());
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                       N->getChain(), N->getBasePtr(), N->getPointerInfo(),
+                       N->isVolatile(), N->isNonTemporal(), N->isInvariant(),
+                       N->getAlignment(), N->getAAInfo(), N->getRanges());
   }
   if (N->getMemoryVT() == MVT::i16) {
     if (N->getAlignment() < 2)
@@ -1150,9 +1153,10 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = RetCCInfo.getNextStackOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
 
-  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, dl,
-                                      getPointerTy(), true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector<SDValue, 12> MemOpChains;
@@ -1239,11 +1243,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain,
-                             DAG.getConstant(NumBytes, dl, getPointerTy(),
-                                             true),
-                             DAG.getConstant(0, dl, getPointerTy(), true),
-                             InFlag, dl);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+                             DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -1830,7 +1831,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       if (StoreBits % 8) {
         break;
       }
-      unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
+      unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
           ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
       unsigned Alignment = ST->getAlignment();
       if (Alignment >= ABIAlignment) {
@@ -1924,15 +1925,13 @@ static inline bool isImmUs4(int64_t val)
 
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
-bool
-XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                           Type *Ty,
-                                           unsigned AS) const {
+bool XCoreTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
-  const DataLayout *TD = TM.getDataLayout();
-  unsigned Size = TD->getTypeAllocSize(Ty);
+  unsigned Size = DL.getTypeAllocSize(Ty);
   if (AM.BaseGV) {
     return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
                  AM.BaseOffs%4 == 0;
@@ -1970,7 +1969,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
 
 std::pair<unsigned, const TargetRegisterClass *>
 XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                                  const std::string &Constraint,
+                                                  StringRef Constraint,
                                                   MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 97f0494b6fe3..ddd675c5164d 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -101,7 +101,9 @@ namespace llvm {
 
     unsigned getJumpTableEncoding() const override;
 
-    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+    MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override {
+      return MVT::i32;
+    }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
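The same DataLayout threading reaches the TargetLowering hooks themselves (getScalarShiftAmountTy, isLegalAddressingMode), and inline-asm constraints shrink from const std::string& to StringRef. A sketch of an override against the new signatures, using a hypothetical MyTargetLowering and a made-up legality rule for illustration:

  bool MyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
                                               unsigned AS) const {
    // Type sizes now come from the DataLayout parameter, not TM.getDataLayout().
    unsigned Size = DL.getTypeAllocSize(Ty);
    return AM.Scale == 0 && AM.BaseOffs % 4 == 0 && Size >= 4;
  }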
@@ -120,8 +122,8 @@ namespace llvm {
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const override;
 
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
-                               unsigned AS) const override;
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
 
   private:
     const TargetMachine &TM;
@@ -175,8 +177,7 @@ namespace llvm {
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                 const std::string &Constraint,
-                                 MVT VT) const override;
+                                 StringRef Constraint, MVT VT) const override;
 
     // Expand specifics
     SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 1d569e8936df..1cfb57dc3af3 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -222,7 +222,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     XCore::R8, XCore::R9,
     0
   };
-  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(*MF);
   if (TFI->hasFP(*MF))
     return CalleeSavedRegsFP;
   return CalleeSavedRegs;
@@ -230,7 +230,7 @@ XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
 BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
 
   Reserved.set(XCore::CP);
   Reserved.set(XCore::DP);
@@ -270,7 +270,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   const XCoreInstrInfo &TII =
       *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   int StackSize = MF.getFrameInfo()->getStackSize();
@@ -324,7 +324,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
 
   return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
 }
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index a34884480cea..40568d124de0 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -16,12 +16,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "xcore-selectiondag-info"
 
-XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL)
-    : TargetSelectionDAGInfo(&DL) {}
-
-XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
-}
-
 SDValue XCoreSelectionDAGInfo::
 EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                         SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -36,18 +30,20 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
     const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
     TargetLowering::ArgListTy Args;
     TargetLowering::ArgListEntry Entry;
-    Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
     Entry.Node = Dst; Args.push_back(Entry);
     Entry.Node = Src; Args.push_back(Entry);
     Entry.Node = Size; Args.push_back(Entry);
 
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(dl).setChain(Chain)
-      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
-                 Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
-                 std::move(Args), 0)
-      .setDiscardResult();
+    CLI.setDebugLoc(dl)
+        .setChain(Chain)
+        .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol("__memcpy_4",
+                                         TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args), 0)
+        .setDiscardResult();
 
     std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
     return CallResult.second;
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index cfd80b3f3172..77b3527d77e3 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -22,8 +22,6 @@ class XCoreTargetMachine;
 
 class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
 public:
-  explicit XCoreSelectionDAGInfo(const DataLayout &DL);
-  ~XCoreSelectionDAGInfo();
 
   SDValue
   EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index c98518b60225..99ad2c88504f 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -28,4 +28,4 @@ void XCoreSubtarget::anchor() { }
 XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS, const TargetMachine &TM)
     : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
-      TLInfo(TM, *this), TSInfo(*TM.getDataLayout()) {}
+      TLInfo(TM, *this), TSInfo() {}
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 370b64b26688..f420081868f9 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -85,6 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() {
 }
 
 TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis(
-      [this](Function &) { return TargetTransformInfo(XCoreTTIImpl(this)); });
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(XCoreTTIImpl(this, F));
+  });
 }
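TargetTransformInfo implementations are now constructed per function so that BasicTTIImplBase can be seeded with the module's DataLayout, which is why getTargetIRAnalysis threads the Function through the lambda. The resulting pattern, sketched with a hypothetical MyTargetMachine/MyTTIImpl pair:

  TargetIRAnalysis MyTargetMachine::getTargetIRAnalysis() {
    return TargetIRAnalysis([this](Function &F) {
      // F supplies the parent module, and with it the DataLayout.
      return TargetTransformInfo(MyTTIImpl(this, F));
    });
  }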
a/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -37,8 +37,9 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
   const XCoreTargetLowering *getTLI() const { return TLI; }
 
 public:
-  explicit XCoreTTIImpl(const XCoreTargetMachine *TM)
-      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+  explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+        TLI(ST->getTargetLowering()) {}
 
   // Provide value semantics. MSVC requires that we spell all of these out.
   XCoreTTIImpl(const XCoreTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
   XCoreTTIImpl(XCoreTTIImpl &&Arg)
       : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
         TLI(std::move(Arg.TLI)) {}
-  XCoreTTIImpl &operator=(const XCoreTTIImpl &RHS) {
-    BaseT::operator=(static_cast<const BaseT &>(RHS));
-    ST = RHS.ST;
-    TLI = RHS.TLI;
-    return *this;
-  }
-  XCoreTTIImpl &operator=(XCoreTTIImpl &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    ST = std::move(RHS.ST);
-    TLI = std::move(RHS.TLI);
-    return *this;
-  }
 
   unsigned getNumberOfRegisters(bool Vector) {
     if (Vector) {