author     Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-06-26 20:32:52 +0000
commit     08bbd35a80bf7765fe0d3043f9eb5a2f2786b649 (patch)
tree       80108f0f128657f8623f8f66ad9735b4d88e7b47 /lib/Target
parent     7c7aba6e5fef47a01a136be655b0a92cfd7090f6 (diff)
Diffstat (limited to 'lib/Target')
157 files changed, 4121 insertions, 1766 deletions
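
Two changes dominate this drop: a new AArch64CondBrTuning machine pass, and selection of the ARMv8.1 LSE atomic instructions when the subtarget has them. As a rough sketch of what the branch-tuning pass buys (adapted from the examples in the pass's own header comment below; the function names and assembly are illustrative, not output captured from this commit):

    // Hypothetical source where a compare-against-zero feeds a branch and the
    // sum is dead after the test:
    int g();
    int f(int a, int b) {
      if (a + b == 0)
        return g();
      return 0;
    }

    // Before the pass: the add leaves NZCV untouched, so the backend emits a
    // compare-and-branch on the add's result:
    //   add w8, w0, w1
    //   cbz w8, .LBB0_2
    //
    // After the pass: since w8 has no other uses, the def is rewritten to its
    // flag-setting form (CMN is an alias of ADDS to the zero register) and the
    // CBZ becomes a plain conditional branch, which schedules more flexibly:
    //   cmn  w0, w1
    //   b.eq .LBB0_2

As the code below checks, the rewrite only fires when no instruction between the def and the branch reads or clobbers NZCV.
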
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 3e0e3978b90b5..37b9690d0434a 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -31,6 +31,7 @@ class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); FunctionPass *createAArch64RedundantCopyEliminationPass(); +FunctionPass *createAArch64CondBrTuning(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, @@ -55,6 +56,7 @@ void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); +void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp new file mode 100644 index 0000000000000..f27bc97ec3f3e --- /dev/null +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -0,0 +1,336 @@ +//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions +/// into a conditional branch (B.cond), when the NZCV flags can be set for +/// "free". This is preferred on targets that have more flexibility when +/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming +/// all other variables are equal). This can also reduce register pressure. +/// +/// A few examples: +/// +/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS. +/// cbz w8, .LBB_2 -> b.eq .LBB0_2 +/// +/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses. +/// cbz w8, .LBB1_2 -> b.eq .LBB1_2 +/// +/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
+/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2 +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-cond-br-tuning" +#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning" + +namespace { +class AArch64CondBrTuning : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MachineRegisterInfo *MRI; + +public: + static char ID; + AArch64CondBrTuning() : MachineFunctionPass(ID) { + initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; } + +private: + MachineInstr *getOperandDef(const MachineOperand &MO); + MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting); + MachineInstr *convertToCondBr(MachineInstr &MI); + bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI); +}; +} // end anonymous namespace + +char AArch64CondBrTuning::ID = 0; + +INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning", + AARCH64_CONDBR_TUNING_NAME, false, false) + +void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { + if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return nullptr; + return MRI->getUniqueVRegDef(MO.getReg()); +} + +MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, + bool IsFlagSetting) { + // If this is already the flag setting version of the instruction (e.g., SUBS) + // just make sure the implicit-def of NZCV isn't marked dead. + if (IsFlagSetting) { + for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); + I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) + MO.setIsDead(false); + } + return &MI; + } + bool Is64Bit; + unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); + unsigned NewDestReg = MI.getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) + NewDestReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(NewOpc), NewDestReg); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + + return MIB; +} + +MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { + AArch64CC::CondCode CC; + MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI); + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBZX: + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + CC = AArch64CC::NE; + break; + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::GE; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::LT; + break; + } + return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TargetMBB); +} + +bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, + MachineInstr &DefMI) { + // We don't want NZCV bits live across blocks. + if (MI.getParent() != DefMI.getParent()) + return false; + + bool IsFlagSetting = true; + unsigned MIOpc = MI.getOpcode(); + MachineInstr *NewCmp = nullptr, *NewBr = nullptr; + switch (DefMI.getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDWrr: + case AArch64::ADDWrs: + case AArch64::ADDWrx: + case AArch64::ANDWri: + case AArch64::ANDWrr: + case AArch64::ANDWrs: + case AArch64::BICWrr: + case AArch64::BICWrs: + case AArch64::SUBWri: + case AArch64::SUBWrr: + case AArch64::SUBWrs: + case AArch64::SUBWrx: + IsFlagSetting = false; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::BICSWrr: + case AArch64::BICSWrs: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + switch (MIOpc) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::TBZW: + case AArch64::TBNZW: + // Check to see if the TBZ/TBNZ is checking the sign bit. + if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) && + MI.getOperand(1).getImm() != 31) + return false; + + // There must not be any instruction between DefMI and MI that clobbers or + // reads NZCV. 
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << " Replacing instructions:\n ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << " ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    break;
+
+  case AArch64::ADDXri:
+  case AArch64::ADDXrr:
+  case AArch64::ADDXrs:
+  case AArch64::ADDXrx:
+  case AArch64::ANDXri:
+  case AArch64::ANDXrr:
+  case AArch64::ANDXrs:
+  case AArch64::BICXrr:
+  case AArch64::BICXrs:
+  case AArch64::SUBXri:
+  case AArch64::SUBXrr:
+  case AArch64::SUBXrs:
+  case AArch64::SUBXrx:
+    IsFlagSetting = false;
+  case AArch64::ADDSXri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+  case AArch64::ANDSXri:
+  case AArch64::ANDSXrr:
+  case AArch64::ANDSXrs:
+  case AArch64::BICSXrr:
+  case AArch64::BICSXrs:
+  case AArch64::SUBSXri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+    switch (MIOpc) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+    case AArch64::TBZX:
+    case AArch64::TBNZX: {
+      // Check to see if the TBZ/TBNZ is checking the sign bit.
+      if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+          MI.getOperand(1).getImm() != 63)
+        return false;
+      // There must not be any instruction between DefMI and MI that clobbers or
+      // reads NZCV.
+      MachineBasicBlock::iterator I(DefMI), E(MI);
+      for (I = std::next(I); I != E; ++I) {
+        if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+            I->readsRegister(AArch64::NZCV, TRI))
+          return false;
+      }
+      DEBUG(dbgs() << " Replacing instructions:\n ");
+      DEBUG(DefMI.print(dbgs()));
+      DEBUG(dbgs() << " ");
+      DEBUG(MI.print(dbgs()));
+
+      NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+      NewBr = convertToCondBr(MI);
+      break;
+    }
+    }
+    break;
+  }
+  assert(NewCmp && NewBr && "Expected new instructions.");
+
+  DEBUG(dbgs() << " with instruction:\n ");
+  DEBUG(NewCmp->print(dbgs()));
+  DEBUG(dbgs() << " ");
+  DEBUG(NewBr->print(dbgs()));
+
+  // If this was a flag setting version of the instruction, we use the original
+  // instruction by just clearing the dead marked on the implicit-def of NZCV.
+  // Therefore, we should not erase this instruction.
+ if (!IsFlagSetting) + DefMI.eraseFromParent(); + MI.eraseFromParent(); + return true; +} + +bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n" + << "********** Function: " << MF.getName() << '\n'); + + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + bool LocalChange = false; + for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(), + E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + MachineInstr *DefMI = getOperandDef(MI.getOperand(0)); + LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI)); + break; + } + // If the optimization was successful, we can't optimize any other + // branches because doing so would clobber the NZCV flags. + if (LocalChange) { + Changed = true; + break; + } + } + } + return Changed; +} + +FunctionPass *llvm::createAArch64CondBrTuning() { + return new AArch64CondBrTuning(); +} diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 544f67433fd53..ee54550c9900b 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -13,7 +13,9 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -84,6 +86,51 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); continue; } + if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) { + // XZ/WZ for LSE can only be used when acquire semantics are not used, + // LDOPAL WZ is an invalid opcode. 
+ switch (MI.getOpcode()) { + case AArch64::CASALb: + case AArch64::CASALh: + case AArch64::CASALs: + case AArch64::CASALd: + case AArch64::SWPALb: + case AArch64::SWPALh: + case AArch64::SWPALs: + case AArch64::SWPALd: + case AArch64::LDADDALb: + case AArch64::LDADDALh: + case AArch64::LDADDALs: + case AArch64::LDADDALd: + case AArch64::LDEORALb: + case AArch64::LDEORALh: + case AArch64::LDEORALs: + case AArch64::LDEORALd: + case AArch64::LDSETALb: + case AArch64::LDSETALh: + case AArch64::LDSETALs: + case AArch64::LDSETALd: + case AArch64::LDSMINALb: + case AArch64::LDSMINALh: + case AArch64::LDSMINALs: + case AArch64::LDSMINALd: + case AArch64::LDSMAXALb: + case AArch64::LDSMAXALh: + case AArch64::LDSMAXALs: + case AArch64::LDSMAXALd: + case AArch64::LDUMINALb: + case AArch64::LDUMINALh: + case AArch64::LDUMINALs: + case AArch64::LDUMINALd: + case AArch64::LDUMAXALb: + case AArch64::LDUMAXALh: + case AArch64::LDUMAXALs: + case AArch64::LDUMAXALd: + continue; + default: + break; + } + } const MCInstrDesc &Desc = MI.getDesc(); for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 8c2c0a564c302..04687847c1a30 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -201,7 +201,7 @@ private: bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); - void SelectCMP_SWAP(SDNode *N); + bool SelectCMP_SWAP(SDNode *N); }; } // end anonymous namespace @@ -2609,9 +2609,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { } /// We've got special pseudo-instructions for these -void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { +bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + + // Leave IR for LSE if subtarget supports it. + if (Subtarget->hasLSE()) return false; + if (MemTy == MVT::i8) Opcode = AArch64::CMP_SWAP_8; else if (MemTy == MVT::i16) @@ -2637,6 +2641,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); CurDAG->RemoveDeadNode(N); + + return true; } void AArch64DAGToDAGISel::Select(SDNode *Node) { @@ -2660,8 +2666,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; case ISD::ATOMIC_CMP_SWAP: - SelectCMP_SWAP(Node); - return; + if (SelectCMP_SWAP(Node)) + return; + break; case ISD::READ_REGISTER: if (tryReadRegister(Node)) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 083ca2156598f..2965106fd2708 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10563,11 +10563,20 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + if (Size > 128) return AtomicExpansionKind::None; + // Nand not supported in LSE. + if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; + // Currently leaving And and Sub to LLSC + if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) + return AtomicExpansionKind::LLSC; + // Leave 128 bits to LLSC. 
+ return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { + // If subtarget has LSE, leave cmpxchg intact for codegen. + if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 71826bec6b11f..de283b70210fe 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -405,3 +405,49 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; + +// v8.1 Atomic instructions: +def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : 
Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>; + +def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index eea012382150c..314e89bbca863 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1036,7 +1036,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { +static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; @@ -1145,7 +1145,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } unsigned Opc = CmpInstr.getOpcode(); - unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); @@ -3318,7 +3318,7 @@ static bool getMaddPatterns(MachineInstr &Root, // When NZCV is live bail out. if (Cmp_NZCV == -1) return false; - unsigned NewOpc = convertFlagSettingOpcode(Root); + unsigned NewOpc = convertToNonFlagSettingOpc(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 59f3405fe439a..58e9ce583d44c 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -119,6 +119,44 @@ public: } } + /// \brief Return the opcode that set flags when possible. The caller is + /// responsible for ensuring the opc has a flag setting equivalent. 
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no flag setting equivalent!"); + // 32-bit cases: + case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; + case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; + case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; + case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; + case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; + case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; + case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; + case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; + case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; + case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; + case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; + case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; + case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; + // 64-bit cases: + case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; + case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; + case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; + case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; + case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; + case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; + case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; + case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; + case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; + case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; + case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; + case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; + case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; + } + } + + /// Return true if this is a load/store that can be potentially paired/merged. bool isCandidateToMergeOrPair(MachineInstr &MI) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 9243eb91cc1ac..005f2d51e4036 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -795,6 +795,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + const MachineOperand &StMO = getLdStRegOp(*StoreI); unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); @@ -807,7 +808,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Remove the load, if the destination register of the loads is the same // register for stored value. if (StRt == LdRt && LoadSize == 8) { - StoreI->clearRegisterKills(StRt, TRI); + for (MachineInstr &MI : make_range(StoreI->getIterator(), + LoadI->getIterator())) { + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } + } DEBUG(dbgs() << "Remove load instruction:\n "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << "\n"); @@ -819,7 +826,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) .addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR) - .addReg(StRt) + .add(StMO) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // FIXME: Currently we disable this transformation in big-endian targets as @@ -860,14 +867,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(AndMaskEncoded); } else { BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(Immr) .addImm(Imms); } @@ -876,7 +883,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Clear kill flags between store and load. for (MachineInstr &MI : make_range(StoreI->getIterator(), BitExtMI->getIterator())) - MI.clearRegisterKills(StRt, TRI); + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index 3b71d529db59b..ccc9d2ad1b482 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -7,37 +7,27 @@ // //===----------------------------------------------------------------------===// // -// \file This file contains the AArch64 implementation of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/CodeGen/MacroFusion.h" #include "llvm/Target/TargetInstrInfo.h" -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); - using namespace llvm; -static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - namespace { -/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused -/// together. Given an anchor instr, when the other instr is unspecified, then -/// check if the anchor instr may be part of a fused pair at all. +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const TargetSubtargetInfo &TSI, const MachineInstr *FirstMI, - const MachineInstr *SecondMI) { - assert((FirstMI || SecondMI) && "At least one instr must be specified"); - + const MachineInstr &SecondMI) { const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII); const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI); @@ -45,9 +35,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, unsigned FirstOpcode = FirstMI ? FirstMI->getOpcode() : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); - unsigned SecondOpcode = - SecondMI ? SecondMI->getOpcode() - : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); if (ST.hasArithmeticBccFusion()) // Fuse CMN, CMP, TST followed by Bcc. 
@@ -128,158 +116,49 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (ST.hasFuseAES()) // Fuse AES crypto operations. - switch(FirstOpcode) { + switch(SecondOpcode) { // AES encode. - case AArch64::AESErr: - return SecondOpcode == AArch64::AESMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESMCrr : + return FirstOpcode == AArch64::AESErr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // AES decode. - case AArch64::AESDrr: - return SecondOpcode == AArch64::AESIMCrr || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::AESIMCrr: + return FirstOpcode == AArch64::AESDrr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; } if (ST.hasFuseLiterals()) // Fuse literal generation operations. - switch (FirstOpcode) { + switch (SecondOpcode) { // PC relative address. - case AArch64::ADRP: - return SecondOpcode == AArch64::ADDXri || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDXri: + return FirstOpcode == AArch64::ADRP || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; // 32 bit immediate. - case AArch64::MOVZWi: - return (SecondOpcode == AArch64::MOVKWi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Lower half of 64 bit immediate. - case AArch64::MOVZXi: - return (SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 16) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END; - // Upper half of 64 bit immediate. + case AArch64::MOVKWi: + return (FirstOpcode == AArch64::MOVZWi && + SecondMI.getOperand(3).getImm() == 16) || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // Lower and upper half of 64 bit immediate. case AArch64::MOVKXi: - return FirstMI->getOperand(3).getImm() == 32 && - ((SecondOpcode == AArch64::MOVKXi && - SecondMI->getOperand(3).getImm() == 48) || - SecondOpcode == AArch64::INSTRUCTION_LIST_END); + return FirstOpcode == AArch64::INSTRUCTION_LIST_END || + (FirstOpcode == AArch64::MOVZXi && + SecondMI.getOperand(3).getImm() == 16) || + (FirstOpcode == AArch64::MOVKXi && + FirstMI->getOperand(3).getImm() == 32 && + SecondMI.getOperand(3).getImm() == 48); } return false; } -/// \brief Implement the fusion of instr pairs in the scheduling DAG, -/// anchored at the instr in AnchorSU.. -static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { - const MachineInstr *AnchorMI = AnchorSU.getInstr(); - if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient()) - return false; - - // If the anchor instr is the ExitSU, then consider its predecessors; - // otherwise, its successors. - bool Preds = (&AnchorSU == &DAG->ExitSU); - SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs; - - const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI; - const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr; - - // Check if the anchor instr may be fused. - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - return false; - - // Explorer for fusion candidates among the dependencies of the anchor instr. - for (SDep &Dep : AnchorDeps) { - // Ignore dependencies that don't enforce ordering. - if (Dep.isWeak()) - continue; - - SUnit &DepSU = *Dep.getSUnit(); - // Ignore the ExitSU if the dependents are successors. - if (!Preds && &DepSU == &DAG->ExitSU) - continue; - - const MachineInstr *DepMI = DepSU.getInstr(); - if (!DepMI || DepMI->isPseudo() || DepMI->isTransient()) - continue; - - FirstMI = Preds ? DepMI : AnchorMI; - SecondMI = Preds ? 
AnchorMI : DepMI; - if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(), - FirstMI, SecondMI)) - continue; - - // Create a single weak edge between the adjacent instrs. The only effect is - // to cause bottom-up scheduling to heavily prioritize the clustered instrs. - SUnit &FirstSU = Preds ? DepSU : AnchorSU; - SUnit &SecondSU = Preds ? AnchorSU : DepSU; - DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)); - - // Adjust the latency between the anchor instr and its - // predecessors/successors. - for (SDep &IDep : AnchorDeps) - if (IDep.getSUnit() == &DepSU) - IDep.setLatency(0); - - // Adjust the latency between the dependent instr and its - // successors/predecessors. - for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds) - if (IDep.getSUnit() == &AnchorSU) - IDep.setLatency(0); - - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - FirstSU.print(dbgs(), DAG); dbgs() << " - "; - SecondSU.print(dbgs(), DAG); dbgs() << " / "; - dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << - DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); - - if (&SecondSU != &DAG->ExitSU) - // Make instructions dependent on FirstSU also dependent on SecondSU to - // prevent them from being scheduled between FirstSU and and SecondSU. - for (SUnit::const_succ_iterator - SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); - SI != SE; ++SI) { - if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) - continue; - DEBUG(dbgs() << " Copy Succ "; - SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); - DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); - } - - ++NumFused; - return true; - } - - return false; -} - -/// \brief Post-process the DAG to create cluster edges between instrs that may -/// be fused by the processor into a single operation. -class AArch64MacroFusion : public ScheduleDAGMutation { -public: - AArch64MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - - // For each of the SUnits in the scheduling block, try to fuse the instr in it - // with one in its successors. - for (SUnit &ISU : DAG->SUnits) - scheduleAdjacentImpl(DAG, ISU); - - // Try to fuse the instr in the ExitSU with one in its predecessors. - scheduleAdjacentImpl(DAG, DAG->ExitSU); -} - } // end namespace namespace llvm { std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr; + return createMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h index e5efedd9fbfd9..32d90d4c40d6f 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.h +++ b/lib/Target/AArch64/AArch64MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \fileThis file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the AArch64 definition of the DAG scheduling mutation -// to pair instructions back to back. +/// \file This file contains the AArch64 definition of the DAG scheduling +/// mutation to pair instructions back to back. 
// //===----------------------------------------------------------------------===// -#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops. -//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 9b3899e0681cf..69124dbd0f838 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -469,10 +469,6 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), /*NumOperands*/ 2); } - case TargetOpcode::G_SEQUENCE: - // FIXME: support this, but the generic code is really not going to do - // anything sane. - return getInvalidInstructionMapping(); default: break; } diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index 303398ea0b7f3..5d1608ef04afa 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// The Cortex-A57 is a traditional superscaler microprocessor with a +// The Cortex-A57 is a traditional superscalar microprocessor with a // conservative 3-wide in-order stage for decode and dispatch. Combined with the // much wider out-of-order issue stage, this produced a need to carefully // schedule micro-ops so that all three decoded each cycle are successfully diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 3d737402022d8..0aeb1f3e30584 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -32,8 +32,8 @@ //===----------------------------------------------------------------------===// // Define 0 micro-op types -def FalkorWr_none_1cyc : SchedWriteRes<[]> { - let Latency = 1; +def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; let NumMicroOps = 0; } def FalkorWr_none_3cyc : SchedWriteRes<[]> { @@ -505,7 +505,8 @@ def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, let NumMicroOps = 12; } -// Forwarding logic is modeled for multiply add/accumulate. +// Forwarding logic is modeled for multiply add/accumulate and +// load/store base register increment. 
// ----------------------------------------------------------------------------- def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; @@ -513,9 +514,13 @@ def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; +def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>; +def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>; + // SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() && + MI->getOperand(1).getImm() == 0}]>; def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || MI->getOperand(1).getReg() == AArch64::XZR}]>; @@ -770,84 +775,113 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; - -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; - -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex 
"^LD2i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], - (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], - (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], - (instregex "^LD3Threev(16b|8h|4s)_POST$")>; - -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], - (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, 
FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex 
"^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- @@ -929,87 +963,105 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STPQ(post|pre)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex 
"^STP(D|S)(i)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; - -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt], + (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPQi)>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; -def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], - (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; - -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4(i8|i16|i32|i64)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; -def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], - (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST3Threev2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST3Threev2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], - (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST4Fourv2d)>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
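// Aside on the operand-list shape used throughout this section (illustrative
// only, not part of the patch): in an InstRW list the SchedWrite entries come
// first, one per def, and the SchedRead entries follow, one per register
// source. ReadDefault holds a source slot that needs no special treatment so
// that FalkorReadIncSt lines up with the base-address operand it models, e.g.
//   [FalkorWr_1VSD_1ST_0cyc, ReadDefault /*data*/, FalkorReadIncSt /*base*/]
// and a pair store simply gains a second ReadDefault for its second data
// operand ahead of the FalkorReadIncSt entry.
// (The FIXME above applies to the _POST mapping that follows.)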
-def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], (instrs ST4Fourv2d_POST)>; -def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; -def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)$")>; // FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). -def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions @@ -1033,22 +1085,25 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions @@ -1106,31 +1161,41 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; -def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDNP(W|X)i$")>; -def : 
InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], (instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(W|X)l$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(BB|HH|W|X)i$")>; def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instrs LDPSWi)>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], (instregex "^LDPSW(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instrs LDRSWl)>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- @@ -1178,32 +1243,46 @@ def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HL def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; 
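// The LDAXP/LDXP split above follows the general rule that a two-result load
// lists one SchedWrite per def, so each destination gets its own latency;
// a sketch of the shape (hypothetical names, illustrative only):
//   def : InstRW<[WriteForDst0, WriteForDst1, ReadForBase], (instrs ...)>;
// Here FalkorWr_none_3cyc supplies the second write, covering the high
// register of the pair.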
def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; -def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; - -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; -def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXP(W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXR(B|H|W|X)$")>; + +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXR(B|H|W|X)$")>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; -def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(BB|HH|W|X)i$")>; diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td index 02cccccd3078c..cf4cdabb8cbfc 100644 --- a/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -1374,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln : let Latency = 3; let NumMicroOps = 2; } def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], - (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; + (instregex "LDAX?R(B|H|W|X)")>; +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi], + (instregex "LDAXP(W|X)")>; def KryoWrite_3cyc_LS_LS_401ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { let Latency = 3; let NumMicroOps = 2; @@ -1565,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln : SchedWriteRes<[KryoUnitLS]> { let Latency = 3; let NumMicroOps = 1; } -def : InstRW<[KryoWrite_3cyc_LS_258ln], +def : 
InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi], (instregex "LDXP(W|X)")>; def KryoWrite_3cyc_LS_258_1ln : SchedWriteRes<[KryoUnitLS]> { diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index d4a8cecdb29f1..6660f0babb8a6 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -47,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableCondBrTuning("aarch64-enable-cond-br-tune", + cl::desc("Enable the conditional branch tuning pass"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -429,6 +434,8 @@ bool AArch64PassConfig::addILPOpts() { addPass(createAArch64ConditionalCompares()); if (EnableMCR) addPass(&MachineCombinerID); + if (EnableCondBrTuning) + addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); if (EnableStPairSuppress) diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f0f50f29be0f3..02b12b5e90ca2 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(AArch64CodeGen AArch64AsmPrinter.cpp AArch64CleanupLocalDynamicTLSPass.cpp AArch64CollectLOH.cpp + AArch64CondBrTuning.cpp AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 43a6fa9ce0896..3d075018904c0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -43,26 +43,25 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. 
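// (Illustrative gloss on the columns, not part of the patch: "Offset" is the
// bit position the resolved fixup value is shifted to before being patched
// into the instruction word -- applyFixup() below does
// `Value <<= Info.TargetOffset` -- "Size" is the width of that bit-field,
// and PCRelFlagVal marks the fixup kind as PC-relative.)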
+ // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -72,8 +71,9 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -261,13 +261,15 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con } } -void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + MCContext &Ctx = Asm.getContext(); // Apply any target-specific value adjustments. Value = adjustFixupValue(Fixup, Value, Ctx); @@ -275,7 +277,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. 
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -289,7 +291,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; @@ -539,16 +541,14 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; }; -void ELFAArch64AsmBackend::processFixupValue( - const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + bool &IsResolved) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. For example: diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 6ab2b9ef04598..7494e5decd6f6 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa", "Support SDWA (Sub-DWORD Addressing) extension" >; +def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod", + "HasSDWAOmod", + "true", + "Support OMod with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar", + "HasSDWAScalar", + "true", + "Support scalar register with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst", + "HasSDWASdst", + "true", + "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAMac : SubtargetFeature<"sdwa-mav", + "HasSDWAMac", + "true", + "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension" +>; + +def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc", + "HasSDWAClampVOPC", + "true", + "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension" +>; + def FeatureDPP : SubtargetFeature<"dpp", "HasDPP", "true", @@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, - FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, - FeatureDPP + FeatureScalarStores, FeatureInv2PiInlineImm, + FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP ] >; @@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureSDWA, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, 
FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -449,14 +480,14 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping, def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, - FeatureFastFMAF32, + FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, FeatureLDSBankCount32]>; - + def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, FeatureLDSBankCount32]>; @@ -644,7 +675,11 @@ def isCIVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5586b513b5fca..96f819fd0e684 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3527,18 +3527,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, //===----------------------------------------------------------------------===// SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; + unsigned VReg; + if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); + VReg = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VReg); } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); + VReg = MRI.getLiveInVirtReg(Reg); } - return DAG.getRegister(VirtualRegister, VT); + + if (RawReg) + return DAG.getRegister(VReg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( @@ -3657,6 +3664,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0d066cdbdff4d..a45234e2b39f2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -216,10 +216,25 @@ public: /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise + /// a copy from the register. 
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT, + const SDLoc &SL, + bool RawReg = false) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode())); + } + + // Returns the raw live in register rather than a copy from it. + SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); + } enum ImplicitParameter { FIRST_IMPLICIT, @@ -388,6 +403,8 @@ enum NodeType : unsigned { STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index a01f5d37c7c16..69dc529861729 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, - VI = 1 + VI = 1, + SDWA = 2, + SDWA9 = 3 }; // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -101,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. if (MCOp == -1) diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index e286558ce60d7..bcf89bb78ad66 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -70,6 +70,10 @@ def AMDGPUElseBreakOp : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] >; +def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -179,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; // out = (src1 > src0) ? 1 : 0 def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; +// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own +// nodes in TargetSelectionDAG.td. 
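// ISD::ADDCARRY/SUBCARRY compute (result, carry-out) from (lhs, rhs,
// carry-in), which is what AMDGPUAddeSubeOp above encodes: two results
// (an i32 value and an i1 carry-out) and three operands (i32, i32, and an
// i1 carry-in).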
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>; + +def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>; + def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> ]>; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 790a69b843979..cc56216c355bf 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -29,12 +29,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; const LLT S1= LLT::scalar(1); + const LLT V2S16 = LLT::vector(2, 16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); setAction({G_ADD, S32}, Legal); + setAction({G_AND, S32}, Legal); + + setAction({G_BITCAST, V2S16}, Legal); + setAction({G_BITCAST, 1, S32}, Legal); + + setAction({G_BITCAST, S32}, Legal); + setAction({G_BITCAST, 1, V2S16}, Legal); // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish @@ -61,6 +69,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); + setAction({G_SHL, S32}, Legal); + setAction({G_STORE, S32}, Legal); setAction({G_STORE, 1, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 8d157e2f98f24..ab5abf2039a5b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasScalarStores(false), HasInv2PiInlineImm(false), HasSDWA(false), + HasSDWAOmod(false), + HasSDWAScalar(false), + HasSDWASdst(false), + HasSDWAMac(false), + HasSDWAClampVOPC(false), HasDPP(false), FlatAddressSpace(false), FlatInstOffsets(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 5f4f20316a6ba..2b16289c723ef 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -149,6 +149,11 @@ protected: bool HasScalarStores; bool HasInv2PiInlineImm; bool HasSDWA; + bool HasSDWAOmod; + bool HasSDWAScalar; + bool HasSDWASdst; + bool HasSDWAMac; + bool HasSDWAClampVOPC; bool HasDPP; bool FlatAddressSpace; bool FlatInstOffsets; @@ -431,6 +436,26 @@ public: return HasSDWA; } + bool hasSDWAOmod() const { + return HasSDWAOmod; + } + + bool hasSDWAScalar() const { + return HasSDWAScalar; + } + + bool hasSDWASdst() const { + return HasSDWASdst; + } + + bool hasSDWAMac() const { + return HasSDWAMac; + } + + bool hasSDWAClampVOPC() const { + return HasSDWAClampVOPC; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b644eba536fa4..04fe9f689806c 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -342,6 +342,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(createAMDGPUExternalAAWrapperPass()); } }); + + Builder.addExtension( + PassManagerBuilder::EP_CGSCCOptimizerLate, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + // Add infer address spaces pass to the opt pipeline after inlining + // but before SROA to increase SROA opportunities. + PM.add(createInferAddressSpacesPass()); + }); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 0d6689bd04c4e..88245b01683a5 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -184,9 +184,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -195,8 +195,18 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. + return getHardwareNumberOfRegisters(Vec) >> 3; +} + unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { - return Vector ? 0 : 32; + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { @@ -247,11 +257,11 @@ bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. if (VF == 1) return 1; - // Semi-arbitrary large amount. 
- return 64; + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a60b1bb1b59c7..485e20411ab49 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -75,8 +75,10 @@ public: return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector) const; + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 392e9d89bd9ba..7b8756050b752 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ public: ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -260,6 +262,8 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } + bool isSDWARegKind() const; + bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; } @@ -292,6 +296,8 @@ public: bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -636,6 +642,8 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -993,7 +1001,9 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); - bool validateOperandLimitations(const MCInst &Inst); + bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); + bool validateConstantBusLimitations(const MCInst &Inst); + bool validateEarlyClobberLimitations(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1029,6 +1039,8 @@ public: void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -1042,6 +1054,7 @@ public: AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() 
const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -1243,6 +1256,15 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +bool AMDGPUOperand::isSDWARegKind() const { + if (AsmParser->isVI()) + return isVReg(); + else if (AsmParser->isGFX9()) + return isRegKind(); + else + return false; +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -2083,7 +2105,7 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } -bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { +bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; @@ -2137,6 +2159,60 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { return ConstantBusUseCount <= 1; } +bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { + + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opcode); + + const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if (DstIdx == -1 || + Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) { + return true; + } + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + assert(DstIdx != -1); + const MCOperand &Dst = Inst.getOperand(DstIdx); + assert(Dst.isReg()); + const unsigned DstReg = mc2PseudoReg(Dst.getReg()); + + const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + for (int SrcIdx : SrcIndices) { + if (SrcIdx == -1) break; + const MCOperand &Src = Inst.getOperand(SrcIdx); + if (Src.isReg()) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); + if (isRegIntersect(DstReg, SrcReg, TRI)) { + return false; + } + } + } + + return true; +} + +bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, + const SMLoc &IDLoc) { + if (!validateConstantBusLimitations(Inst)) { + Error(IDLoc, + "invalid operand (violates constant bus restrictions)"); + return false; + } + if (!validateEarlyClobberLimitations(Inst)) { + Error(IDLoc, + "destination must be different than all sources"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2169,9 +2245,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (Result) { default: break; case Match_Success: - if (!validateOperandLimitations(Inst)) { - return Error(IDLoc, - "invalid operand (violates constant bus restrictions)"); + if (!validateInstruction(Inst, IDLoc)) { + return true; } Inst.setLoc(IDLoc); Out.EmitInstruction(Inst, getSTI()); @@ -2554,11 +2629,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if (getParser().parseAbsoluteExpression(Int)) return 
MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -3743,6 +3828,44 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3870,6 +3993,10 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3919,6 +4046,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4475,12 +4604,11 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { - // V_NOP_sdwa_vi has no optional sdwa arguments + // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4490,8 +4618,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, case SIInstrFlags::VOP2: 
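// As in the VOP1 case above, omod is appended whenever the opcode actually
// has an omod operand (getNamedOperandIdx(...) != -1) instead of being gated
// on isGFX9(); only the GFX9 SDWA encodings define omod (see FeatureSDWAOmod
// above), so checking the operand table suffices.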
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - if (isGFX9() && - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); @@ -4501,9 +4628,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: - if (isVI()) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2aca65ac84303..2e96c14eaa320 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -57,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { string OpName = NAME # suffix; } +class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -78,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real <MTBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - Enc64 { + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -97,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> : let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA<list<RegisterClass> vdataList, + list<RegisterClass> vaddrList=[]> { + RegisterClass 
vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); } -class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + (ins)))))); +} + +class getMTBUFAsmOps<int addrKind> { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr<int addrKind> { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo<opName, + (outs vdataClass:$vdata), + getMTBUFIns<addrKindCopy>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, 
$tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, + i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, + i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + +class MTBUF_Store_Pseudo <string opName, + int addrKind, + RegisterClass vdataClass, + list<dag> pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo<opName, + (outs), + getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr<addrKindCopy> { + let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + ValueType store_vt = i32, + SDPatternOperator st = null_frag> { + + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i1:$slc, i1:$tfe))]>, + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def 
TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,22 +1226,98 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; -def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>; -def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; -def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; -def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, 
$vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; +defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; + +multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; +defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; } // End let Predicates = [isGCN] @@ -1224,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>; class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + 
let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si<bits<3> op> { + def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, + Enc64, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { + def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : 
MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 88c92b9582fd0..04308fb3aaf64 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) { MCDisassembler::SoftFail; } +static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, + uint16_t NameIdx) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + if (OpIdx != -1) { + auto I = MI.begin(); + std::advance(I, OpIdx); + MI.insert(I, Op); + } + return OpIdx; +} + static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -106,12 +117,12 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -#define DECODE_SDWA9(DecName) \ -DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) +#define DECODE_SDWA(DecName) \ +DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) -DECODE_SDWA9(Src32) -DECODE_SDWA9(Src16) -DECODE_SDWA9(VopcDst) +DECODE_SDWA(Src32) +DECODE_SDWA(Src16) +DECODE_SDWA(VopcDst) #include "AMDGPUGenDisassemblerTables.inc" @@ -149,6 +160,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, raw_ostream &WS, raw_ostream &CS) const { CommentStream = &CS; + bool IsSDWA = false; // ToDo: AMDGPUDisassembler supports only VI ISA. if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]) @@ -170,10 +182,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); - if (Res) break; + if (Res) { IsSDWA = true; break; } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -200,17 +212,36 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { // Insert dummy unused src2_modifiers. - int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::src2_modifiers); - auto I = MI.begin(); - std::advance(I, Src2ModIdx); - MI.insert(I, MCOperand::createImm(0)); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); } + if (Res && IsSDWA) + Res = convertSDWAInst(MI); + Size = Res ? 
(MaxInstBytesNum - Bytes.size()) : 0; return Res; } +DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1) + // VOPC - insert clamp + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); + if (SDst != -1) { + // VOPC - insert VCC register as sdst + insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + AMDGPU::OpName::sdst); + } else { + // VOP1/2 - insert omod if present in instruction + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + } + } + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -524,8 +555,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW16 || Width == OPW32 || Width == OPW64); - if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); @@ -592,36 +621,43 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, - unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, + unsigned Val) const { using namespace AMDGPU::SDWA; - if (SDWA9EncValues::SRC_VGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_VGPR_MAX) { - return createRegOperand(getVgprClassId(Width), - Val - SDWA9EncValues::SRC_VGPR_MIN); - } - if (SDWA9EncValues::SRC_SGPR_MIN <= Val && - Val <= SDWA9EncValues::SRC_SGPR_MAX) { - return createSRegOperand(getSgprClassId(Width), - Val - SDWA9EncValues::SRC_SGPR_MIN); - } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + return createRegOperand(getVgprClassId(Width), Val); + } + llvm_unreachable("unsupported target"); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { - return decodeSDWA9Src(OPW16, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { + return decodeSDWASrc(OPW16, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { - return decodeSDWA9Src(OPW32, Val); +MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { + return decodeSDWASrc(OPW32, Val); } -MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; + assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] && + "SDWAVopcDst should be present only on GFX9"); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= 
SDWA9EncValues::VOPC_DST_SGPR_MASK; if (Val > AMDGPU::EncValues::SGPR_MAX) { diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 5fa3cf1a223fa..3d71db909e20d 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -65,6 +65,8 @@ public: uint64_t Inst, uint64_t Address) const; + DecodeStatus convertSDWAInst(MCInst &MI) const; + MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; @@ -105,10 +107,10 @@ public: MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; - MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; - MCOperand decodeSDWA9Src16(unsigned Val) const; - MCOperand decodeSDWA9Src32(unsigned Val) const; - MCOperand decodeSDWA9VopcDst(unsigned Val) const; + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc16(unsigned Val) const; + MCOperand decodeSDWASrc32(unsigned Val) const; + MCOperand decodeSDWAVopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 98eda288bcacb..edca6fcd812c8 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -31,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -40,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -68,7 +72,10 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? - bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat accesses.
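The seg field above now derives its value from the pseudo's is_flat_global / is_flat_scratch flags instead of being left uninitialized. A minimal C++ sketch of the same selection logic (the helper name is illustrative, not part of this commit):

    // Mirrors the TableGen !if chain: 00 = flat, 01 = scratch,
    // 10 = global, 11 = reserved.
    static unsigned getSegEncoding(bool IsFlatGlobal, bool IsFlatScratch) {
      if (IsFlatGlobal)
        return 0b10; // global aperture
      if (IsFlatScratch)
        return 0b01; // scratch aperture
      return 0b00;   // plain flat addressing
    }

The result feeds the Inst{15-14} bits in the encoding hunk that follows.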
@@ -81,7 +88,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // Only valid on GFX9+ let Inst{12-0} = offset; let Inst{13} = lds; - let Inst{15-14} = 0; + let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; @@ -106,6 +113,16 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, let mayLoad = 1; } +class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> : + FLAT_Load_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, bit HasSignedOffset = 0> : FLAT_Pseudo< opName, @@ -119,6 +136,16 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let has_vdst = 0; } +class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> : + FLAT_Store_Pseudo<opName, regClass, 1> { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, @@ -306,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isCI +let SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -557,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, 
GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index b84640230eeeb..7c31c8e397ba7 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -118,6 +123,16 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? "offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { @@ -379,7 +412,6 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); printImmediate16(Lo16, STI, O); } diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c8094c4b840a1..7bbf99a85f409 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ private: void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ private: void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void 
printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,6 +88,10 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 0a9c2b94c1eee..2b408ff10caae 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -30,14 +30,9 @@ public: unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { - MCValue Res; - - // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are - // used for long branches, this function will be called with - // IsResolved = false and Value set to some pre-computed value. In - // the example above, the value would be: - // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction. - // This is not what we want. We just want the expression computation - // only. The reason the MC layer subtracts the current offset from the - // expression is because the fixup is of kind FK_PCRel_4. - // For these scenarios, evaluateAsValue gives us the computation that we - // want. - if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) && - Res.isAbsolute()) { - Value = Res.getConstant(); - IsResolved = true; - - } - if (IsResolved) - Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. 
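With processFixupValue removed, applyFixup now calls adjustFixupValue itself and patches the fragment bytes directly through the new MutableArrayRef<char> parameter. A hypothetical helper capturing the shape of the byte-masking loop referenced in the next hunk (assumed; the loop body itself is outside this diff's context lines):

    #include "llvm/ADT/ArrayRef.h"
    #include <cassert>
    #include <cstdint>

    // OR each byte of the shifted fixup value into the instruction bytes.
    static void patchBytes(llvm::MutableArrayRef<char> Data, uint32_t Offset,
                           uint64_t Value, unsigned NumBytes) {
      assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
    }

Passing Data as a MutableArrayRef is what lets the assert below check against Data.size() rather than a separately threaded DataSize.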
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); uint32_t Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index a856b17a228f0..1b062064ace1c 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,15 +52,15 @@ public: return 0; } - virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } - virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { return 0; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e02acf516c0db..376c9bfe5ccf2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -70,13 +70,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -252,9 +252,7 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { uint16_t Lo16 = static_cast<uint16_t>(Imm); - assert(Lo16 == static_cast<uint16_t>(Imm >> 16)); uint32_t Encoding = getLit16Encoding(Lo16, STI); - assert(Encoding != 255 && "packed constants can only be inline immediates"); return Encoding; } default: @@ -328,11 +326,11 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; - + uint64_t RegEnc = 0; const MCOperand &MO = MI.getOperand(OpNo); @@ -347,9 +345,9 @@ SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, } unsigned -SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { 
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -365,6 +363,25 @@ SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, return RegEnc; } +static bool needsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + case MCExpr::SymbolRef: + return true; + case MCExpr::Binary: { + auto *BE = cast<MCBinaryExpr>(Expr); + if (BE->getOpcode() == MCBinaryExpr::Sub) + return false; + return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); + } + case MCExpr::Unary: + return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); + case MCExpr::Target: + case MCExpr::Constant: + return false; + } + llvm_unreachable("invalid kind"); +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, @@ -373,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, return MRI.getEncodingValue(MO.getReg()); if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + // FIXME: Whether this expression is PCRel or not should not depend on what + // the expression looks like. Given that this is just a general expression, + // it should probably be FK_Data_4 and whatever is producing + // + // s_add_u32 s2, s2, (extern_const_addrspace+16 + // + // And expecting a PCRel should instead produce + // + // .Ltmp1: + // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 MCFixupKind Kind; - if (Expr && Expr->getSymbol().isExternal()) - Kind = FK_Data_4; - else + if (needsPCRel(MO.getExpr())) Kind = FK_PCRel_4; + else + Kind = FK_Data_4; Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); } diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index f6f2582aa11b3..d30d1d382588c 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -80,7 +80,7 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : ProcessorModel<"gfx600", SIFullSpeedModel, +def : ProcessorModel<"gfx600", SIFullSpeedModel, [FeatureISAVersion6_0_0]>; def : ProcessorModel<"SI", SIFullSpeedModel, @@ -95,7 +95,7 @@ def : ProcessorModel<"gfx601", SIQuarterSpeedModel, [FeatureISAVersion6_0_1] >; -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; def : ProcessorModel<"verde", SIQuarterSpeedModel, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index c55878f8bff0f..215791f4f92dd 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -584,23 +584,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + 
return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 5cd90323ff67b..3915c0e5bdbed 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,9 +118,9 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, - // Operand for GFX9 SDWA instructions - OPERAND_SDWA9_SRC, - OPERAND_SDWA9_VOPC_DST, + // Operand for SDWA instructions + OPERAND_SDWA_SRC, + OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 5f5f25103c027..0a795c99f94e5 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -174,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -214,6 +239,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? 
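// A hypothetical before/after for tryChangeVGPRtoSGPRinCopy (MIR shorthand,
// not taken from this diff):
//   %1:vgpr_32 = COPY %0:sgpr_32   ; every non-debug use of %1 is in the same
//                                  ; block and accepts an SGPR operand
// becomes, after MRI.setRegClass on the destination,
//   %1:sgpr_32 = COPY %0:sgpr_32
// keeping the value on the scalar unit instead of forcing a VALU round trip.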
unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -563,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index e10f1ed3762e8..f391f67a241f1 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -166,6 +167,8 @@ static bool updateOperand(FoldCandidate &Fold, if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -470,7 +473,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -921,12 +924,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // level. bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index b1bd14e421f02..08a64de385018 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Use relocations to get the pointer, and setup the other bits manually. 
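// Context sketch (summarizing the surrounding prologue code, not added by
// the patch): the 128-bit scratch resource descriptor is assembled from two
// halves:
//   Rsrc01 - the 64-bit base, copied from (or loaded via) the implicit
//            buffer pointer user SGPRs,
//   Rsrc23 - the constant descriptor words from TII->getScratchRsrcWords23(),
// so only the base pointer depends on the preloaded registers.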
uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); @@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) @@ -417,14 +417,69 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (MFI->isEntryFunction()) + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); + return; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc DL; + + bool NeedFP = hasFP(MF); + if (NeedFP) { + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + uint32_t NumBytes = MFI.getStackSize(); + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (FuncInfo->isEntryFunction()) + return; + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + if (StackPtrReg == AMDGPU::NoRegister) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + + // FIXME: Clarify distinction between no set SP and SP. For callee functions, + // it's really whether we need SP to be accurate or not. 
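// Worked example for the scaling below (numbers assumed for illustration):
// a 1024-byte frame on a wavefront-64 target reserves 1024 * 64 = 65536
// bytes of scratch, so the prologue adds 0x10000 to the stack pointer SGPR
// and this epilogue subtracts the same amount. The multiply by
// ST.getWavefrontSize() reflects that scratch is allocated per-wave while
// frame sizes are counted per-lane.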
+ + if (NumBytes != 0 && hasSP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(NumBytes * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } } static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { @@ -557,3 +612,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); } } + +bool SIFrameLowering::hasFP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + // TODO: Still want to eliminate sometimes. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // XXX - Is this only called after frame is finalized? Should be able to check + // frame size. + return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI); +} + +bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + // All stack operations are relative to the frame offset SGPR. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasCalls() || MFI.hasVarSizedObjects(); +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index e17adbe273614..d4dfa1c7eaa86 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -60,6 +60,10 @@ private: /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; + +public: + bool hasFP(const MachineFunction &MF) const override; + bool hasSP(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 441f1ef4bd04c..d0f4e00994de1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -211,6 +211,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UADDO, MVT::i32, Legal); setOperationAction(ISD::USUBO, MVT::i32, Legal); + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, @@ -471,6 +474,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::ADDCARRY); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::SUBCARRY); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -1061,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
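The ADDCARRY/SUBCARRY legality and the ADD/SUB combines registered earlier in this file feed the performAddCombine/performSubCombine implementations further down. An illustrative source-level pattern they target (plain C++; the 64-bit add split into 32-bit halves is an assumed motivating case, not code from this commit):

    #include <cstdint>

    // lo/hi halves of a 64-bit add; the carry comes from an unsigned compare.
    static uint64_t add64ViaCarry(uint32_t al, uint32_t ah,
                                  uint32_t bl, uint32_t bh) {
      uint32_t lo = al + bl;            // ISD::ADD
      uint32_t carry = lo < al ? 1 : 0; // ISD::SETCC, zero-extended
      uint32_t hi = ah + bh + carry;    // add x, (zext setcc) -> ISD::ADDCARRY
      return (uint64_t(hi) << 32) | lo;
    }

After the combine, the compare result travels as the i1 carry operand of ADDCARRY instead of a materialized 0/1 value, which maps naturally onto the VALU add-with-carry instructions.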
@@ -1227,7 +1234,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, } } - if (NeedSP){ + if (NeedSP) { unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); @@ -2998,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: @@ -3288,6 +3299,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3313,7 +3326,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3328,6 +3340,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3393,10 +3428,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { @@ -3463,33 +3498,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3505,7 +3513,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3514,6 +3521,75 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc + }; + + assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } @@ -4839,6 +4915,103 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // add x, zext (setcc) => addcarry x, 0, setcc + // add x, sext (setcc) => subcarry x, 0, setcc + unsigned Opc = LHS.getOpcode(); + if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || + Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) + std::swap(RHS, LHS); + + Opc = RHS.getOpcode(); + switch (Opc) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ANY_EXTEND: { + auto Cond = RHS.getOperand(0); + if (Cond.getOpcode() != ISD::SETCC && + Cond.getOpcode() != AMDGPUISD::FP_CLASS) + break; + SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); + SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY; + return DAG.getNode(Opc, SL, VTList, Args); + } + case ISD::ADDCARRY: { + // add x, (addcarry y, 0, cc) => addcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) break; + SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); + } + } + return SDValue(); +} + +SDValue SITargetLowering::performSubCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + unsigned Opc = LHS.getOpcode(); + if (Opc != ISD::SUBCARRY) + std::swap(RHS, LHS); + + if (LHS.getOpcode() == ISD::SUBCARRY) { + // sub (subcarry x, 0, cc), y => subcarry x, y, cc + auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; + return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); + } + return SDValue(); +} + +SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + auto C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C || C->getZExtValue() != 0) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + + // addcarry (add x, y), 0, cc => addcarry x, y, cc + // subcarry (sub x, y), 0, cc => subcarry x, y, cc + unsigned LHSOpc = LHS.getOpcode(); + unsigned Opc = N->getOpcode(); + if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || + (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { + SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); + } + return SDValue(); +} + SDValue SITargetLowering::performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) @@ -5009,6 +5182,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::ADD: + return performAddCombine(N, DCI); + case ISD::SUB: + return performSubCombine(N, DCI); + case ISD::ADDCARRY: + case ISD::SUBCARRY: + return performAddCarrySubCarryCombine(N, DCI); case ISD::FADD: return performFAddCombine(N, DCI); case ISD::FSUB: @@ -5425,15 +5605,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast<RegisterSDNode>(VReg)->getReg(), VT); -} - //===----------------------------------------------------------------------===// // SI Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e2ec40b224cd..24f88e632d38e 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -108,6 +108,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned getFusedOpcode(const 
SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -216,8 +219,6 @@ public: void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1097814e99ce2..c9b48fea7225e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2108,7 +2108,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + if (!MO.isImm() || + OperandType < AMDGPU::OPERAND_SRC_FIRST || + OperandType > AMDGPU::OPERAND_SRC_LAST) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -2433,8 +2435,73 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify SDWA + if (isSDWA(MI)) { + + if (!ST.hasSDWA()) { + ErrInfo = "SDWA is not supported on this target"; + return false; + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + if ( DstIdx == -1) + DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst); + + const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; + + for (int OpIdx: OpIndicies) { + if (OpIdx == -1) + continue; + const MachineOperand &MO = MI.getOperand(OpIdx); + + if (!ST.hasSDWAScalar()) { + // Only VGPRS on VI + if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { + ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; + return false; + } + } else { + // No immediates on GFX9 + if (!MO.isReg()) { + ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; + return false; + } + } + } + + if (!ST.hasSDWAOmod()) { + // No omod allowed on VI + const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (OMod != nullptr && + (!OMod->isImm() || OMod->getImm() != 0)) { + ErrInfo = "OMod not allowed in SDWA instructions on VI"; + return false; + } + } + + uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); + if (isVOPC(BasicOpcode)) { + if (!ST.hasSDWASdst() && DstIdx != -1) { + // Only vcc allowed as dst on VI for VOPC + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { + ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; + return false; + } + } else if (!ST.hasSDWAClampVOPC()) { + // No clamp allowed on GFX9 for VOPC + const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); + if (Clamp != nullptr && + (!Clamp->isImm() || Clamp->getImm() != 0)) { + ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; + return false; + } + } + } + } + // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) 
|| isVOPC(MI) || isSDWA(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index f6e5e8883f63c..74b48c7618087 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -814,6 +814,9 @@ namespace AMDGPU { int getSDWAOp(uint16_t Opcode); LLVM_READONLY + int getBasicFromSDWAOp(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 470a47b024433..3b4a8b5d1e817 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -20,6 +20,8 @@ def SIEncodingFamily { int NONE = -1; int SI = 0; int VI = 1; + int SDWA = 2; + int SDWA9 = 3; } //===----------------------------------------------------------------------===// @@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -452,25 +470,25 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } -class SDWA9Src : RegisterOperand<VS_32> { +class SDWASrc : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_SRC"; - let EncoderMethod = "getSDWA9SrcEncoding"; + let OperandType = "OPERAND_SDWA_SRC"; + let EncoderMethod = "getSDWASrcEncoding"; } -def SDWA9Src32 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src32"; +def SDWASrc32 : SDWASrc { + let DecoderMethod = "decodeSDWASrc32"; } -def SDWA9Src16 : SDWA9Src { - let DecoderMethod = "decodeSDWA9Src16"; +def SDWASrc16 : SDWASrc { + let DecoderMethod = "decodeSDWASrc16"; } -def SDWA9VopcDst : VOPDstOperand<SReg_64> { +def SDWAVopcDst : VOPDstOperand<SReg_64> 
{ let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA9_VOPC_DST"; - let EncoderMethod = "getSDWA9VopcDstEncoding"; - let DecoderMethod = "decodeSDWA9VopcDst"; + let OperandType = "OPERAND_SDWA_VOPC_DST"; + let EncoderMethod = "getSDWAVopcDstEncoding"; + let DecoderMethod = "decodeSDWAVopcDst"; } class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { @@ -525,7 +543,7 @@ def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -545,6 +563,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; @@ -634,13 +655,13 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; -def FPRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithFPInputMods"; +def FPRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def FPRegInputMods : InputMods <FPRegInputModsMatchClass> { +def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } @@ -655,13 +676,13 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { } -def IntRegInputModsMatchClass : AsmOperandClass { - let Name = "RegWithIntInputMods"; +def IntRegSDWAInputModsMatchClass : AsmOperandClass { + let Name = "SDWARegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isRegKind"; + let PredicateMethod = "isSDWARegKind"; } -def IntRegInputMods : InputMods <IntRegInputModsMatchClass> { +def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { let PrintMethod = "printOperandAndIntInputMods"; } @@ -851,10 +872,10 @@ class getVALUDstForVT<ValueType VT> { } // Returns the register class to use for the destination of VOP[12C] -// instructions with GFX9 SDWA extension -class getSDWA9DstForVT<ValueType VT> { +// instructions with SDWA extension +class getSDWADstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 1), - SDWA9VopcDst, // VOPC + SDWAVopcDst, // VOPC VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst } @@ -898,8 +919,8 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } -class getSDWA9SrcForVT <ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +class getSDWASrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); } // Returns the register class to use for sources of VOP3 
instructions for the @@ -995,7 +1016,7 @@ class getSrcMod <ValueType VT> { ); } -// Return type of input modifiers operand specified input operand for SDWA/DPP +// Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, @@ -1004,13 +1025,13 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } -// Return type of input modifiers operand specified input operand for SDWA 9 -class getSrcModSDWA9 <ValueType VT> { +// Return type of input modifiers operand specified input operand for SDWA +class getSrcModSDWA <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, !if(!eq(VT.Value, f32.Value), 1, !if(!eq(VT.Value, f64.Value), 1, 0))); - Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); + Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1141,36 +1162,12 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, /* endif */))); } -class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { - dag ret = !if(!eq(NumSrcArgs, 0), - // VOP1 without input operands (V_NOP) - (ins), - !if(!eq(NumSrcArgs, 1), - // VOP1_SDWA - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel), - !if(!eq(NumSrcArgs, 2), - !if(!eq(DstVT.Size, 1), - // VOPC_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel)), - (ins)/* endif */))); -} -// Ins for GFX9 SDWA -class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, - bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, - ValueType DstVT> { +// Ins for SDWA +class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1178,31 +1175,31 @@ class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArg !if(!eq(NumSrcArgs, 1), // VOP1 !if(!eq(HasSDWAOMod, 0), - // VOP1_SDWA9 without omod + // VOP1_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), - // VOP1_SDWA9 with omod + // VOP1_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel)), !if(!eq(NumSrcArgs, 2), !if(!eq(DstVT.Size, 1), - // VOPC_SDWA9 + // VOPC_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA9 + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA !if(!eq(HasSDWAOMod, 0), - // VOP2_SDWA9 without omod + // VOP2_SDWA without omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, 
src1_sel:$src1_sel), - // VOP1_SDWA9 with omod + // VOP2_SDWA with omod (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, omod:$omod, @@ -1220,12 +1217,12 @@ class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { (outs)); // V_NOP } -// Outs for GFX9 SDWA -class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { +// Outs for SDWA +class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), - (outs DstRCSDWA9:$sdst), - (outs DstRCSDWA9:$vdst)), + (outs DstRCSDWA:$sdst), + (outs DstRCSDWA:$vdst)), (outs)); // V_NOP } @@ -1387,8 +1384,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; - field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1396,19 +1392,15 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; - field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; - field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; - field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; - field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; + field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; - field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; - field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; - field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1457,8 +1449,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; - field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret; + field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1471,11 +1462,9 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0ModSDWA, Src1ModSDWA, + HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; - field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, - HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, - DstVT>.ret; + field string Asm32 = 
getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; @@ -1628,13 +1617,13 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } -// Maps ordinary instructions to their SDWA GFX9 counterparts -def getSDWA9Op : InstrMapping { +// Maps SDWA instructions to their ordinary counterparts +def getBasicFromSDWAOp : InstrMapping { let FilterClass = "VOP"; let RowFields = ["OpName"]; let ColFields = ["AsmVariantName"]; - let KeyCol = ["Default"]; - let ValueCols = [["SDWA9"]]; + let KeyCol = ["SDWA"]; + let ValueCols = [["Default"]]; } def getMaskedMIMGOp : InstrMapping { @@ -1669,7 +1658,9 @@ def getMCOpcodeGen : InstrMapping { let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], - [!cast<string>(SIEncodingFamily.VI)]]; + [!cast<string>(SIEncodingFamily.VI)], + [!cast<string>(SIEncodingFamily.SDWA)], + [!cast<string>(SIEncodingFamily.SDWA9)]]; } // Get equivalent SOPK instruction. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 18b197ddb7ae7..3203c38dae344 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); @@ -86,6 +86,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; + StackPtrOffsetReg = AMDGPU::SGPR32; return; } @@ -150,7 +151,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -203,11 +204,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fdb8caac6f21..05aa249584bf1 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned StackPtrOffsetReg; // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. 
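The renamed query/allocate pair above is meant to be used together during calling-convention lowering. A minimal sketch of the expected call sequence for a Mesa GFX shader follows; this is illustrative only — the matching SIISelLowering.cpp hunk is not shown in this excerpt, and the variable names (Info, MF, CCInfo, TRI) are assumptions:

    // Sketch: reserve the implicit buffer pointer in two user SGPRs and make
    // it live-in, mirroring the old PrivateMemoryPtr path.
    if (Info->hasImplicitBufferPtr()) {
      unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI);
      MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SReg_64RegClass);
      CCInfo.AllocateReg(ImplicitBufferPtrReg);
    }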
@@ -179,7 +179,7 @@ private: // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -236,7 +236,7 @@ public: unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -341,8 +341,8 @@ public: return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -396,8 +396,8 @@ public: return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index f4ddf1891683b..4ac23ef03cb32 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -67,9 +67,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); - bool isConvertibleToSDWA(const MachineInstr &MI) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI) const; + void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -224,7 +224,7 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { static bool isSubregOf(const MachineOperand &SubReg, const MachineOperand &SuperReg, const TargetRegisterInfo *TRI) { - + if (!SuperReg.isReg() || !SubReg.isReg()) return false; @@ -557,7 +557,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src0->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -590,7 +590,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { break; MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - + if (TRI->isPhysicalRegister(Src1->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -607,16 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, + const SISubtarget &ST) const { // Check if this instruction has opcode that supports SDWA - unsigned Opc = MI.getOpcode(); - if (AMDGPU::getSDWAOp(Opc) != -1) - return true; - int Opc32 = AMDGPU::getVOPe32(Opc); - if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) - return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && - !TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - return false; + int Opc = MI.getOpcode(); + if (AMDGPU::getSDWAOp(Opc) == -1) + Opc = AMDGPU::getVOPe32(Opc); + + if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + 
    return false;
+
+  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+    return false;
+
+  if (TII->isVOPC(Opc)) {
+    if (!ST.hasSDWASdst()) {
+      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+      if (SDst && SDst->getReg() != AMDGPU::VCC)
+        return false;
+    }
+
+    if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+      return false;
+
+  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+    return false;
+  }
+
+  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+                           Opc == AMDGPU::V_MAC_F32_e32))
+    return false;
+
+  return true;
 }
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -641,6 +663,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
   if (Dst) {
     assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
     SDWAInst.add(*Dst);
+  } else {
+    Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+    assert(Dst &&
+           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+    SDWAInst.add(*Dst);
   }
 
   // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
@@ -677,9 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     SDWAInst.add(*Src2);
   }
 
-  // Initialize clamp.
+  // Copy clamp if present, initialize otherwise
   assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
-  SDWAInst.addImm(0);
+  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+  if (Clamp) {
+    SDWAInst.add(*Clamp);
+  } else {
+    SDWAInst.addImm(0);
+  }
+
+  // Copy omod if present, initialize otherwise if needed
+  MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+  if (OMod) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
+    SDWAInst.add(*OMod);
+  } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+    SDWAInst.addImm(0);
+  }
 
   // Initialize dst_sel and dst_unused if present
   if (Dst) {
@@ -733,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
 }
 
 // If an instruction was converted to SDWA it should not have immediates or SGPR
-// operands. Copy its scalar operands into VGPRs.
+// operands (one SGPR operand is allowed on GFX9). Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); - for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - MachineOperand &Op = MI.getOperand(I); + unsigned ConstantBusCount = 0; + for (MachineOperand &Op: MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; + + unsigned I = MI.getOperandNo(&Op); if (Desc.OpInfo[I].RegClass == -1 || !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass))) continue; + + if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && + TRI->isSGPRReg(*MRI, Op.getReg())) { + ++ConstantBusCount; + continue; + } + unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), VGPR); @@ -758,22 +808,20 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const { bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (!ST.hasSDWA() || - !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9 + if (!ST.hasSDWA()) return false; - } MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - + // Find all SDWA operands in MF. matchSDWAOperands(MF); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -788,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { bool Ret = !ConvertedInstructions.empty(); while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val()); + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); return Ret; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index b611f28fcabdf..ef6ad4ad0c8f3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1044,18 +1044,29 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned CarryOut = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned ScaledReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - // XXX - Should this use a vector shift? - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) - .addImm(Log2_32(ST.getWavefrontSize())); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg, RegState::Kill); // TODO: Fold if use instruction is another add of a constant. 
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) - .addReg(CarryOut, RegState::Define | RegState::Dead) - .addImm(Offset) - .addReg(ScaledReg, RegState::Kill); + if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addImm(Offset) + .addReg(ScaledReg, RegState::Kill); + } else { + unsigned ConstOffsetReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addReg(ConstOffsetReg, RegState::Kill) + .addReg(ScaledReg, RegState::Kill); + } MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); } @@ -1341,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 8fed6d5f9710f..600cc886cb595 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -197,12 +197,13 @@ public: WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c5f121757e623..96a18544f02ac 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. 
return true; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f581e69980c79..26515b27bb77d 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -538,6 +538,27 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { + + if (Reg0 == Reg1) { + return true; + } + + unsigned SubReg0 = TRI->getSubReg(Reg0, 1); + if (SubReg0 == 0) { + return TRI->getSubRegIndex(Reg1, Reg0) > 0; + } + + for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { + if (isRegIntersect(Reg1, SubReg0, TRI)) { + return true; + } + SubReg0 = TRI->getSubReg(Reg0, Idx); + } + + return false; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index eff0230d21f57..936e4921a7097 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -271,6 +271,9 @@ bool isGFX9(const MCSubtargetInfo &STI); /// \brief Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); +/// \brief Is there any intersection between registers +bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); + /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 95b5ef0a49dba..96b33c373f052 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -93,11 +93,6 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } -class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP1"; -} - class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -117,7 +112,6 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -274,12 +268,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); - let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; @@ -545,8 +537,8 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real 
<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; // For now left dpp only for asm/dasm // TODO: add corresponding pseudo diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 657cacaa792ca..7b9bc71ad4c77 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -114,11 +114,6 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } -class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOP2"; -} - class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -139,7 +134,6 @@ multiclass VOP2Inst <string opName, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; def _sdwa : VOP2_SDWA_Pseudo <opName, P>; - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>; } multiclass VOP2bInst <string opName, @@ -156,10 +150,6 @@ multiclass VOP2bInst <string opName, def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } - - def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; - } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -221,17 +211,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - VGPR_32:$src2, // stub argument - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; @@ -289,15 +275,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - clampmod:$clamp, omod:$omod, - dst_sel:$dst_sel, dst_unused:$dst_unused, - src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, @@ -326,6 +307,8 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { @@ -335,6 +318,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; + let HasSDWA9 = 0; } 
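Setting these two flags on VOP_READLANE/VOP_WRITELANE above is the per-profile opt-out from the extended encodings; the VOP_NO_EXT wrapper applied to the VOP2 instructions below is shorthand for the same idea. A sketch of the idiom (hypothetical def, not part of this patch):

    // TableGen sketch: a profile that must produce neither the VI SDWA/DPP
    // variants nor the GFX9 SDWA real encoding.
    def VOP_EXAMPLE_NO_EXT : VOPProfile<[i32, i32, i32, untyped]> {
      let HasExt = 0;    // no _sdwa/_dpp pseudo or VI real encodings
      let HasSDWA9 = 0;  // no _sdwa_gfx9 real encoding
    }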
//===----------------------------------------------------------------------===// @@ -397,20 +382,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; } // End isConvergent = 1 -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; -defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; +defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; } // End SubtargetPredicate = isGCN +def : Pat< + (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), + (V_ADDC_U32_e64 $src0, $src1, $src2) +>; + +def : Pat< + (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), + (V_SUBB_U32_e64 $src0, $src1, $src2) +>; // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -728,8 +722,8 @@ multiclass VOP2_SDWA_Real <bits<6> op> { multiclass VOP2_SDWA9_Real <bits<6> op> { def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2be_Real_e32e64_vi <bits<6> op> : diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index cd347b86d3050..f3482a22d5dcd 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -113,11 +113,6 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } -class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - VOP_SDWA9_Pseudo <OpName, P, pattern> { - let AsmMatchConverter = "cvtSdwaVOPC"; -} - // This class is used only with VOPC 
instructions. Use $sdst for out operand class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl { @@ -189,13 +184,6 @@ multiclass VOPC_Pseudos <string opName, let isConvergent = DefExec; let isCompare = 1; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = P.Schedule; - let isConvergent = DefExec; - let isCompare = 1; - } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -540,14 +528,12 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, - Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, - src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; - //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -580,12 +566,6 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { let SchedRW = p.Schedule; let isConvergent = DefExec; } - - def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> { - let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let SchedRW = p.Schedule; - let isConvergent = DefExec; - } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -954,8 +934,8 @@ multiclass VOPC_Real_vi <bits<10> op> { VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def _sdwa_gfx9 : - VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>, - VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, + VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), !cast<Instruction>(NAME#"_e32_vi")> { diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 4da654f84f9d1..e386f21c2ba49 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -232,11 +232,11 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 - let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0) - let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -245,8 +245,8 @@ class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(P.HasOpSel, src0_modifiers{3}, 0); // 
op_sel_hi(0) - let Inst{60} = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) @@ -300,6 +300,19 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + // gfx9 SDWA basic encoding class VOP_SDWA9e<VOPProfile P> : Enc64 { bits<9> src0; // {src0_sgpr{0}, src0{7-0}} @@ -353,6 +366,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : string Mnemonic = opName; string AsmOperands = P.AsmSDWA; + string AsmOperands9 = P.AsmSDWA9; let Size = 8; let mayLoad = 0; @@ -372,53 +386,9 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOPProfile Pfl = P; } -// GFX9 adds two features to SDWA: -// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. -// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather -// than VGPRs (at most 1 can be an SGPR); -// b. OMOD is the standard output modifier (result *2, *4, /2) -// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This -// replaces OMOD and the dest fields with SD and SDST (SGPR destination) -// field. -// a. When SD=1, the SDST is used as the destination for the compare result; -// b.when SD=0, VCC is used. 
-// -// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA - -class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>, - VOP <opName>, - SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>, - MnemonicAlias <opName#"_sdwa9", opName> { - - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.AsmSDWA9; - - let Size = 8; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - - let VALU = 1; - let SDWA = 1; - let Uses = [EXEC]; - - let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); - let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, - AMDGPUAsmVariants.Disable); - let DecoderNamespace = "SDWA9"; - - VOPProfile Pfl = P; -} - class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -443,9 +413,9 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } -class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { +class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -458,13 +428,15 @@ class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; + let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + // Copy relevant pseudo op flags - let SubtargetPredicate = ps.SubtargetPredicate; - let AssemblerPredicate = ps.AssemblerPredicate; let AsmMatchConverter = ps.AsmMatchConverter; - let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; - let DecoderNamespace = ps.DecoderNamespace; let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ca68f5d42c32c..6f67183df6a18 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -100,7 +100,8 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", "Enable Reliability, Availability and Serviceability extensions">; def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", "Enable fast computation of positive address offsets">; - +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; // Cyclone has preferred instructions for zeroing VFP registers, which can // execute in 0 cycles. 
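FeatureFuseAES only defines the subtarget bit; the consumer is the shouldScheduleAdjacent() hook in ARMMacroFusion.cpp further down, and per the note in ARMMacroFusion.h it still has to be wired into ARMPassConfig::createMachineScheduler() before it affects scheduling. The attribute string in the def above is what -mattr keys on, so once wired up the fusion could be forced like this (illustrative invocation; triple and file name are placeholders):

    llc -mtriple=armv8-linux-gnueabihf -mattr=+crypto,+fuse-aes aes.ll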
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index f9da036c7e468..90f635c812542 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1504,6 +1504,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
   case ARM::CONSTPOOL_ENTRY: {
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+
     /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
     /// in the function. The first operand is the ID# for this instruction, the
     /// second is the index into the MachineConstantPool that this is, the third
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 8715657ad5e25..e0810c358f2da 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -665,12 +665,14 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
   const ARMFunctionInfo *AFI =
       MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
 
+  // Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM.
+  // In their ARM encoding, they can't be encoded in a conditional form.
+  if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+    return false;
+
   if (AFI->isThumb2Function()) {
     if (getSubtarget().restrictIT())
       return isV8EligibleForIT(&MI);
-  } else { // non-Thumb
-    if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
-      return false;
   }
 
   return true;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 5b2d093e8f0da..2bcc707e9fc3c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2669,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
 // be used to form addressing mode. These wrapped nodes will be selected
 // into MOVi.
-static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   // FIXME there is no actual debug info here
   SDLoc dl(Op);
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   SDValue Res;
+
+  // When generating execute-only code, constant pools must be promoted to the
+  // global data section. It's a bit ugly that we can't share them across basic
+  // blocks, but this way we guarantee that execute-only behaves correctly with
+  // position-independent addressing modes.
+  if (Subtarget->genExecuteOnly()) {
+    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+    auto T = const_cast<Type*>(CP->getType());
+    auto C = const_cast<Constant*>(CP->getConstVal());
+    auto M = const_cast<Module*>(DAG.getMachineFunction().
+ getFunction()->getParent()); + auto GV = new GlobalVariable( + *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + + Twine(AFI->createPICLabelUId()) + ); + SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), + dl, PtrVT); + return LowerGlobalAddress(GA, DAG); + } + if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); @@ -3118,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) { isa<Function>(GV); } +SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } +} + SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7634,21 +7670,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); - case ISD::ConstantPool: - if (Subtarget->genExecuteOnly()) - llvm_unreachable("execute-only should not generate constant pools"); - return LowerConstantPool(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::GlobalAddress: - switch (Subtarget->getTargetTriple().getObjectFormat()) { - default: llvm_unreachable("unknown object format"); - case Triple::COFF: - return LowerGlobalAddressWindows(Op, DAG); - case Triple::ELF: - return LowerGlobalAddressELF(Op, DAG); - case Triple::MachO: - return LowerGlobalAddressDarwin(Op, DAG); - } + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 26da528c19e6d..5044134f5b1e2 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -601,6 +601,8 @@ class InstrItineraryData; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 45471a4e95b39..53db5acbe805c 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -4756,6 +4756,16 @@ def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift", def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift", (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +// Aliases for the above with the .w qualifier +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; 
+def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>; +def : t2InstAlias<"mov${p}.w $Rd, $shift", + (t2MOVsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; +def : t2InstAlias<"movs${p}.w $Rd, $shift", + (t2MOVSsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>; + // ADR w/o the .w suffix def : t2InstAlias<"adr${p} $Rd, $addr", (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ae3bad4076b0..4cb0eca5ee5f8 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -42,6 +42,10 @@ public: private: bool selectImpl(MachineInstr &I) const; + bool selectICmp(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const; + const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; const ARMBaseTargetMachine &TM; @@ -243,6 +247,105 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, return Opc; } +static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // Needs two compares... + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return ARMCC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return ARMCC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return ARMCC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return ARMCC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return ARMCC::HI; + case CmpInst::FCMP_OLT: + return ARMCC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return ARMCC::LS; + case CmpInst::FCMP_ORD: + return ARMCC::VC; + case CmpInst::FCMP_UNO: + return ARMCC::VS; + case CmpInst::FCMP_UGE: + return ARMCC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return ARMCC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return ARMCC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return ARMCC::NE; + case CmpInst::ICMP_UGE: + return ARMCC::HS; + case CmpInst::ICMP_ULT: + return ARMCC::LO; + } +} + +bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, + const ARMBaseInstrInfo &TII, + MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) const { + auto &MBB = *MIB->getParent(); + auto InsertBefore = std::next(MIB->getIterator()); + auto &DebugLoc = MIB->getDebugLoc(); + + // Move 0 into the result register. + auto Mov0I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVi)) + .addDef(MRI.createVirtualRegister(&ARM::GPRRegClass)) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + if (!constrainSelectedInstRegOperands(*Mov0I, TII, TRI, RBI)) + return false; + + // Perform the comparison. + auto LHSReg = MIB->getOperand(2).getReg(); + auto RHSReg = MIB->getOperand(3).getReg(); + assert(MRI.getType(LHSReg) == MRI.getType(RHSReg) && + MRI.getType(LHSReg).getSizeInBits() == 32 && + MRI.getType(RHSReg).getSizeInBits() == 32 && + "Unsupported types for comparison operation"); + auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPrr)) + .addUse(LHSReg) + .addUse(RHSReg) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Move 1 into the result register if the flags say so. 
+ auto ResReg = MIB->getOperand(0).getReg(); + auto Cond = + static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate()); + auto ARMCond = getComparePred(Cond); + if (ARMCond == ARMCC::AL) + return false; + + auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCi)) + .addDef(ResReg) + .addUse(Mov0I->getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCond, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; +} + bool ARMInstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -343,6 +446,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } + case G_ICMP: + return selectICmp(MIB, TII, MRI, TRI, RBI); case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index a706079d98662..5873c7fb38729 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -86,6 +86,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_CONSTANT, s32}, Legal); + setAction({G_ICMP, s1}, Legal); + for (auto Ty : {s8, s16}) + setAction({G_ICMP, 1, Ty}, WidenScalar); + for (auto Ty : {s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); + if (!ST.useSoftFloat() && ST.hasVFP2()) { setAction({G_FADD, s32}, Legal); setAction({G_FADD, s64}, Legal); diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp new file mode 100644 index 0000000000000..1b6e97c28d453 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.cpp @@ -0,0 +1,57 @@ +//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "ARMMacroFusion.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI); + + // Assume wildcards for unspecified instrs. + unsigned FirstOpcode = + FirstMI ? FirstMI->getOpcode() + : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); + + if (ST.hasFuseAES()) + // Fuse AES crypto operations. + switch(SecondOpcode) { + // AES encode. + case ARM::AESMC : + return FirstOpcode == ARM::AESE || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + // AES decode. 
+ case ARM::AESIMC: + return FirstOpcode == ARM::AESD || + FirstOpcode == ARM::INSTRUCTION_LIST_END; + } + + return false; +} + +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/lib/Target/ARM/ARMMacroFusion.h b/lib/Target/ARM/ARMMacroFusion.h new file mode 100644 index 0000000000000..1e4fc6687eae8 --- /dev/null +++ b/lib/Target/ARM/ARMMacroFusion.h @@ -0,0 +1,24 @@ +//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the ARM definition of the DAG scheduling mutation +/// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createARMMacroFusionDAGMutation()); +/// to ARMPassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation(); + +} // llvm diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index f59b075e6dd9a..2350d0c6ef69e 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -255,6 +255,16 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; + case G_ICMP: { + LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); + (void)Ty2; + assert(Ty2.getSizeInBits() == 32 && "Unsupported size for G_ICMP"); + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr, + &ARM::ValueMappings[ARM::GPR3OpsIdx], + &ARM::ValueMappings[ARM::GPR3OpsIdx]}); + break; + } case G_MERGE_VALUES: { // We only support G_MERGE_VALUES for creating a double precision floating // point value out of two GPRs. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index af682dd8321cf..d890d0fa777e8 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -285,6 +285,10 @@ protected: /// HasFPAO - if true, processor does positive address offset computation faster bool HasFPAO = false; + /// HasFuseAES - if true, processor executes back to back AES instruction + /// pairs faster. + bool HasFuseAES = false; + /// If true, if conversion may decide to leave some instructions unpredicated. bool IsProfitableToUnpredicate = false; @@ -561,6 +565,10 @@ public: bool hasD16() const { return HasD16; } bool hasFullFP16() const { return HasFullFP16; } + bool hasFuseAES() const { return HasFuseAES; } + /// \brief Return true if the CPU supports any kind of instruction fusion. 
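ARMMacroFusion's shouldScheduleAdjacent, shown above, answers two questions with one predicate: given a concrete pair, should the two instructions be glued back to back, and given only the second instruction (first unspecified), could it be part of any fused pair at all. A standalone model of that wildcard convention, with hypothetical opcode tags standing in for the real ones:

#include <cstdio>

// Hypothetical opcode tags; WILDCARD plays the role of
// INSTRUCTION_LIST_END when the first instruction is unspecified.
enum Opcode { AESE, AESMC, AESD, AESIMC, ADD, WILDCARD };

static bool shouldFuse(Opcode First, Opcode Second) {
  switch (Second) {
  case AESMC:  // AES encode: wants AESE immediately before it
    return First == AESE || First == WILDCARD;
  case AESIMC: // AES decode: wants AESD immediately before it
    return First == AESD || First == WILDCARD;
  default:
    return false;
  }
}

int main() {
  std::printf("%d %d %d\n",
              shouldFuse(AESE, AESMC),     // 1: concrete fusable pair
              shouldFuse(ADD, AESMC),      // 0: wrong producer
              shouldFuse(WILDCARD, AESIMC)); // 1: AESIMC can be fused
  return 0;
}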
+ bool hasFusion() const { return hasFuseAES(); } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c0506cfda6129..eb71e557ec917 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -17,6 +17,7 @@ #include "ARMRegisterBankInfo.h" #endif #include "ARMSubtarget.h" +#include "ARMMacroFusion.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" #include "ARMTargetTransformInfo.h" @@ -394,6 +395,9 @@ public: createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } @@ -401,6 +405,9 @@ public: createPostMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMI *DAG = createGenericSchedPostRA(C); // add DAG Mutations here. + const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>(); + if (ST.hasFusion()) + DAG->addMutation(createARMMacroFusionDAGMutation()); return DAG; } diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 19fba3033bb2b..891b5c60e1fd6 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6860,6 +6860,17 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { bool ARMAsmParser::processInstruction(MCInst &Inst, const OperandVector &Operands, MCStreamer &Out) { + // Check if we have the wide qualifier, because if it's present we + // must avoid selecting a 16-bit thumb instruction. + bool HasWideQualifier = false; + for (auto &Op : Operands) { + ARMOperand &ARMOp = static_cast<ARMOperand&>(*Op); + if (ARMOp.isToken() && ARMOp.getToken() == ".w") { + HasWideQualifier = true; + break; + } + } + switch (Inst.getOpcode()) { // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: @@ -6939,8 +6950,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // Select the narrow version if the immediate will fit. if (Inst.getOperand(1).getImm() > 0 && Inst.getOperand(1).getImm() <= 0xff && - !(static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w")) + !HasWideQualifier) Inst.setOpcode(ARM::tLDRpci); else Inst.setOpcode(ARM::t2LDRpci); @@ -6971,10 +6981,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, else if (Inst.getOpcode() == ARM::t2LDRConstPool) TmpInst.setOpcode(ARM::t2LDRpci); const ARMOperand &PoolOperand = - (static_cast<ARMOperand &>(*Operands[2]).isToken() && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ? - static_cast<ARMOperand &>(*Operands[4]) : - static_cast<ARMOperand &>(*Operands[3]); + (HasWideQualifier ? + static_cast<ARMOperand &>(*Operands[4]) : + static_cast<ARMOperand &>(*Operands[3])); const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm(); // If SubExprVal is a constant we may be able to use a MOV if (isa<MCConstantExpr>(SubExprVal) && @@ -8117,8 +8126,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(5).getReg() == (inITBlock() ? 
0 : ARM::CPSR) && - !(static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8152,7 +8160,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg()) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8186,7 +8195,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, bool isNarrow = false; if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && - inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi)) + inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi) && + !HasWideQualifier) isNarrow = true; MCInst TmpInst; unsigned newOpc; @@ -8415,10 +8425,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (Inst.getOperand(2).isImm() && (unsigned)Inst.getOperand(2).getImm() > 255) || - ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != 0)) || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) || + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? @@ -8447,8 +8455,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } if (!Transform || Inst.getOperand(5).getReg() != 0 || - (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) + HasWideQualifier) break; MCInst TmpInst; TmpInst.setOpcode(ARM::tADDhirr); @@ -8568,11 +8575,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (Inst.getOperand(1).isImm() && (unsigned)Inst.getOperand(1).getImm() <= 255) && - ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && - Inst.getOperand(4).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(4).getReg() == 0)) && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; TmpInst.setOpcode(ARM::tMOVi8); @@ -8593,8 +8597,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; TmpInst.setOpcode(Inst.getOperand(4).getReg() ? 
ARM::tMOVSr : ARM::tMOVr); @@ -8616,8 +8619,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && Inst.getOperand(2).getImm() == 0 && - (!static_cast<ARMOperand &>(*Operands[2]).isToken() || - static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) { + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Illegal opcode!"); @@ -8716,11 +8718,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if ((isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); @@ -8756,11 +8755,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && - ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) || - (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && - (!static_cast<ARMOperand &>(*Operands[3]).isToken() || - !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower( - ".w"))) { + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 3cde43967568b..cf6827fd6ca19 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_target(ARMCodeGen ARMLoadStoreOptimizer.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp + ARMMacroFusion.cpp ARMRegisterInfo.cpp ARMOptimizeBarriersPass.cpp ARMSelectionDAGInfo.cpp diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 716492ea25662..81760f03940ad 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -358,11 +358,27 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, +unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const { unsigned Kind = Fixup.getKind(); + + // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT + // and .word relocations they put the Thumb bit into the addend if possible. + // Other relocation types don't want this bit though (branches couldn't encode + // it if it *was* present, and no other relocations exist) and it can + // interfere with checking valid expressions. 
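The comment above refers to the ARM interworking convention where bit 0 of a code address selects the instruction set: branching to an address with bit 0 set switches to Thumb state, so MachO folds that bit into the addend of data and MOVW/MOVT relocations against Thumb functions. A minimal standalone illustration of the convention:

#include <cstdint>
#include <cstdio>

// Bit 0 of a code address is not part of the (halfword-aligned) address;
// it encodes the ISA state to branch into.
static uint32_t withThumbBit(uint32_t SymbolValue, bool IsThumbFunc) {
  return IsThumbFunc ? (SymbolValue | 1u) : SymbolValue;
}

int main() {
  std::printf("0x%x\n", withThumbBit(0x8000u, true));  // 0x8001
  std::printf("0x%x\n", withThumbBit(0x8000u, false)); // 0x8000
  return 0;
}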
+ if (const MCSymbolRefExpr *A = Target.getSymA()) { + if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) && + (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 || + Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || + Kind == ARM::fixup_t2_movt_hi16)) + Value |= 1; + } + switch (Kind) { default: Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type"); @@ -505,6 +521,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_arm_thumb_bl: { + // FIXME: We get both thumb1 and thumb2 in here, so we can only check for + // the less strict thumb2 value. + if (!isInt<26>(Value - 4)) { + Ctx.reportError(Fixup.getLoc(), "Relocation out of range"); + return 0; + } + // The value doesn't encode the low bit (always zero) and is offset by // four. The 32-bit immediate value is encoded as // imm32 = SignExtend(S:I1:I2:imm10:imm11:0) @@ -716,29 +739,11 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) { + const MCValue &Target, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind() ; - // MachO (the only user of "Value") tries to make .o files that look vaguely - // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit - // into the addend if possible. Other relocation types don't want this bit - // though (branches couldn't encode it if it *was* present, and no other - // relocations exist) and it can interfere with checking valid expressions. - if (FixupKind == FK_Data_4 || - FixupKind == ARM::fixup_arm_movw_lo16 || - FixupKind == ARM::fixup_arm_movt_hi16 || - FixupKind == ARM::fixup_t2_movw_lo16 || - FixupKind == ARM::fixup_t2_movt_hi16) { - if (Sym) { - if (Asm.isThumbFunc(Sym)) - Value |= 1; - } - } if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); @@ -747,7 +752,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // If the symbol is out of range, produce a relocation and hope the // linker can handle it. GNU AS produces an error in this case. 
- if (Sym->isExternal() || Value >= 0x400004) + if (Sym->isExternal()) IsResolved = false; } // Create relocations for unconditional branches to function symbols with @@ -759,6 +764,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, IsResolved = false; if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br || FixupKind == ARM::fixup_arm_thumb_bl || + FixupKind == ARM::fixup_t2_condbranch || FixupKind == ARM::fixup_t2_uncondbranch)) IsResolved = false; } @@ -875,22 +881,25 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { } } -void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true); + MCContext &Ctx = Asm.getContext(); + Value = adjustFixupValue(Asm, Fixup, Target, Value, IsPCRel, Ctx, + IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FullSizeBytes; if (!IsLittleEndian) { FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); - assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 2ddedb5d61059..6a0ba2ed41c1a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -40,17 +40,17 @@ public: /// processFixupValue - Target hook to process the literal value of a fixup /// if necessary. 
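The applyFixup signature change above, from a raw char*/DataSize pair to MutableArrayRef<char>, keeps the buffer and its length together so the bounds assertions cannot drift from the real fragment size. A standalone sketch of patching a little-endian fixup into such a span, with std::vector standing in for the fragment contents and the byte loop mirroring the OR-into-place idiom these backends use:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// OR the NumBytes low-order bytes of Value into Data at Offset,
// little-endian first.
static void applyFixupLE(std::vector<char> &Data, unsigned Offset,
                         unsigned NumBytes, uint64_t Value) {
  assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
  for (unsigned i = 0; i != NumBytes; ++i)
    Data[Offset + i] |= static_cast<char>((Value >> (i * 8)) & 0xff);
}

int main() {
  std::vector<char> Frag(8, 0);
  applyFixupLE(Frag, 4, 4, 0x12345678);
  std::printf("0x%02x 0x%02x\n", (unsigned char)Frag[4],
              (unsigned char)Frag[7]); // 0x78 0x12
  return 0;
}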
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override; - unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, uint64_t Value, bool IsPCRel, MCContext &Ctx, bool IsLittleEndian, bool IsResolved) const; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; unsigned getRelaxedOpcode(unsigned Op) const; diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 00505a103e00f..f74fb2e20b5a3 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -33,8 +33,8 @@ public: ~ARMWinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; bool recordRelocation(const MCFixup &) const override; @@ -42,7 +42,8 @@ public: } // end anonymous namespace -unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp index 7d5fb6ca17b98..c6ddd6bdad5e6 100644 --- a/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -214,7 +214,12 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { if (Opcode != ISD::LOAD) continue; - unsigned char new_val[8]; // hold up the constant values replacing loads. + union { + uint8_t c[8]; + uint16_t s; + uint32_t i; + uint64_t d; + } new_val; // hold up the constant values replacing loads. 
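The BPF change above replaces raw char-buffer type punning (*(uint16_t *)buf and friends) with a union, sidestepping strict-aliasing and alignment hazards. A standalone sketch of the same idiom as used for the host-endianness probe further down in this hunk; reading the non-active union member is the common compiler-supported behavior the patch relies on:

#include <cstdint>
#include <cstdio>

int main() {
  union {
    uint8_t c[2];
    uint16_t s;
  } test_buf;
  // Write the bytes of 0x2345 in little-endian order, then read them
  // back through the wider member.
  test_buf.c[0] = 0x45;
  test_buf.c[1] = 0x23;
  bool endian_match = (test_buf.s == 0x2345);
  std::printf("host is %s-endian\n", endian_match ? "little" : "big");
  return 0;
}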
bool to_replace = false; SDLoc DL(Node); const LoadSDNode *LD = cast<LoadSDNode>(Node); @@ -242,7 +247,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode()); if (GADN && CDN) to_replace = - getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val); + getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c); } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END && LDAddrNode->getNumOperands() > 0) { DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n'); @@ -250,7 +255,7 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { SDValue OP1 = LDAddrNode->getOperand(0); if (const GlobalAddressSDNode *GADN = dyn_cast<GlobalAddressSDNode>(OP1.getNode())) - to_replace = getConstantFieldValue(GADN, 0, size, new_val); + to_replace = getConstantFieldValue(GADN, 0, size, new_val.c); } if (!to_replace) @@ -259,13 +264,13 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { // replacing the old with a new value uint64_t val; if (size == 1) - val = *(uint8_t *)new_val; + val = new_val.c[0]; else if (size == 2) - val = *(uint16_t *)new_val; + val = new_val.s; else if (size == 4) - val = *(uint32_t *)new_val; + val = new_val.i; else { - val = *(uint64_t *)new_val; + val = new_val.d; } DEBUG(dbgs() << "Replacing load of size " << size << " with constant " @@ -318,14 +323,17 @@ bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node, } // test whether host endianness matches target - uint8_t test_buf[2]; + union { + uint8_t c[2]; + uint16_t s; + } test_buf; uint16_t test_val = 0x2345; if (DL.isLittleEndian()) - support::endian::write16le(test_buf, test_val); + support::endian::write16le(test_buf.c, test_val); else - support::endian::write16be(test_buf, test_val); + support::endian::write16be(test_buf.c, test_val); - bool endian_match = *(uint16_t *)test_buf == test_val; + bool endian_match = test_buf.s == test_val; for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++) ByteSeq[j] = endian_match ? 
TmpVal[i] : TmpVal[Offset + Size - 1 - j]; diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 80357a63a4e12..15e89fb2a2611 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -27,8 +27,9 @@ public: : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} ~BPFAsmBackend() override = default; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -61,9 +62,10 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 9f8c9ded8127b..734f3c6658d92 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -567,8 +567,19 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } llvm_unreachable("Invalid register operand"); } - if (SO.isImm() || SO.isFPImm()) - return IfTrue ? C2_cmoveit : C2_cmoveif; + switch (SO.getType()) { + case MachineOperand::MO_Immediate: + case MachineOperand::MO_FPImmediate: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + return IfTrue ? C2_cmoveit : C2_cmoveif; + default: + break; + } llvm_unreachable("Unexpected source operand"); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 18e49c69b8e36..2b0ceaa66258e 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1051,10 +1051,26 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, bool HasExtraAlign = HRI.needsStackRealignment(MF); bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + unsigned FrameSize = MFI.getStackSize(); unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>(); unsigned AP = HMFI.getStackAlignBasePhysReg(); - unsigned FrameSize = MFI.getStackSize(); + // It may happen that AP will be absent even HasAlloca && HasExtraAlign + // is true. HasExtraAlign may be set because of vector spills, without + // aligned locals or aligned outgoing function arguments. Since vector + // spills will ultimately be "unaligned", it is safe to use FP as the + // base register. + // In fact, in such a scenario the stack is actually not required to be + // aligned, although it may end up being aligned anyway, since this + // particular case is not easily detectable. 
The alignment will be + // unnecessary, but not incorrect. + // Unfortunately there is no quick way to verify that the above is + // indeed the case (and that it's not a result of an error), so just + // assume that missing AP will be replaced by FP. + // (A better fix would be to rematerialize AP from FP and always align + // vector spills.) + if (AP == 0) + AP = FP; bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). // Use FP at -O0, except when there are objects with extra alignment. @@ -2454,9 +2470,44 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const { unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF); auto &HST = MF.getSubtarget<HexagonSubtarget>(); // A fairly simplistic guess as to whether a potential load/store to a - // stack location could require an extra register. It does not account - // for store-immediate instructions. - if (HST.useHVXOps()) - return StackSize > 256; + // stack location could require an extra register. + if (HST.useHVXOps() && StackSize > 256) + return true; + + // Check if the function has store-immediate instructions that access + // the stack. Since the offset field is not extendable, if the stack + // size exceeds the offset limit (6 bits, shifted), the stores will + // require a new base register. + bool HasImmStack = false; + unsigned MinLS = ~0u; // Log_2 of the memory access size. + + for (const MachineBasicBlock &B : MF) { + for (const MachineInstr &MI : B) { + unsigned LS = 0; + switch (MI.getOpcode()) { + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + case Hexagon::S4_storeiri_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + case Hexagon::S4_storeirh_io: + ++LS; + LLVM_FALLTHROUGH; + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + case Hexagon::S4_storeirb_io: + if (MI.getOperand(0).isFI()) + HasImmStack = true; + MinLS = std::min(MinLS, LS); + break; + } + } + } + + if (HasImmStack) + return !isUInt<6>(StackSize >> MinLS); + return false; } diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index 11ac5454f6043..5abbcbba72ddd 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -295,15 +296,12 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; bool Failure = false, CanUp = true, CanDown = true; - bool Used1 = false, Used2 = false; for (unsigned X = MinX+1; X < MaxX; X++) { const DefUseInfo &DU = DUM.lookup(X); if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { Failure = true; break; } - Used1 |= DU.Uses[SR1]; - Used2 |= DU.Uses[SR2]; if (CanDown && DU.Defs[SR1]) CanDown = false; if (CanUp && DU.Defs[SR2]) @@ -317,64 +315,52 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { // Prefer "down", since this will move the MUX farther away from the // predicate definition. MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; - if (CanDown) { - // If the MUX is placed "down", we need to make sure that there aren't - // any kills of the source registers between the two defs. 
- if (Used1 || Used2) { - auto ResetKill = [this] (unsigned Reg, MachineInstr &MI) -> bool { - if (MachineOperand *Op = MI.findRegisterUseOperand(Reg, true, HRI)) { - Op->setIsKill(false); - return true; - } - return false; - }; - bool KilledSR1 = false, KilledSR2 = false; - for (MachineInstr &MJ : make_range(std::next(It1), It2)) { - if (SR1) - KilledSR1 |= ResetKill(SR1, MJ); - if (SR2) - KilledSR2 |= ResetKill(SR1, MJ); - } - // If any of the source registers were killed in this range, transfer - // the kills to the source operands: they will me "moved" to the - // resulting MUX and their parent instructions will be deleted. - if (KilledSR1) { - assert(Src1->isReg()); - Src1->setIsKill(true); - } - if (KilledSR2) { - assert(Src2->isReg()); - Src2->setIsKill(true); - } - } - } else { - // If the MUX is placed "up", it shouldn't kill any source registers - // that are still used afterwards. We can reset the kill flags directly - // on the operands, because the source instructions will be erased. - if (Used1 && Src1->isReg()) - Src1->setIsKill(false); - if (Used2 && Src2->isReg()) - Src2->setIsKill(false); - } ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); } - for (unsigned I = 0, N = ML.size(); I < N; ++I) { - MuxInfo &MX = ML[I]; - MachineBasicBlock &B = *MX.At->getParent(); - DebugLoc DL = MX.At->getDebugLoc(); + for (MuxInfo &MX : ML) { unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); if (!MxOpc) continue; - BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) - .addReg(MX.PredR) - .add(*MX.SrcT) - .add(*MX.SrcF); + MachineBasicBlock &B = *MX.At->getParent(); + const DebugLoc &DL = B.findDebugLoc(MX.At); + auto NewMux = BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .add(*MX.SrcT) + .add(*MX.SrcF); + NewMux->clearKillInfo(); B.erase(MX.Def1); B.erase(MX.Def2); Changed = true; } + // Fix up kill flags. + + LivePhysRegs LPR(*HRI); + LPR.addLiveOuts(B); + auto IsLive = [&LPR,this] (unsigned Reg) -> bool { + for (MCSubRegIterator S(Reg, HRI, true); S.isValid(); ++S) + if (LPR.contains(*S)) + return true; + return false; + }; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) { + if (I->isDebugValue()) + continue; + // This isn't 100% accurate, but it's safe. + // It won't detect (as a kill) a case like this + // r0 = add r0, 1 <-- r0 should be "killed" + // ... = r0 + for (MachineOperand &Op : I->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + assert(Op.getSubReg() == 0 && "Should have physical registers only"); + bool Live = IsLive(Op.getReg()); + Op.setIsKill(!Live); + } + LPR.stepBackward(*I); + } + return Changed; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index f43101fa456d5..fec2dc5ce3066 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -94,10 +94,6 @@ static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec", /// /// Constants for Hexagon instructions. 
/// -const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128 -const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4 -const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64 -const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -2443,8 +2439,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: - return (Offset >= Hexagon_MEMV_OFFSET_MIN) && - (Offset <= Hexagon_MEMV_OFFSET_MAX); + return isShiftedInt<4,6>(Offset); case Hexagon::PS_vstorerq_ai_128B: case Hexagon::PS_vstorerw_ai_128B: @@ -2454,8 +2449,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::V6_vS32b_ai_128B: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::V6_vS32Ub_ai_128B: - return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && - (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + return isShiftedInt<4,7>(Offset); case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index d73fc7c73185d..de6b203015d8e 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -629,7 +629,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { if (MO.isReg() && MO.isUse()) { unsigned feederReg = MO.getReg(); for (MachineBasicBlock::iterator localII = feederPos, - end = jmpPos; localII != end; localII++) { + end = cmpInstr->getIterator(); localII != end; localII++) { MachineInstr &localMI = *localII; for (unsigned j = 0; j < localMI.getNumOperands(); j++) { MachineOperand &localMO = localMI.getOperand(j); diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index ee3209354688d..7d961a238ae28 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -100,9 +100,6 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } - - private: - void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src); }; } @@ -132,7 +129,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { PeepholeDoubleRegsMap.clear(); // Traverse the basic block. - for (MachineInstr &MI : *MBB) { + for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + MachineInstr &MI = *I; // Look for sign extends: // %vreg170<def> = SXTW %vreg166 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) { @@ -280,14 +279,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { if (NewOp) { unsigned PSrc = MI.getOperand(PR).getReg(); if (unsigned POrig = PeepholeMap.lookup(PSrc)) { - MI.getOperand(PR).setReg(POrig); + BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), + QII->get(NewOp), MI.getOperand(0).getReg()) + .addReg(POrig) + .add(MI.getOperand(S2)) + .add(MI.getOperand(S1)); MRI->clearKillFlags(POrig); - MI.setDesc(QII->get(NewOp)); - // Swap operands S1 and S2. 
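The isValidOffset change above and the mayOverflowFrameOffset change earlier both lean on LLVM's templated range predicates instead of hand-maintained constants: isShiftedInt<4,6> accepts exactly the multiples of 64 in [-512, 448] that the deleted Hexagon_MEMV_OFFSET_* constants described. A standalone sketch with local stand-ins for those helpers (assumed semantics of llvm::isUInt/isShiftedInt):

#include <cstdint>
#include <cstdio>

// X fits in an N-bit unsigned field.
template <unsigned N> static bool isUIntN(uint64_t X) {
  return X < (UINT64_C(1) << N);
}

// X is a multiple of 2^S whose scaled value fits in an N-bit signed
// field, i.e. X lies in [-2^(N-1) * 2^S, (2^(N-1) - 1) * 2^S].
template <unsigned N, unsigned S> static bool isShiftedIntNS(int64_t X) {
  return (X % (INT64_C(1) << S)) == 0 &&
         X >= -(INT64_C(1) << (N + S - 1)) && X < (INT64_C(1) << (N + S - 1));
}

int main() {
  // HVX 64-byte vectors: #s4 scaled by 64 -> -512..448 in steps of 64.
  std::printf("%d %d %d\n",
              isShiftedIntNS<4, 6>(448),  // 1: top of range
              isShiftedIntNS<4, 6>(-512), // 1: bottom of range
              isShiftedIntNS<4, 6>(100)); // 0: not 64-aligned
  std::printf("%d\n", isUIntN<6>(63));    // 1: largest 6-bit value
  return 0;
}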
- MachineOperand Op1 = MI.getOperand(S1); - MachineOperand Op2 = MI.getOperand(S2); - ChangeOpInto(MI.getOperand(S1), Op2); - ChangeOpInto(MI.getOperand(S2), Op1); + MI.eraseFromParent(); } } // if (NewOp) } // if (!Done) @@ -299,40 +297,6 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { return true; } -void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) { - assert (&Dst != &Src && "Cannot duplicate into itself"); - switch (Dst.getType()) { - case MachineOperand::MO_Register: - if (Src.isReg()) { - Dst.setReg(Src.getReg()); - Dst.setSubReg(Src.getSubReg()); - MRI->clearKillFlags(Src.getReg()); - } else if (Src.isImm()) { - Dst.ChangeToImmediate(Src.getImm()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - case MachineOperand::MO_Immediate: - if (Src.isImm()) { - Dst.setImm(Src.getImm()); - } else if (Src.isReg()) { - Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(), - false, Src.isDead(), Src.isUndef(), - Src.isDebug()); - Dst.setSubReg(Src.getSubReg()); - } else { - llvm_unreachable("Unexpected src operand type"); - } - break; - - default: - llvm_unreachable("Unexpected dst operand type"); - break; - } -} - FunctionPass *llvm::createHexagonPeephole() { return new HexagonPeephole(); } diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp index 8851a23ae8ace..0aada8a53c979 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===// +//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===// // // The LLVM Compiler Infrastructure // @@ -11,13 +11,23 @@ // //===----------------------------------------------------------------------===// -#include "HexagonSubtarget.h" #include "Hexagon.h" +#include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" +#include "HexagonSubtarget.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <algorithm> +#include <cassert> #include <map> using namespace llvm; @@ -119,9 +129,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), - FrameLowering() { - + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) { initializeEnvironment(); // Initialize scheduling itinerary for the specified CPU. 
@@ -196,7 +204,6 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, updateLatency(*SrcInst, *DstInst, Dep); } - void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { for (auto &SU : DAG->SUnits) { if (!SU.isInstr()) @@ -240,18 +247,18 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) { } } - void HexagonSubtarget::getPostRAMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } void HexagonSubtarget::getSMSMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>()); + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back( + llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>()); } - // Pin the vtable to this file. void HexagonSubtarget::anchor() {} @@ -447,4 +454,3 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const { bool HexagonSubtarget::enableSubRegLiveness() const { return EnableSubregLiveness; } - diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h index 4379efa79c9cd..753dca0000652 100644 --- a/lib/Target/Hexagon/HexagonSubtarget.h +++ b/lib/Target/Hexagon/HexagonSubtarget.h @@ -1,4 +1,4 @@ -//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===// +//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,12 +15,17 @@ #define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H #include "HexagonFrameLowering.h" -#include "HexagonISelLowering.h" #include "HexagonInstrInfo.h" +#include "HexagonISelLowering.h" #include "HexagonSelectionDAGInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <memory> #include <string> +#include <vector> #define GET_SUBTARGETINFO_HEADER #include "HexagonGenSubtargetInfo.inc" @@ -30,6 +35,12 @@ namespace llvm { +class MachineInstr; +class SDep; +class SUnit; +class TargetMachine; +class Triple; + class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); @@ -57,6 +68,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: @@ -108,6 +120,7 @@ public: bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; + // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting // compiler time and will be removed eventually anyway. 
@@ -124,6 +137,7 @@ public: unsigned getSmallDataThreshold() const { return Hexagon_SMALL_DATA_THRESHOLD; } + const HexagonArchEnum &getHexagonArchVersion() const { return HexagonArchVersion; } @@ -155,4 +169,4 @@ private: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index e507a797871fc..031a1bdefafbf 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -291,7 +291,6 @@ bool HexagonPassConfig::addInstSelector() { if (EnableBitSimplify) addPass(createHexagonBitSimplify()); addPass(createHexagonPeephole()); - printAndVerify("After hexagon peephole pass"); // Constant propagation. if (!DisableHCP) { addPass(createHexagonConstPropagationPass()); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 545c8b6b2acde..093ce80bc2e3f 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -202,10 +202,8 @@ public: /// processFixupValue - Target hook to adjust the literal value of a fixup /// if necessary. IsResolved signals whether the caller believes a relocation /// is needed; the target can modify the value. The default does nothing. - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { MCFixupKind Kind = Fixup.getKind(); switch((unsigned)Kind) { @@ -415,9 +413,9 @@ public: /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t FixupValue, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t FixupValue, bool IsPCRel) const override { // When FixupValue is 0 the relocation is external and there // is nothing for us to do. @@ -432,8 +430,8 @@ public: // to a real offset before we can use it. 
uint32_t Offset = Fixup.getOffset(); unsigned NumBytes = getFixupKindNumBytes(Kind); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - char *InstAddr = Data + Offset; + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); + char *InstAddr = Data.data() + Offset; Value = adjustFixupValue(Kind, FixupValue); if(!Value) @@ -517,7 +515,7 @@ public: dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) << ": AValue=0x"; dbgs().write_hex(FixupValue) << ": Offset=" << Offset << - ": Size=" << DataSize << + ": Size=" << Data.size() << ": OInst=0x"; dbgs().write_hex(OldData) << ": Reloc=0x"; dbgs().write_hex(Reloc);); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index e8f154a1fa533..c7114c7f18a0a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -701,33 +701,32 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; case Hexagon::A2_addi: Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); - assert(Absolute);(void)Absolute; - if (Value == 1) { - Result.setOpcode(Hexagon::SA1_inc); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - break; - } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Value == -1) { - Result.setOpcode(Hexagon::SA1_dec); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2 SUBInst $Rd = add($Rs,#-1) - else if (Inst.getOperand(1).getReg() == Hexagon::R29) { - Result.setOpcode(Hexagon::SA1_addsp); - addOps(Result, Inst, 0); - addOps(Result, Inst, 2); - break; - } // 1,3 SUBInst $Rd = add(r29, #$u6_2) - else { - Result.setOpcode(Hexagon::SA1_addi); - addOps(Result, Inst, 0); - addOps(Result, Inst, 1); - addOps(Result, Inst, 2); - break; - } // 1,2,3 SUBInst $Rx = add($Rx, #$s7) + if (Absolute) { + if (Value == 1) { + Result.setOpcode(Hexagon::SA1_inc); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + break; + } // 1,2 SUBInst $Rd = add($Rs, #1) + if (Value == -1) { + Result.setOpcode(Hexagon::SA1_dec); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; + } // 1,2 SUBInst $Rd = add($Rs,#-1) + if (Inst.getOperand(1).getReg() == Hexagon::R29) { + Result.setOpcode(Hexagon::SA1_addsp); + addOps(Result, Inst, 0); + addOps(Result, Inst, 2); + break; + } // 1,3 SUBInst $Rd = add(r29, #$u6_2) + } + Result.setOpcode(Hexagon::SA1_addi); + addOps(Result, Inst, 0); + addOps(Result, Inst, 1); + addOps(Result, Inst, 2); + break; // 1,2,3 SUBInst $Rx = add($Rx, #$s7) case Hexagon::A2_add: Result.setOpcode(Hexagon::SA1_addrx); addOps(Result, Inst, 0); diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 0ef1401ef531a..c212726113ab7 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -49,8 +49,9 @@ public: LanaiAsmBackend(const Target &T, Triple::OSType OST) : MCAsmBackend(), OSType(OST) {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -88,9 +89,10 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) 
const { return true; } -void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned /*DataSize*/, uint64_t Value, - bool /*IsPCRel*/, MCContext & /*Ctx*/) const { +void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool /*IsPCRel*/) const { MCFixupKind Kind = Fixup.getKind(); Value = adjustFixupValue(static_cast<unsigned>(Kind), Value); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index d8fdc8ba674e6..982c6fea62d44 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -32,16 +32,20 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return *RM; } +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; +} + MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS, + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL), TLOF(make_unique<TargetLoweringObjectFileELF>()), - // FIXME: Check DataLayout string. Subtarget(TT, CPU, FS, *this) { initAsmInfo(); } diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 694c201cbe8dc..9d5c179a0fd90 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -322,6 +322,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseDirectiveSet(); bool parseDirectiveOption(); bool parseInsnDirective(); + bool parseRSectionDirective(StringRef Section); bool parseSSectionDirective(StringRef Section, unsigned Type); bool parseSetAtDirective(); @@ -5106,7 +5107,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) { CC = StringSwitch<unsigned>(Name) .Case("zero", 0) - .Case("at", 1) + .Cases("at", "AT", 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) @@ -6952,6 +6953,23 @@ bool MipsAsmParser::parseInsnDirective() { return false; } +/// parseRSectionDirective +/// ::= .rdata +bool MipsAsmParser::parseRSectionDirective(StringRef Section) { + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + MCSection *ELFSection = getContext().getELFSection( + Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + getParser().getStreamer().SwitchSection(ELFSection); + + getParser().Lex(); // Eat EndOfStatement token. 
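parseRSectionDirective above gives the legacy MIPS .rdata spelling a home by switching the streamer to an ELF .rodata section (SHT_PROGBITS, SHF_ALLOC). A standalone sketch of the directive-to-section mapping, using a hypothetical helper rather than the MC streamer API:

#include <cstdio>
#include <string>

// Map assembler directives to the ELF section they should switch to;
// ".rdata" is an alias kept for compatibility with older MIPS assembly.
static std::string sectionForDirective(const std::string &Directive) {
  if (Directive == ".rdata")
    return ".rodata";
  return Directive; // .sbss, .sdata, ... already name their sections
}

int main() {
  std::printf("%s -> %s\n", ".rdata", sectionForDirective(".rdata").c_str());
  return 0;
}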
+ return false; +} + /// parseSSectionDirective /// ::= .sbss /// ::= .sdata @@ -7499,6 +7517,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { parseInsnDirective(); return false; } + if (IDVal == ".rdata") { + parseRSectionDirective(".rodata"); + return false; + } if (IDVal == ".sbss") { parseSSectionDirective(IDVal, ELF::SHT_NOBITS); return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 6d3d4db036032..ae48d6e38fa0f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -235,10 +235,12 @@ static unsigned calculateMMLEIndex(unsigned i) { /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided /// data fragment, at the offset specified by the fixup and following the /// fixup kind as appropriate. -void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, bool IsPCRel, - MCContext &Ctx) const { +void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); + MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Fixup, Value, Ctx); if (!Value) diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 4b3cc6e21f4cd..bf3b290b7ed53 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -38,8 +38,9 @@ public: MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 3272319ad50f4..7daea163b8a64 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -326,9 +326,9 @@ class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>; class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, InstrItinClass itin = NoItinerary> : MipsR6Arch<instr_asm> { - dag OutOperandList = (outs GPROpnd:$rs); - dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm); - string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm"); + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); list<dag> Pattern = []; InstrItinClass Itinerary = itin; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 68708dc4f50fe..02102d6b22f4e 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -907,6 +907,11 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)))) return SDValue(); } + // Don't generate INS if constant OR operand doesn't fit into bits + // cleared by constant AND operand. 
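The comment above describes the new MipsISelLowering guard (the check itself follows): forming ins (bit-field insert) from (x & mask) | val is only sound when the OR constant's set bits all lie within the field the AND mask clears; any overlap means the original expression keeps bits the insert would clobber. A standalone model of the legality test:

#include <cstdint>
#include <cstdio>

// (x & AndMask) | OrVal can fold to a bit-field insert only when OrVal's
// set bits all lie in the bits the AND clears, i.e. OrVal & AndMask == 0.
static bool canFoldToIns(uint32_t AndMask, uint32_t OrVal) {
  return (OrVal & AndMask) == 0;
}

int main() {
  std::printf("%d\n", canFoldToIns(0xffff0000u, 0x1234u));  // 1: fits field
  std::printf("%d\n", canFoldToIns(0xffff0000u, 0x10000u)); // 0: overlaps
  return 0;
}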
+ if (CN->getSExtValue() & CN1->getSExtValue()) + return SDValue(); + SDLoc DL(N); EVT ValTy = N->getOperand(0)->getValueType(0); SDValue Const1; diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 272595af5f6f1..b95f1158fa562 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -274,8 +274,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); MF->insert(FallThroughMBB, BalTgtMBB); - LongBrMBB->addSuccessor(BalTgtMBB, BranchProbability::getOne()); - BalTgtMBB->addSuccessor(&*FallThroughMBB, BranchProbability::getOne()); + LongBrMBB->addSuccessor(BalTgtMBB); + BalTgtMBB->addSuccessor(TgtMBB); // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an @@ -342,8 +342,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP).addImm(8); if (Subtarget.hasMips32r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR), Mips::ZERO) - .addReg(Mips::AT); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR)) + .addReg(Mips::ZERO).addReg(Mips::AT); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT); @@ -415,8 +415,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(0); if (Subtarget.hasMips64r6()) - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64), Mips::ZERO_64) - .addReg(Mips::AT_64); + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64)) + .addReg(Mips::ZERO_64).addReg(Mips::AT_64); else BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64); diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 2382ea2716612..b57bceb3c8371 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1257,19 +1257,22 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) { static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { EVT ResVecTy = Op->getValueType(0); EVT ViaVecTy = ResVecTy; + bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian(); SDLoc DL(Op); // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating // lanes. 
- SDValue LaneA; - SDValue LaneB = Op->getOperand(2); + SDValue LaneA = Op->getOperand(OpNr); + SDValue LaneB; if (ResVecTy == MVT::v2i64) { - LaneA = DAG.getConstant(0, DL, MVT::i32); + LaneB = DAG.getConstant(0, DL, MVT::i32); ViaVecTy = MVT::v4i32; + if(BigEndian) + std::swap(LaneA, LaneB); } else - LaneA = LaneB; + LaneB = LaneA; SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB }; @@ -1277,8 +1280,11 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { SDValue Result = DAG.getBuildVector( ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements())); - if (ViaVecTy != ResVecTy) - Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result); + if (ViaVecTy != ResVecTy) { + SDValue One = DAG.getConstant(1, DL, ViaVecTy); + Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, + DAG.getNode(ISD::AND, DL, ViaVecTy, Result, One)); + } return Result; } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 028c2cb562f8e..6d7eb786a6835 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -113,8 +113,9 @@ public: return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. @@ -130,10 +131,8 @@ public: } } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((PPC::Fixups)Fixup.getKind()) { default: break; case PPC::fixup_ppc_br24: diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 38ae62b26757a..07c9c1f9f84c0 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -24,7 +24,6 @@ namespace llvm { class PPCTargetMachine; class PassRegistry; class FunctionPass; - class ImmutablePass; class MachineInstr; class AsmPrinter; class MCInst; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 57a1d373c88cf..c2c115cb6dafa 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -521,7 +521,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); bool HasBP = RegInfo->hasBasePointer(MF); unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; - unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg; + unsigned BP8Reg = HasBP ? 
(unsigned) PPC::X30 : FP8Reg; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 662550f7a396a..72f14e9691382 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2560,8 +2560,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_LO); - SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) + : DAG.getRegister(PPC::R2, MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); } @@ -8377,9 +8378,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. - bool is64bit = Subtarget.isPPC64(); - return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); + if (Subtarget.isPPC64()) + return DAG.getRegister(PPC::X13, MVT::i64); + return DAG.getRegister(PPC::R2, MVT::i32); } // If this is a lowered altivec predicate compare, CompareOpc is set to the diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 70536a6039b82..e2af5e5295445 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -972,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src), // Support for medium and large code model. let hasSideEffects = 0 in { +let isReMaterializable = 1 in { def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDIStocHA", []>, isPPC64; +def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDItocL", []>, isPPC64; +} let mayLoad = 1 in def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; -def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), - "#ADDItocL", []>, isPPC64; } // Support for thread-local storage. @@ -994,7 +996,7 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, isPPC64; -let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in +let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>; def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 236e513bec231..13b4f9ab962da 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -292,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return 0; } +// For opcodes with the ReMaterializable flag set, this function is called to +// verify the instruction is really rematerializable. +bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const { + switch (MI.getOpcode()) { + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set.
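+ // For instance, PPC::LI materializes an immediate from no register inputs + // and has no side effects, so re-emitting it next to a use is always safe + // and cheaper than spilling its result (illustrative note).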
+ llvm_unreachable("Unknown rematerializable operation!"); + break; + case PPC::LI: + case PPC::LI8: + case PPC::LIS: + case PPC::LIS8: + case PPC::QVGPCI: + case PPC::ADDIStocHA: + case PPC::ADDItocL: + case PPC::LOAD_STACK_GUARD: + return true; + } + return false; +} + unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { // Note: This list must be kept consistent with StoreRegToStackSlot. diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 8dd4dbb608794..b0629c88cf57b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -162,6 +162,8 @@ public: unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const override; unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 637e52bbdbeec..8af7f7e981171 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -389,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { - BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) - .addReg(PPC::R31) - .addImm(FrameSize); + if (LP64) + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg) + .addReg(PPC::X31) + .addImm(FrameSize); + else + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) + .addReg(PPC::R31) + .addImm(FrameSize); } else if (LP64) { BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg) .addImm(0) @@ -478,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset( const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); unsigned maxCallFrameSize = MFI.getMaxCallFrameSize(); + bool is64Bit = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); - BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), + MI.getOperand(0).getReg()) .addImm(maxCallFrameSize); MBB.erase(II); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 5a226b23ff96f..a88a6541e8d00 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -86,9 +86,9 @@ EnableMachineCombinerPass("ppc-machine-combiner", extern "C" void LLVMInitializePowerPCTarget() { // Register the targets - RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target()); - RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target()); - RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget()); + RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); + RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target()); + RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializePPCBoolRetToIntPass(PR); @@ -177,32 +177,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, assert(Options.MCOptions.getABIName().empty() && "Unknown target-abi option!"); - if (!TT.isMacOSX()) { - switch (TT.getArch()) { - case Triple::ppc64le: - return PPCTargetMachine::PPC_ABI_ELFv2; - case Triple::ppc64: - return PPCTargetMachine::PPC_ABI_ELFv1; - default: - // Fallthrough. 
- ; - } + if (TT.isMacOSX()) + return PPCTargetMachine::PPC_ABI_UNKNOWN; + + switch (TT.getArch()) { + case Triple::ppc64le: + return PPCTargetMachine::PPC_ABI_ELFv2; + case Triple::ppc64: + return PPCTargetMachine::PPC_ABI_ELFv1; + default: + return PPCTargetMachine::PPC_ABI_UNKNOWN; } - return PPCTargetMachine::PPC_ABI_UNKNOWN; } static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { - if (!RM.hasValue()) { - if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) { - if (!TT.isOSBinFormatMachO() && !TT.isMacOSX()) - return Reloc::PIC_; - } - if (TT.isOSDarwin()) - return Reloc::DynamicNoPIC; - return Reloc::Static; - } - return *RM; + if (RM.hasValue()) + return *RM; + + // Darwin defaults to dynamic-no-pic. + if (TT.isOSDarwin()) + return Reloc::DynamicNoPIC; + + // Non-darwin 64-bit platforms are PIC by default. + if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) + return Reloc::PIC_; + + // 32-bit is static by default. + return Reloc::Static; } // The FeatureString here is a little subtle. We are modifying the feature @@ -224,26 +226,6 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, PPCTargetMachine::~PPCTargetMachine() = default; -void PPC32TargetMachine::anchor() {} - -PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - -void PPC64TargetMachine::anchor() {} - -PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional<Reloc::Model> RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} - const PPCSubtarget * PPCTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -406,7 +388,7 @@ void PPCPassConfig::addPreRegAlloc() { // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously depended on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. addPass(&LiveVariablesID, false); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index b8f5a2083d808..5eb6ba785d1b8 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -23,7 +23,7 @@ namespace llvm { /// Common code between 32-bit and 64-bit PowerPC targets. /// -class PPCTargetMachine : public LLVMTargetMachine { +class PPCTargetMachine final : public LLVMTargetMachine { public: enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; private: @@ -60,29 +60,6 @@ public: return false; } }; - -/// PowerPC 32-bit target machine. -/// -class PPC32TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -/// PowerPC 64-bit target machine.
-/// -class PPC64TargetMachine : public PPCTargetMachine { - virtual void anchor(); -public: - PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - } // end namespace llvm #endif diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index d9a71893afee7..f85c0cf111c43 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -32,8 +32,9 @@ public: : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {} ~RISCVAsmBackend() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -69,9 +70,10 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { return; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index d1d1334163a26..c72b47b090857 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -203,10 +203,8 @@ namespace { return InfosBE[Kind - FirstTargetFixupKind]; } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { + void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, bool &IsResolved) override { switch ((Sparc::Fixups)Fixup.getKind()) { default: break; case Sparc::fixup_sparc_wplt30: @@ -273,9 +271,9 @@ namespace { ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) : SparcAsmBackend(T), OSType(OSType) { } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, - MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. 
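Across the Mips, PowerPC, RISCV, and Sparc hunks above (and the SystemZ, WebAssembly, and X86 hunks below), applyFixup trades a raw char* plus a separate DataSize for MutableArrayRef<char>, so the buffer carries its own length. A minimal sketch of the resulting idiom, built around a hypothetical patchLE32 helper that is not part of this commit:

  #include "llvm/ADT/ArrayRef.h"
  #include <cassert>
  #include <cstdint>

  // Hypothetical helper: OR a 32-bit little-endian value into fixup bytes.
  // Because MutableArrayRef knows its own size, the bounds assert cannot
  // drift out of sync with the buffer the way a separately passed DataSize
  // could.
  static void patchLE32(llvm::MutableArrayRef<char> Data, unsigned Offset,
                        uint32_t Value) {
    assert(Offset + 4 <= Data.size() && "Invalid fixup offset!");
    for (unsigned I = 0; I != 4; ++I)
      Data[Offset + I] |= char((Value >> (8 * I)) & 0xff);
  }

The SystemZ, WebAssembly, and X86 hunks below update their asserts to this same Data.size()-based form.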
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp index 627e49a95f3cc..2c040dce994b6 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -15,6 +15,12 @@ using namespace llvm; +void SparcELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h index fe8800625a567..3b1b345c3b193 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.h +++ b/lib/Target/Sparc/SparcTargetObjectFile.h @@ -23,6 +23,8 @@ public: TargetLoweringObjectFileELF() {} + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index fd1fd7bc40dcc..6b32a7926437a 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -50,8 +50,9 @@ public: return SystemZ::NumTargetFixupKinds; } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -89,15 +90,17 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Infos[Kind - FirstTargetFixupKind]; } -void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsPCRel) const { MCFixupKind Kind = Fixup.getKind(); unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; unsigned Size = (BitSize + 7) / 8; - assert(Offset + Size <= DataSize && "Invalid fixup offset!"); + assert(Offset + Size <= Data.size() && "Invalid fixup offset!"); // Big-endian insertion of Size bytes. Value = extractBitsForFixup(Kind, Value); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index a28a91e834f61..0cb2b5a14ce73 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -277,8 +277,21 @@ void SystemZFrameLowering:: processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo &MFFrame = MF.getFrameInfo(); - uint64_t MaxReach = (MFFrame.estimateStackSize(MF) + - SystemZMC::CallFrameSize * 2); + // Get the size of our stack frame to be allocated ... + uint64_t StackSize = (MFFrame.estimateStackSize(MF) + + SystemZMC::CallFrameSize); + // ... and the maximum offset we may need to reach into the + // caller's frame to access the save area or stack arguments. 
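+ // Illustrative numbers only, taking SystemZMC::CallFrameSize to be 160: + // an estimated 3950-byte frame gives StackSize 4110, and an argument + // object ending 320 bytes into the caller's frame gives MaxArgOffset 480, + // so MaxReach = 4590 exceeds the 4095-byte displacement limit and the + // scavenging slots below are reserved.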
+ int64_t MaxArgOffset = SystemZMC::CallFrameSize; + for (int I = MFFrame.getObjectIndexBegin(); I != 0; ++I) + if (MFFrame.getObjectOffset(I) >= 0) { + int64_t ArgOffset = SystemZMC::CallFrameSize + + MFFrame.getObjectOffset(I) + + MFFrame.getObjectSize(I); + MaxArgOffset = std::max(MaxArgOffset, ArgOffset); + } + + uint64_t MaxReach = StackSize + MaxArgOffset; if (!isUInt<12>(MaxReach)) { // We may need register scavenging slots if some parts of the frame // are outside the reach of an unsigned 12-bit displacement. diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ac4c3f6db684d..fef4a8c92a362 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1322,11 +1322,6 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps); } -SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad( - SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const { - return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain); -} - // Return true if Op is an intrinsic node with chain that returns the CC value // as its only (other) argument. Provide the associated SystemZISD opcode and // the mask of valid CC values if so. @@ -2059,6 +2054,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SHL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal >> ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal >> ShiftVal, CmpVal >> ShiftVal, @@ -2068,6 +2064,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL, } else if (NewC.ICmpType != SystemZICMP::SignedOnly && NewC.Op0.getOpcode() == ISD::SRL && isSimpleShift(NewC.Op0, ShiftVal) && + (MaskVal << ShiftVal != 0) && (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal << ShiftVal, CmpVal << ShiftVal, @@ -3212,12 +3209,15 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } -// Op is an atomic load. Lower it into a normal volatile load. +// Op is an atomic load. Lower it into a serialization followed +// by a normal volatile load. SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { auto *Node = cast<AtomicSDNode>(Op.getNode()); + SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), + MVT::Other, Node->getChain()), 0); return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(), - Node->getChain(), Node->getBasePtr(), + Chain, Node->getBasePtr(), Node->getMemoryVT(), Node->getMemOperand()); } @@ -4688,7 +4688,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(STRCMP); OPCODE(SEARCH_STRING); OPCODE(IPM); - OPCODE(SERIALIZE); OPCODE(MEMBARRIER); OPCODE(TBEGIN); OPCODE(TBEGIN_NOFLOAT); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 79c8c4d92669f..5dcb19c0a35db 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -139,9 +139,6 @@ enum NodeType : unsigned { // Store the CC value in bits 29 and 28 of an integer. IPM, - // Perform a serialization operation. (BCR 15,0 or BCR 14,0.) - SERIALIZE, - // Compiler barrier only; generate a no-op. 
MEMBARRIER, @@ -471,8 +468,6 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL, - SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; ISD::NodeType getExtendForAtomicOps() const override { @@ -522,7 +517,6 @@ private: unsigned Opcode) const; SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index fa5ecdd852433..9f5e6288348e0 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -189,18 +189,15 @@ let isBranch = 1, isTerminator = 1 in { //===----------------------------------------------------------------------===// // Unconditional trap. -// FIXME: This trap instruction should be marked as isTerminator, but there is -// currently a general bug that allows non-terminators to be placed between -// terminators. Temporarily leave this unmarked until the bug is fixed. -let isBarrier = 1, hasCtrlDep = 1 in +let hasCtrlDep = 1 in def Trap : Alias<4, (outs), (ins), [(trap)]>; // Conditional trap. -let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in +let hasCtrlDep = 1, Uses = [CC] in def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>; // Fused compare-and-trap instructions. -let isTerminator = 1, hasCtrlDep = 1 in { +let hasCtrlDep = 1 in { // These patterns work the same way as for compare-and-branch. defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>; defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>; @@ -1449,7 +1446,7 @@ let Predicates = [FeatureExecutionHint] in { // A serialization instruction that acts as a barrier for all memory // accesses, which expands to "bcr 14, 0". let hasSideEffects = 1 in -def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; +def Serialize : Alias<2, (outs), (ins), []>; // A pseudo instruction that serves as a compiler barrier. let hasSideEffects = 1, hasNoSchedulingInfo = 1 in diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index ab6020f3f1896..b6feaa49d8585 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -23,7 +23,7 @@ using namespace llvm; #ifndef NDEBUG // Print the set of SUs void SystemZPostRASchedStrategy::SUSet:: -dump(SystemZHazardRecognizer &HazardRec) { +dump(SystemZHazardRecognizer &HazardRec) const { dbgs() << "{"; for (auto &SU : *this) { HazardRec.dumpSU(SU, dbgs()); diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h index 12357e0348a9e..3dfef388691e7 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -72,7 +72,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { // A set of SUs with a sorter and dump method. 
struct SUSet : std::set<SUnit*, SUSorter> { #ifndef NDEBUG - void dump(SystemZHazardRecognizer &HazardRec); + void dump(SystemZHazardRecognizer &HazardRec) const; #endif }; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index adfc69c5d4cf4..ab2392809f3be 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -191,8 +191,6 @@ def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>; def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; -def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, - [SDNPHasChain, SDNPMayStore]>; def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index eb2f17a2091c3..a10ca64fa6329 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,8 +51,6 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; - - bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 4f20096c15830..1357cb5735f8a 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -37,8 +37,9 @@ public: : MCAsmBackend(), Is64Bit(Is64Bit) {} ~WebAssemblyAsmBackendELF() override {} - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -77,8 +78,9 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -105,9 +107,11 @@ bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -119,7 +123,7 @@ void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
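// For example (illustrative values): a 4-byte fixup with Value 0x12345678 // at Offset 2 ORs the bytes 0x78, 0x56, 0x34, 0x12 into Data[2..5] and // leaves every other byte of the fragment untouched.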
@@ -163,9 +167,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, return true; } -void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel, MCContext &Ctx) const { +void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const { const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags"); @@ -177,7 +183,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index c56c591def361..3e3b52fca5691 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -36,7 +36,6 @@ STATISTIC(MCNumFixups, "Number of MC fixups created."); namespace { class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCInstrInfo &MCII; - MCContext &Ctx; // Implementation generated by tablegen. uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -48,14 +47,12 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { const MCSubtargetInfo &STI) const override; public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} }; } // end anonymous namespace -MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx) { - return new WebAssemblyMCCodeEmitter(MCII, Ctx); +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) { + return new WebAssemblyMCCodeEmitter(MCII); } void WebAssemblyMCCodeEmitter::encodeInstruction( @@ -89,11 +86,7 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) { encodeSLEB128(int64_t(MO.getImm()), OS); } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { - Fixups.push_back(MCFixup::create( - OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx), - MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc())); - ++MCNumFixups; - encodeULEB128(uint64_t(MO.getImm()), OS); + llvm_unreachable("wasm globals should only be accessed symbolically"); } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) { encodeSLEB128(int64_t(MO.getImm()), OS); } else { @@ -135,6 +128,9 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) { FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32); PaddedSize = 5; + } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) { + FixupKind = MCFixupKind(WebAssembly::fixup_code_global_index); + PaddedSize = 5; } else { llvm_unreachable("unexpected symbolic operand kind"); } diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 9fd3ec81c258f..9580eeaa33d73 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++
b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -74,7 +74,7 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo & /*MRI*/, MCContext &Ctx) { - return createWebAssemblyMCCodeEmitter(MCII, Ctx); + return createWebAssemblyMCCodeEmitter(MCII); } static MCAsmBackend *createAsmBackend(const Target & /*T*/, diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 0ba700a86b744..4d676c32a09c5 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -35,8 +35,7 @@ class raw_pwrite_stream; Target &getTheWebAssemblyTarget32(); Target &getTheWebAssemblyTarget64(); -MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx); +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII); MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 19e14f3261aa7..9cf77829f3bc2 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -68,6 +68,8 @@ WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target, bool IsFunction = IsFunctionExpr(Fixup.getValue()); switch (unsigned(Fixup.getKind())) { + case WebAssembly::fixup_code_global_index: + return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB; case WebAssembly::fixup_code_sleb128_i32: if (IsFunction) return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB; diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index b999091e2d294..f51585a10ca12 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -96,13 +96,6 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { MCConstantExpr::create(Size, OutContext)); } } - - if (!TM.getTargetTriple().isOSBinFormatELF()) { - MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>(); - getTargetStreamer()->emitGlobal(MMIW.getGlobals()); - if (MMIW.hasStackPointerGlobal()) - getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal()); - } } void WebAssemblyAsmPrinter::EmitConstantPool() { diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 09338a4898e03..c980f4b87f916 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -63,12 +63,16 @@ class WebAssemblyFastISel final : public FastISel { public: // Innocuous defaults for our address. 
Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } + void setKind(BaseKind K) { + assert(!isSet() && "Can't change kind with non-zero base"); + Kind = K; + } BaseKind getKind() const { return Kind; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); + assert(Base.Reg == 0 && "Overwriting non-zero register"); Base.Reg = Reg; } unsigned getReg() const { @@ -77,6 +81,7 @@ class WebAssemblyFastISel final : public FastISel { } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); + assert(Base.FI == 0 && "Overwriting non-zero frame index"); Base.FI = FI; } unsigned getFI() const { @@ -91,6 +96,13 @@ class WebAssemblyFastISel final : public FastISel { int64_t getOffset() const { return Offset; } void setGlobalValue(const GlobalValue *G) { GV = G; } const GlobalValue *getGlobalValue() const { return GV; } + bool isSet() const { + if (isRegBase()) { + return Base.Reg != 0; + } else { + return Base.FI != 0; + } + } }; /// Keep a pointer to the WebAssemblySubtarget around so that we can make the @@ -297,6 +309,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { DenseMap<const AllocaInst *, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { + if (Addr.isSet()) { + return false; + } Addr.setKind(Address::FrameIndexBase); Addr.setFI(SI->second); return true; @@ -341,6 +356,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { break; } } + if (Addr.isSet()) { + return false; + } Addr.setReg(getRegForValue(Obj)); return Addr.getReg() != 0; } diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 4209bc333f230..a37d6136e44ed 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -104,10 +104,10 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, const DebugLoc &DL) { const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *PtrRC = MRI.getTargetRegisterInfo()->getPointerRegClass(MF); @@ -125,10 +125,8 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF, .addReg(SrcReg) .addMemOperand(MMO); } else { - MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32)) - .addImm(MMIW.getStackPointerGlobal()) + .addExternalSymbol(SPSymbol) .addReg(SrcReg); } } @@ -171,10 +169,11 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, unsigned SPReg = WebAssembly::SP32; if (StackSize) SPReg = MRI.createVirtualRegister(PtrRC); + + const char *ES = "__stack_pointer"; + auto *SPSymbol = MF.createExternalSymbolName(ES); if (MF.getSubtarget<WebAssemblySubtarget>() .getTargetTriple().isOSBinFormatELF()) { - const char *ES = "__stack_pointer"; - auto *SPSymbol = MF.createExternalSymbolName(ES); unsigned Zero = MRI.createVirtualRegister(PtrRC); BuildMI(MBB, InsertPt, DL, 
TII->get(WebAssembly::CONST_I32), Zero) @@ -189,22 +188,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, .addReg(Zero) // addr .addMemOperand(LoadMMO); } else { - auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if (!MMIW.hasStackPointerGlobal()) { - MMIW.setStackPointerGlobal(MMIW.getGlobals().size()); - - // Create the stack-pointer global. For now, just use the - // Emscripten/Binaryen ABI names. - wasm::Global G; - G.Type = wasm::ValType::I32; - G.Mutable = true; - G.InitialValue = 0; - G.InitialModule = "env"; - G.InitialName = "STACKTOP"; - MMIW.addGlobal(G); - } BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg) - .addImm(MMIW.getStackPointerGlobal()); + .addExternalSymbol(SPSymbol); } bool HasBP = hasBP(MF); diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 6650191807dcb..ea9e3fa862ce2 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -170,28 +170,16 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read, if (MI.mayStore()) { Write = true; - const MachineFunction &MF = *MI.getParent()->getParent(); - if (MF.getSubtarget<WebAssemblySubtarget>() - .getTargetTriple().isOSBinFormatELF()) { - // Check for stores to __stack_pointer. - for (auto MMO : MI.memoperands()) { - const MachinePointerInfo &MPI = MMO->getPointerInfo(); - if (MPI.V.is<const PseudoSourceValue *>()) { - auto PSV = MPI.V.get<const PseudoSourceValue *>(); - if (const ExternalSymbolPseudoSourceValue *EPSV = - dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) - if (StringRef(EPSV->getSymbol()) == "__stack_pointer") - StackPointer = true; - } - } - } else { - // Check for sets of the stack pointer. - const MachineModuleInfoWasm &MMIW = - MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>(); - if ((MI.getOpcode() == WebAssembly::SET_LOCAL_I32 || - MI.getOpcode() == WebAssembly::SET_LOCAL_I64) && - MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) { - StackPointer = true; + // Check for stores to __stack_pointer. 
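+ // (For example, the __stack_pointer store emitted by writeSPToMemory in + // WebAssemblyFrameLowering; the test is now purely symbol-based instead of + // being conditional on the object format.)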
+ for (auto MMO : MI.memoperands()) { + const MachinePointerInfo &MPI = MMO->getPointerInfo(); + if (MPI.V.is<const PseudoSourceValue *>()) { + auto PSV = MPI.V.get<const PseudoSourceValue *>(); + if (const ExternalSymbolPseudoSourceValue *EPSV = + dyn_cast<ExternalSymbolPseudoSourceValue>(PSV)) + if (StringRef(EPSV->getSymbol()) == "__stack_pointer") { + StackPointer = true; + } } } } else if (MI.hasOrderedMemoryRef()) { diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index e5d3209ec6a97..d30cc724c203f 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1705,8 +1705,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); - unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 7a9e4f4468ec7..914fb36f91a7d 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -108,12 +108,12 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsPCRel) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= DataSize && - "Invalid fixup offset!"); + assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); // Check that upper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 4097ef224d503..caf98bffb80de 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( const MCSymbol *B_Base = Asm.getAtom(*B); // Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) { Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); return; diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 105580c913a16..5892f1de33eec 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" @@ -25,8 +26,8 @@ public: X86WinCOFFObjectWriter(bool Is64Bit); ~X86WinCOFFObjectWriter() override = default; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; }; @@ -36,11 +37,19 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 : COFF::IMAGE_FILE_MACHINE_I386) {} -unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, +unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { - unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + unsigned FixupKind = Fixup.getKind(); + if (IsCrossSection) { + if (FixupKind != FK_Data_4) { + Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression"); + return COFF::IMAGE_REL_AMD64_ADDR32; + } + FixupKind = FK_PCRel_4; + } MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 2777fa89330f6..e3aa227702bea 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -748,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, else CallOp = X86::CALLpcrel32; - const char *Symbol; - if (Is64Bit) { - if (STI.isTargetCygMing()) { - Symbol = "___chkstk_ms"; - } else { - Symbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - Symbol = "_alloca"; - else - Symbol = "_chkstk"; + StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF); MachineInstrBuilder CI; MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); @@ -769,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, // For the large code model, we have to call through a register. Use R11, // as it is scratch in all supported calling conventions. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) - .addExternalSymbol(Symbol); + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); } else { - CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)) + .addExternalSymbol(MF.createExternalSymbolName(Symbol)); } unsigned AX = Is64Bit ? 
X86::RAX : X86::EAX; @@ -783,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(SP, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - if (Is64Bit) { + if (STI.isTargetWin64() || !STI.isOSWindows()) { + // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves. // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - themselves. It also does not clobber %rax so we can reuse it when + themselves. They also do not clobber %rax so we can reuse it when adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) - .addReg(X86::RSP) - .addReg(X86::RAX); + // All other platforms do not specify a particular ABI for the stack probe + // function, so we arbitrarily define it to not adjust %esp/%rsp itself. + BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP) + .addReg(SP) + .addReg(AX); } if (InProlog) { @@ -978,7 +972,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + bool UseRedZone = false; + bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. @@ -1007,6 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. + !UseStackProbe && // No stack probes. !IsWin64CC && // Win64 has no Red Zone !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack @@ -1015,6 +1011,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); + UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only @@ -1192,6 +1189,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); + // (With the red zone in use the frame is shrunk by up to 128 bytes and + // data may live below %rsp, which the probing sequence does not account + // for, so the two must never combine.) + + // Check whether EAX is livein for this block.
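// (Illustrative note: the probe sequence passes the allocation size in // EAX/RAX, so a live-in value has to be saved and restored around it.)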
bool isEAXAlive = isEAXLiveIn(MBB); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 2a1633de0a239..3c4589ab18f6f 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -204,6 +204,11 @@ namespace { bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + template <class GatherScatterSDNode> + bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -1415,13 +1420,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { - - MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); - if (!Mgs) - return false; +template <class GatherScatterSDNode> +bool X86DAGToDAGISel::selectAddrOfGatherScatterNode( + GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. @@ -1453,6 +1455,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent)) + return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>( + Mgs, N, Base, Scale, Index, Disp, Segment); + if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent)) + return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>( + X86Gather, N, Base, Scale, Index, Disp, Segment); + return false; +} + /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 172eba0002d4f..f777e56289884 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; + + // TODO: These control memcmp expansion in CGP and are set low to prevent + // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. + MaxLoadsPerMemcmp = 1; + MaxLoadsPerMemcmpOptSize = 1; + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -14272,9 +14278,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If we are inserting an element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. - // TODO: pre-SSE41 targets will tend to use bit masking - this could still - // be beneficial if we are inserting several zeros and can combine the masks.
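// For example (illustrative indices): inserting zero into lane 2 of a // v4i32 produces BlendMask <0, 1, 6, 3>, taking lane 2 from an all-zeros // vector.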
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && + 16 <= EltVT.getSizeInBits()) { SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); @@ -17621,23 +17626,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && - (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { - SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, - DAG.getConstant(0, DL, - CmpOp0.getValueType()), - CmpOp0); - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - return Res; - } + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); @@ -18648,8 +18651,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); + bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack; + SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. @@ -23705,6 +23709,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } + if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + // There is a special case when the return type v2i32 is illegal and + // the type legalizer extended it to v2i64. Without this conversion we end up + // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. + // In order to avoid this situation, we'll build an X86 specific Gather node + // with index v2i64 and value type v4i32. + assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && + "Unexpected type in masked gather"); + Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Src0), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); + // The mask should match the destination type. Extending the mask with + // zeroes is not necessary since the instruction itself reads only two + // values from memory.
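+ // For example (illustrative lanes): a v2i1 mask <m0, m1> becomes the v4i1 + // mask <m0, m1, undef, undef>; the upper lanes are don't-care because only + // two elements are loaded.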
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended the mask and index to 4-element vectors
+ // in order to match the requirements of the common gather node - the
+ // same vector width for index and value. The X86 gather node allows a
+ // vector-width mismatch in order to select a more optimal instruction
+ // in the end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+
+ }
 return Op;
}
@@ -24508,6 +24563,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
 case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
 }
 return nullptr;
}
@@ -29868,7 +29924,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
 if (N->getValueType(0) == MVT::i32)
 Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
 if (Diff < 10) {
 switch ((unsigned char)Diff) {
 default:
@@ -29880,12 +29936,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
 case 5: // result = lea base(cond, cond*4)
 case 8: // result = lea base( , cond*8)
 case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
 break;
 }
 }
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
 if (NeedsCondInvert) // Invert the condition if needed.
 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -34841,23 +34897,56 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
 !Cmp.getOperand(0).getValueType().isInteger())
 return SDValue();
- // (cmp Z, 1) sets the carry flag if Z is 0.
 SDValue Z = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
- DAG.getConstant(1, DL, Z.getValueType()));
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required
+ // in the general case below.
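The carry-flag identities that the "fake operands" comments here describe can be modeled directly in portable code. A small sketch, using plain unsigned arithmetic to stand in for the NEG/CMP/SBB flag behavior:

#include <cassert>
#include <cstdint>

// Model of the x86 idiom used above: NEG sets CF iff the operand is nonzero,
// and SBB of a register with itself yields 0 - CF, i.e. 0 or -1.
static uint32_t negSbbAllOnesIfNonZero(uint32_t Z) {
  uint32_t Carry = (Z != 0); // CF after "neg Z" (borrow out of 0 - Z)
  return 0u - Carry;         // "sbb %eax, %eax" -> 0 or 0xFFFFFFFF
}

int main() {
  // 0 - (Z != 0): all-ones when Z != 0, zero when Z == 0.
  assert(negSbbAllOnesIfNonZero(0) == 0u);
  assert(negSbbAllOnesIfNonZero(42) == 0xFFFFFFFFu);

  // The complementary form uses "cmp Z, 1", whose borrow (CF) is set iff Z == 0.
  const uint32_t Zs[] = {0u, 1u, 7u};
  for (uint32_t Z : Zs) {
    uint32_t CarryCmp = (Z < 1); // CF after "cmp Z, 1"
    assert((0u - CarryCmp) == (Z == 0 ? 0xFFFFFFFFu : 0u));
  }
  return 0;
}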
+ if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
- SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
 if (CC == X86::COND_NE)
 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), NewCmp);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), NewCmp);
+ DAG.getConstant(0, DL, VT), Cmp1);
}
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
@@ -34976,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ?
ISD::SUB : ISD::ADD; + return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); @@ -34995,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -35028,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -36335,3 +36456,22 @@ void X86TargetLowering::insertCopiesSplitCSR( bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } + +/// Returns the name of the symbol used to emit stack probes or the empty +/// string if not applicable. +StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // If the function specifically requests stack probes, emit them. + if (MF.getFunction()->hasFnAttribute("probe-stack")) + return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); + + // Generally, if we aren't on Windows, the platform ABI does not include + // support for stack probes, so don't emit them. + if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) + return ""; + + // We need a stack probe to conform to the Windows ABI. Choose the right + // symbol. + if (Subtarget.is64Bit()) + return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; + return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index f51b6641db2fb..e1ade92979dc0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -615,7 +615,10 @@ namespace llvm { // Vector truncating store with unsigned/signed saturation VTRUNCSTOREUS, VTRUNCSTORES, // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES + VMTRUNCSTOREUS, VMTRUNCSTORES, + + // X86 specific gather + MGATHER // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all @@ -1056,6 +1059,8 @@ namespace llvm { bool supportSwiftError() const override; + StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } /// \brief Lower interleaved load(s) into target specific @@ -1065,6 +1070,12 @@ namespace llvm { ArrayRef<unsigned> Indices, unsigned Factor) const override; + /// \brief Lower interleaved store(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const override; + + void finalizeLowering(MachineFunction &MF) const override; protected: @@ -1397,6 +1408,19 @@ namespace llvm { } }; + // X86 specific Gather node. 
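A minimal sketch of the LLVM-style RTTI convention the new node below follows: each node kind stores an opcode, and a static classof is what lets dyn_cast/isa recognize it. The types here are illustrative stand-ins, not LLVM's:

#include <cassert>

struct Node {
  unsigned Opcode;
  explicit Node(unsigned Op) : Opcode(Op) {}
};

enum { GenericGatherOp = 1, X86GatherOp = 2 };

struct X86GatherNode : Node {
  X86GatherNode() : Node(X86GatherOp) {}
  // dyn_cast<X86GatherNode>(N) succeeds exactly when this returns true,
  // mirroring X86MaskedGatherSDNode::classof checking X86ISD::MGATHER.
  static bool classof(const Node *N) { return N->Opcode == X86GatherOp; }
};

int main() {
  X86GatherNode G;
  Node Plain(GenericGatherOp);
  assert(X86GatherNode::classof(&G));
  assert(!X86GatherNode::classof(&Plain));
  return 0;
}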
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode { + public: + X86MaskedGatherSDNode(unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO) + {} + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER; + } + }; + } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2620679df2517..01a70323224c3 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -7265,13 +7265,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { let Predicates = [HasAVX512] in { def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>; def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>; def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>; def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; @@ -7281,13 +7281,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x1))), _.FRC)>; + addr:$src, (i32 0x9))), _.FRC)>; def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x2))), _.FRC)>; + addr:$src, (i32 0xa))), _.FRC)>; def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x3))), _.FRC)>; + addr:$src, (i32 0xb))), _.FRC)>; def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), addr:$src, (i32 0x4))), _.FRC)>; @@ -7869,7 +7869,7 @@ let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, vx128xmem, mgatherv4i32>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx64xmem, mgatherv2i64>, EVEX_V128; + vx64xmem, X86mgatherv2i64>, EVEX_V128; } } @@ -8471,26 +8471,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, } let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>; def : Pat<(v16f32 (frint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>; def : Pat<(v8f64 (ffloor VR512:$src)), - 
(VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>; def : Pat<(v8f64 (fnearbyint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>; def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; + (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>; } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index c28b35b22977a..8b5bbf24f6f63 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -641,7 +641,7 @@ def sdmem : Operand<v2f64> { // SSE pattern fragments //===----------------------------------------------------------------------===// -// Vector load wrappers to prevent folding of non-temporal aligned loads on +// Vector load wrappers to prevent folding of non-temporal aligned loads on // supporting targets. def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() || @@ -754,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// These are needed to match a scalar memop that is used in a vector-only -// math instruction such as the FP logical ops: andps, andnps, orps, xorps. -// The memory operand is required to be a 128-bit load, so it must be converted -// from a vector to a scalar. -def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; -def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; - - // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. 
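Returning to the VRNDSCALE pattern changes above (0x1 to 0x9, 0x2 to 0xA, 0x3 to 0xB): per the immediate encoding, the low two bits select the rounding mode and bit 3 appears to suppress the precision (inexact) exception, so the new immediates keep the same modes while silencing inexact. A tiny sketch of that reading; the helper name is made up:

#include <cassert>

// Illustrative encoding of the rounding immediate as used above.
// Bits [1:0]: rounding control (00 nearest, 01 down/floor, 10 up/ceil,
// 11 truncate). Bit 3: suppress the precision (inexact) exception.
enum RoundMode { Floor = 0x1, Ceil = 0x2, Trunc = 0x3 };

static unsigned rndscaleImm(RoundMode M, bool SuppressInexact) {
  return unsigned(M) | (SuppressInexact ? 0x8u : 0x0u);
}

int main() {
  // The patterns above switch from 0x1/0x2/0x3 to 0x9/0xA/0xB, i.e. the same
  // rounding modes with the inexact exception suppressed.
  assert(rndscaleImm(Floor, true) == 0x9);
  assert(rndscaleImm(Ceil, true) == 0xA);
  assert(rndscaleImm(Trunc, true) == 0xB);
  return 0;
}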
// FIXME: 8 byte alignment for mmx reads is not required @@ -773,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; +def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -796,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), Mgt->getBasePtr().getValueType() == MVT::v2i64); return false; }]>; +def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64) && + (Mgt->getMemoryVT() == MVT::v2i32 || + Mgt->getMemoryVT() == MVT::v2f32); + return false; +}]>; def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8490b972eb5c1..fe87bbd994738 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1744,7 +1744,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; } -def : Pat<(f32 (fpround FR64:$src)), +def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, Requires<[UseAVX]>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 77dead8d24137..f98c2a7e802dd 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -72,9 +72,24 @@ private: MachineFunction &MF) const; bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; - bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + + // emit insert subreg instruction and insert it before MachineInstr &I + bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + // emit extract subreg instruction and insert it before MachineInstr &I + bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, + MachineRegisterInfo &MRI, MachineFunction &MF) const; + + const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const; + const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const; const X86TargetMachine &TM; const X86Subtarget &STI; @@ -113,8 +128,8 @@ X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. 
-static const TargetRegisterClass * -getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const { if (RB.getID() == X86::GPRRegBankID) { if (Ty.getSizeInBits() <= 8) return &X86::GR8RegClass; @@ -127,13 +142,13 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { } if (RB.getID() == X86::VECRRegBankID) { if (Ty.getSizeInBits() == 32) - return &X86::FR32XRegClass; + return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; if (Ty.getSizeInBits() == 64) - return &X86::FR64XRegClass; + return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; if (Ty.getSizeInBits() == 128) - return &X86::VR128XRegClass; + return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass; if (Ty.getSizeInBits() == 256) - return &X86::VR256XRegClass; + return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass; if (Ty.getSizeInBits() == 512) return &X86::VR512RegClass; } @@ -141,10 +156,16 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { llvm_unreachable("Unknown RegBank!"); } +const TargetRegisterClass * +X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg, + MachineRegisterInfo &MRI) const { + const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI); + return getRegClass(Ty, RegBank); +} + // Set X86 Opcode and constrain DestReg. -static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) { +bool X86InstructionSelector::selectCopy(MachineInstr &I, + MachineRegisterInfo &MRI) const { unsigned DstReg = I.getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { @@ -171,7 +192,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, switch (RegBank.getID()) { case X86::GPRRegBankID: assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values."); - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); // Change the physical register if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { @@ -186,7 +207,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, } break; case X86::VECRRegBankID: - RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + RC = getRegClass(MRI.getType(DstReg), RegBank); break; default: llvm_unreachable("Unknown RegBank!"); @@ -220,7 +241,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const { // Certain non-generic instructions also need some special handling. if (I.isCopy()) - return selectCopy(I, TII, MRI, TRI, RBI); + return selectCopy(I, MRI); // TODO: handle more cases - LOAD_STACK_GUARD, PHI return true; @@ -249,6 +270,10 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectExtract(I, MRI, MF)) + return true; + if (selectInsert(I, MRI, MF)) + return true; return false; } @@ -326,6 +351,34 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, return Opc; } +// Fill in an address from the given instruction. 
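Before the function itself, a rough model of the folding X86SelectAddress below performs: a pointer defined by a constant-offset G_GEP becomes base plus a 32-bit displacement, and everything else falls back to a bare base register. All types here are illustrative stand-ins:

#include <cassert>
#include <cstdint>

struct AddrMode {
  int BaseReg = 0;
  int32_t Disp = 0;
};

static bool fitsInInt32(int64_t V) {
  return V >= INT32_MIN && V <= INT32_MAX; // same check as isInt<32>(Imm)
}

static AddrMode selectAddress(int PtrReg, const int64_t *ConstOffset) {
  AddrMode AM;
  if (ConstOffset && fitsInInt32(*ConstOffset)) {
    AM.BaseReg = PtrReg;                   // fold base + disp into one operand
    AM.Disp = static_cast<int32_t>(*ConstOffset);
    return AM;
  }
  AM.BaseReg = PtrReg;                     // default: base register only
  return AM;
}

int main() {
  int64_t Off = 16;
  AddrMode AM = selectAddress(/*PtrReg=*/5, &Off);
  assert(AM.BaseReg == 5 && AM.Disp == 16);
  return 0;
}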
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+
+ assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+ return;
+}
+
 bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
 MachineRegisterInfo &MRI,
 MachineFunction &MF) const {
@@ -340,18 +393,28 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
 auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
 unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
 if (NewOpc == Opc)
 return false;
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
 I.setDesc(TII.get(NewOpc));
 MachineInstrBuilder MIB(MF, I);
- if (Opc == TargetOpcode::G_LOAD)
- addOffset(MIB, 0);
- else {
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
 // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
+ I.RemoveOperand(1);
 I.RemoveOperand(0);
- addOffset(MIB, 0).addUse(DefReg);
+ addFullAddress(MIB, AM).addUse(DefReg);
 }
 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -461,11 +524,11 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
 if (DstRB.getID() != X86::GPRRegBankID)
 return false;
- const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
 if (!DstRC)
 return false;
- const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
 if (!SrcRC)
 return false;
@@ -519,9 +582,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
 else
 return false;
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
 unsigned DefReg =
- MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
 BuildMI(*I.getParent(), I, I.getDebugLoc(),
 TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
@@ -656,6 +718,202 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
 return true;
}
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_EXTRACT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ // Meanwhile, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+ return false; // Not extract subvector.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
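The index arithmetic used further down in selectExtract is simple enough to state separately: G_EXTRACT carries a bit index, while the VEXTRACT* instructions take a subvector index. A minimal sketch of that conversion:

#include <cassert>

// Sketch of the immediate conversion in selectExtract: the bit index is
// divided by the destination vector's width in bits.
static int extractImm(int BitIndex, int DstBits) {
  assert(BitIndex % DstBits == 0 && "not a whole-subvector extract");
  return BitIndex / DstBits; // e.g. bit 128 of a 256-bit source -> imm 1
}

int main() {
  // Extracting the upper 128-bit half of a 256-bit vector: immediate 1.
  assert(extractImm(128, 128) == 1);
  // Bit index 0 is handled separately as a plain subregister copy.
  assert(extractImm(0, 128) == 0);
  return 0;
}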
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_INSERT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy =
MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+ // Meanwhile, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+ return false; // Not insert subvector.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
 InstructionSelector *
 llvm::createX86InstructionSelector(const X86TargetMachine &TM,
 X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 806d6cc888f0f..f0ed4bc16e2f9 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,6 +16,7 @@
 #include "X86ISelLowering.h"
 #include "X86TargetMachine.h"
+#include "llvm/Analysis/VectorUtils.h"
 using namespace llvm;
@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
 IRBuilder<> &Builder;
 /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
- /// sub vectors of type \p T. Returns true and the sub-vectors in
- /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
- bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
 SmallVectorImpl<Instruction *> &DecomposedVectors);
 /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ public:
 /// target information \p STarget.
 explicit X86InterleavedAccessGroup(Instruction *I,
 ArrayRef<ShuffleVectorInst *> Shuffs,
- ArrayRef<unsigned> Ind,
- const unsigned F,
+ ArrayRef<unsigned> Ind, const unsigned F,
 const X86Subtarget &STarget,
 IRBuilder<> &B)
 : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const {
 uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
 Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
- if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
- return false;
+ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ uint64_t ExpectedShuffleVecSize;
+ if (isa<LoadInst>(Inst))
+ ExpectedShuffleVecSize = 256;
+ else
+ ExpectedShuffleVecSize = 1024;
- // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 || + if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) return false; return true; } -bool X86InterleavedAccessGroup::decompose( +void X86InterleavedAccessGroup::decompose( Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, SmallVectorImpl<Instruction *> &DecomposedVectors) { + + assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) && + "Expected Load or Shuffle"); + Type *VecTy = VecInst->getType(); (void)VecTy; assert(VecTy->isVectorTy() && DL.getTypeSizeInBits(VecTy) >= DL.getTypeSizeInBits(SubVecTy) * NumSubVectors && "Invalid Inst-size!!!"); - assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() && - "Element type mismatched!!!"); - if (!isa<LoadInst>(VecInst)) - return false; + if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) { + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + + // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type. + for (unsigned i = 0; i < NumSubVectors; ++i) + DecomposedVectors.push_back( + cast<ShuffleVectorInst>(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0)))); + return; + } + // Decompose the load instruction. LoadInst *LI = cast<LoadInst>(VecInst); Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); - Value *VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); - // Generate N loads of T type + // Generate N loads of T type. for (unsigned i = 0; i < NumSubVectors; i++) { - // TODO: Support inbounds GEP + // TODO: Support inbounds GEP. Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); DecomposedVectors.push_back(NewLoad); } - - return true; } void X86InterleavedAccessGroup::transpose_4x4( @@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4( // instructions/intrinsics. bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { SmallVector<Instruction *, 4> DecomposedVectors; - VectorType *VecTy = Shuffles[0]->getType(); - // Try to generate target-sized register(/instruction). - if (!decompose(Inst, Factor, VecTy, DecomposedVectors)) - return false; - SmallVector<Value *, 4> TransposedVectors; - // Perform matrix-transposition in order to compute interleaved - // results by generating some sort of (optimized) target-specific - // instructions. + VectorType *ShuffleTy = Shuffles[0]->getType(); + + if (isa<LoadInst>(Inst)) { + // Try to generate target-sized register(/instruction). + decompose(Inst, Factor, ShuffleTy, DecomposedVectors); + + // Perform matrix-transposition in order to compute interleaved + // results by generating some sort of (optimized) target-specific + // instructions. + transpose_4x4(DecomposedVectors, TransposedVectors); + + // Now replace the unoptimized-interleaved-vectors with the + // transposed-interleaved vectors. + for (unsigned i = 0, e = Shuffles.size(); i < e; ++i) + Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + + return true; + } + + Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); + unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor; + + // Lower the interleaved stores: + // 1. Decompose the interleaved wide shuffle into individual shuffle + // vectors. 
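A compact model of what the numbered store-path steps here add up to: four de-interleaved 4-element vectors are recombined into one wide interleaved vector before a single wide store. Plain arrays stand in for the shuffle-based transpose_4x4:

#include <cassert>

int main() {
  int V[4][4]; // V[j] holds elements j, j+4, j+8, j+12
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 4; ++i)
      V[j][i] = j + 4 * i; // the de-interleaved ("transposed") form

  int Wide[16]; // the interleaved wide vector to store
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      Wide[4 * i + j] = V[j][i];

  for (int k = 0; k < 16; ++k)
    assert(Wide[k] == k); // elements come back out in memory order
  return 0;
}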
+ decompose(Shuffles[0], Factor, + VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); + + // 2. Transpose the interleaved-vectors into vectors of contiguous + // elements. transpose_4x4(DecomposedVectors, TransposedVectors); - // Now replace the unoptimized-interleaved-vectors with the - // transposed-interleaved vectors. - for (unsigned i = 0; i < Shuffles.size(); i++) - Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + // 3. Concatenate the contiguous-vectors back into a wide vector. + Value *WideVec = concatenateVectors(Builder, TransposedVectors); + + // 4. Generate a store instruction for wide-vec. + StoreInst *SI = cast<StoreInst>(Inst); + Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), + SI->getAlignment()); return true; } @@ -220,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } + +bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + assert(SVI->getType()->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + // Holds the indices of SVI that correspond to the starting index of each + // interleaved shuffle. + SmallVector<unsigned, 4> Indices; + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) + Indices.push_back(Mask[i]); + + ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI); + + // Create an interleaved access group. + IRBuilder<> Builder(SI); + X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget, + Builder); + + return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); +} diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bc73bb1ae8c51..6b1add8ff8ed1 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -510,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, @@ -524,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, X86ISD::CMPM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, 
CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, @@ -1171,18 +1159,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 979aaee110aa4..a584eabcc1b28 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -214,12 +214,24 @@ void X86LegalizerInfo::setLegalizerInfoAVX() { if (!Subtarget.hasAVX()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); const LLT v8s32 = LLT::vector(8, 32); const LLT v4s64 = LLT::vector(4, 64); for (unsigned MemOp : {G_LOAD, G_STORE}) for (auto Ty : {v8s32, v4s64}) setAction({MemOp, Ty}, Legal); + + for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -243,6 +255,18 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { if (!Subtarget.hasAVX512()) return; + const LLT v16s8 = LLT::vector(16, 8); + const LLT v8s16 = LLT::vector(8, 16); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + const LLT v64s8 = LLT::vector(64, 8); + const LLT v32s16 = LLT::vector(32, 16); const LLT v16s32 = LLT::vector(16, 32); const LLT v8s64 = LLT::vector(8, 64); @@ -256,13 +280,15 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { for (auto Ty : {v16s32, v8s64}) setAction({MemOp, Ty}, Legal); + for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) + setAction({G_INSERT, Ty}, Legal); + for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) + setAction({G_INSERT, 1, Ty}, Legal); + /************ VLX *******************/ 
if (!Subtarget.hasVLX()) return; - const LLT v4s32 = LLT::vector(4, 32); - const LLT v8s32 = LLT::vector(8, 32); - for (auto Ty : {v4s32, v8s32}) setAction({G_MUL, Ty}, Legal); } diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index dd21e2b7c4a13..8fdf10617059a 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -2,39 +2,31 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 implementation of the DAG scheduling mutation to -// pair instructions back to back. +/// \file This file contains the X86 implementation of the DAG scheduling +/// mutation to pair instructions back to back. // //===----------------------------------------------------------------------===// #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" - -#define DEBUG_TYPE "misched" - -STATISTIC(NumFused, "Number of instr pairs fused"); +#include "llvm/CodeGen/MacroFusion.h" using namespace llvm; -static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, - cl::desc("Enable scheduling for macro fusion."), cl::init(true)); - -namespace { - -/// \brief Verify that the instruction pair, First and Second, -/// should be scheduled back to back. If either instruction is unspecified, -/// then verify that the other instruction may be part of a pair at all. -static bool shouldScheduleAdjacent(const X86Subtarget &ST, - const MachineInstr *First, - const MachineInstr *Second) { +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -47,13 +39,10 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, FuseInc } FuseKind; - assert((First || Second) && "At least one instr must be specified"); - unsigned FirstOpcode = First - ? First->getOpcode() + unsigned FirstOpcode = FirstMI + ? FirstMI->getOpcode() : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = Second - ? Second->getOpcode() - : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); switch (SecondOpcode) { default: @@ -203,69 +192,11 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST, } } -/// \brief Post-process the DAG to create cluster edges between instructions -/// that may be fused by the processor into a single operation. 
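The shouldScheduleAdjacent predicate above has a deliberately asymmetric contract: the second instruction of a candidate pair is always known, while the first may be absent. A small sketch of that shape with made-up opcodes, not X86's actual fusion table:

#include <cassert>

enum Opcode { Cmp, Test, Add, Jcc, Other };

// Given the second instruction (typically the branch), and optionally the
// first, decide whether the pair may be macro-fused.
static bool shouldFuse(const Opcode *First, Opcode Second) {
  if (Second != Jcc)
    return false;   // only fuse with a conditional branch
  if (!First)
    return true;    // "may Second be part of a fused pair at all?"
  switch (*First) {
  case Cmp:
  case Test:
  case Add:
    return true;    // flag-setting ops fuse with the branch
  default:
    return false;
  }
}

int main() {
  Opcode C = Cmp, M = Other;
  assert(shouldFuse(nullptr, Jcc)); // a branch can anchor a pair
  assert(shouldFuse(&C, Jcc));      // cmp+jcc fuses
  assert(!shouldFuse(&M, Jcc));     // an arbitrary op does not
  return 0;
}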
-class X86MacroFusion : public ScheduleDAGMutation { -public: - X86MacroFusion() {} - - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { - ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); - const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>(); - - // For now, assume targets can only fuse with the branch. - SUnit &ExitSU = DAG->ExitSU; - MachineInstr *Branch = ExitSU.getInstr(); - if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) - return; - - for (SDep &PredDep : ExitSU.Preds) { - if (PredDep.isWeak()) - continue; - SUnit &SU = *PredDep.getSUnit(); - MachineInstr &Pred = *SU.getInstr(); - if (!shouldScheduleAdjacent(ST, &Pred, Branch)) - continue; - - // Create a single weak edge from SU to ExitSU. The only effect is to cause - // bottom-up scheduling to heavily prioritize the clustered SU. There is no - // need to copy predecessor edges from ExitSU to SU, since top-down - // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling - // of SU, we could create an artificial edge from the deepest root, but it - // hasn't been needed yet. - bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); - (void)Success; - assert(Success && "No DAG nodes should be reachable from ExitSU"); - - // Adjust latency of data deps between the nodes. - for (SDep &PredDep : ExitSU.Preds) - if (PredDep.getSUnit() == &SU) - PredDep.setLatency(0); - for (SDep &SuccDep : SU.Succs) - if (SuccDep.getSUnit() == &ExitSU) - SuccDep.setLatency(0); - - ++NumFused; - DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; - SU.print(dbgs(), DAG); - dbgs() << " - ExitSU" - << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " - << DAG->TII->getName(Branch->getOpcode()) << '\n';); - - break; - } -} - -} // end namespace - namespace llvm { std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation () { - return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr; + return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent); } } // end namespace llvm diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h index e630f802e8e63..13fa2d78a0185 100644 --- a/lib/Target/X86/X86MacroFusion.h +++ b/lib/Target/X86/X86MacroFusion.h @@ -2,23 +2,18 @@ // // The LLVM Compiler Infrastructure // -// \file This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // -// This file contains the X86 definition of the DAG scheduling mutation to pair -// instructions back to back. +/// \file This file contains the X86 definition of the DAG scheduling mutation +/// to pair instructions back to back. // //===----------------------------------------------------------------------===// -#include "X86InstrInfo.h" #include "llvm/CodeGen/MachineScheduler.h" -//===----------------------------------------------------------------------===// -// X86MacroFusion - DAG post-processing to encourage fusion of macro ops. 
-//===----------------------------------------------------------------------===// - namespace llvm { /// Note that you have to add: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 278b57eb00b74..a9f42cacf7886 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -91,6 +91,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return llvm::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSSolaris()) + return llvm::make_unique<X86SolarisTargetObjectFile>(); if (TT.isOSFuchsia()) return llvm::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 4fd95717478e9..8627c06d44313 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(TM.Options.UseInitArray); } +void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference( const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 39d2e84e5ed77..f6aa570b6332a 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -66,6 +66,11 @@ namespace llvm { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; + /// \brief This implementation is used for Solaris on x86/x86-64. + class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// \brief This implementation is used for Windows targets on x86 and x86-64. class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 11ba7025e1b73..5ba8534d32d33 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2178,17 +2178,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } -bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2) { - // X86 specific here are "instruction number 1st priority". - return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, - C1.NumIVMuls, C1.NumBaseAdds, - C1.ScaleCost, C1.ImmCost, C1.SetupCost) < - std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, - C2.NumIVMuls, C2.NumBaseAdds, - C2.ScaleCost, C2.ImmCost, C2.SetupCost); -} - bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa<PointerType>(ScalarTy) ? @@ -2243,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + // TODO: We can increase these based on available vector ops. + MaxLoadSize = ST->is64Bit() ? 
8 : 4;
+ return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
 // TODO: We expect this to be beneficial regardless of arch,
 // but there are currently some unexplained performance artifacts on Atom.
@@ -2250,6 +2245,114 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
 return !(ST->isAtom());
}
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps) required
+ // to load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; The cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The Cost accounts only for the shuffle sequence;
+ // The cost of the loads/stores is accounted for separately.
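A worked instance of the cost formula used here, taking the {3, MVT::v16i8, 18} entry from the load table that follows; the per-memory-op cost is an assumed value, since the real one comes from getMemoryOpCost:

#include <cassert>

int main() {
  unsigned VecTySize = 48;   // <48 x i8>: VF=16, Factor=3
  unsigned LegalVTSize = 16; // a 128-bit legal vector type
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // = 3
  assert(NumOfMemOps == 3);

  unsigned MemOpCost = 1;    // assumption: one unit per legal-type load
  unsigned ShuffleCost = 18; // table: deinterleave 3 x v16i8 after the loads
  unsigned Total = NumOfMemOps * MemOpCost + ShuffleCost;
  assert(Total == 21);
  return 0;
}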
+ // + static const CostTblEntry AVX2InterleavedLoadTbl[] = { + { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 + { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 + { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 + { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 + + { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 + { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 + { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 + { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 + { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + }; + + static const CostTblEntry AVX2InterleavedStoreTbl[] = { + { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) + { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) + { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) + { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) + { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + + { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) + { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) + { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + }; + + if (Opcode == Instruction::Load) { + if (const auto *Entry = + CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } else { + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + if (const auto *Entry = + CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) + return NumOfMemOps * MemOpCost + Entry->Cost; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} + // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. 
@@ -2358,6 +2461,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); + if (ST->hasAVX2()) + return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 09ce2c90498d9..ad0a0a2113012 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -93,6 +93,9 @@ public: int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); int getIntImmCost(int64_t); @@ -101,15 +104,13 @@ public: int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty); - bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, - TargetTransformInfo::LSRCost &C2); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, |
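Finally, a sketch of the kind of expansion the new expandMemCmp hook enables: with MaxLoadSize = 8 on 64-bit targets, a 16-byte equality memcmp becomes two 8-byte loads per operand and a fused compare. Only the equality form is modeled here; producing memcmp's ordered result would additionally need byte-swapped comparisons:

#include <cassert>
#include <cstdint>
#include <cstring>

static bool equal16(const void *A, const void *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8);                                 // load chunk 0
  std::memcpy(&B0, B, 8);
  std::memcpy(&A1, static_cast<const char *>(A) + 8, 8);  // load chunk 1
  std::memcpy(&B1, static_cast<const char *>(B) + 8, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;                    // fuse both compares
}

int main() {
  char X[16] = "0123456789abcde";
  char Y[16] = "0123456789abcde";
  assert(equal16(X, Y) && (std::memcmp(X, Y, 16) == 0));
  Y[12] = '!';
  assert(!equal16(X, Y) && (std::memcmp(X, Y, 16) != 0));
  return 0;
}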