Diffstat (limited to 'lib/Target')
145 files changed, 6415 insertions, 1672 deletions
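The largest single addition below is a new Falkor-specific pass, AArch64FalkorHWPFFix.cpp, which folds each load's destination register, base register, and offset into a small hardware-prefetcher tag and then rewrites the base register of strided loads whose tags collide. A minimal standalone sketch of that tag computation follows (the bit layout mirrors the makeTag() helper added below; the register and offset encodings in the example are invented for illustration, not real AArch64 encodings):

#include <cstdio>

// Mirrors makeTag() from AArch64FalkorHWPFFix.cpp: the prefetcher tag is built
// from the low 4 bits of the destination register encoding, the low 4 bits of
// the base register encoding, and the low 6 bits of the offset.
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

int main() {
  // Invented encodings for two loads in the same inner loop.
  unsigned TagA = makeTag(/*Dest=*/1, /*Base=*/2, /*Offset=*/0);
  unsigned TagB = makeTag(/*Dest=*/17, /*Base=*/18, /*Offset=*/64);
  // 1 vs 17 and 2 vs 18 agree in their low 4 bits, and 0 vs 64 agree in their
  // low 6 bits, so both loads hash to the same tag; the late pass breaks such
  // collisions by copying the base into a scratch register with a free tag.
  std::printf("TagA=0x%x TagB=0x%x collide=%d\n", TagA, TagB, TagA == TagB);
  return 0;
}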
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 37b9690d0434a..1dda746a6be1e 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -44,6 +44,8 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -66,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 53eef79c4df3f..436bf1193304c 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -50,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -269,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 938779d23690d..291bc5ea858e3 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f64], CCBitConvertToType<i64>>, + CCDelegateTo<CC_AArch64_AAPCS> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index ee54550c9900b..b72f23b109d94 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -102,6 +102,10 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( case AArch64::LDADDALh: case AArch64::LDADDALs: case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: case AArch64::LDEORALb: case AArch64::LDEORALh: case AArch64::LDEORALs: diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 0000000000000..c0e22355a9ff6 --- /dev/null +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW 
prefetcher pitfalls on Falkor--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and re-write loads to prevent unintentional tag collisions. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of registers"); + +namespace { + +class FalkorMarkStridedAccesses { public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const AArch64Subtarget *ST = + TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction &I : *BB) { + LoadInst *LoadI = dyn_cast<LoadInst>(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: + case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case 
AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: + case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + 
case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = 
true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional<unsigned> getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. + TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional<LoadInfo> LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional<unsigned> Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. + if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. 
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. + TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 3682b62d2b84d..97396057dce07 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5138,6 +5138,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. (void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index e96ee7d29b3e8..4907d082eda09 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) @@ -950,7 +954,13 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); + + unsigned GPRSaveSize = AFI->getVarArgsGPRSize(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + if (IsWin64) + Offset -= alignTo(GPRSaveSize, 16); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 04687847c1a30..06005f6b68861 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -239,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 60fde5caa3393..c6150f9e5d1d7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2650,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2668,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -2824,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. 
saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2869,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector<SDValue, 8> MemOps; @@ -2881,7 +2889,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2890,7 +2901,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2899,7 +2914,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -4491,6 +4506,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4562,8 +4592,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4571,7 +4607,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); @@ -7451,6 +7488,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { @@ -10567,9 +10612,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Currently leaving And and Sub to LLSC - if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) - return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } @@ -10783,7 +10825,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { - if (Subtarget->isTargetDarwin()) + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb288d..3b0e0f1de8946 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,19 @@ public: bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. 
+ + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } + bool isCheapToSpeculateCttz() const override { return true; } @@ -455,6 +468,8 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -541,6 +556,7 @@ private: SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index de283b70210fe..eec41ddbc159f 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -451,3 +451,13 @@ def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)> def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index dba3e4bdf82f1..c0c6055c358f7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -52,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - MachineMemOperand::MOTargetFlag1; - static cl::opt<unsigned> TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -1715,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all MachineMemOperands for a hint that the load/store is strided. 
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = - {{MOSuppressPair, "aarch64-suppress-pair"}}; + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 0809ede4df2a5..1765a0263ea44 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ public: /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. + bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. bool isUnscaledLdSt(unsigned Opc) const; @@ -356,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. 
/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0be14673eb20b..0dcf07f984124 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -37,6 +37,8 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4a0a7c36baf8b..ffb27834c31c6 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -82,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({Op, 1, s1}, Legal); } - for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index fab92e139dd09..9f7dcb3fe1c3f 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -167,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a3238cf3b60f0..ea61124527365 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -134,7 +134,9 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: PrefFunctionAlignment = 4; break; - case CortexA73: break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } @@ -171,7 +173,8 @@ struct AArch64GISelActualAccessor : public GISelAccessor { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this), GISel() { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index db53946cbc77f..5a1f45ee25528 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ protected: bool HasFullFP16 = false; bool HasSPE = false; bool HasLSLFast = false; + bool HasSVE = false; // HasZeroCycleRegMove 
- Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -251,6 +252,7 @@ public: bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -304,6 +306,17 @@ public: bool enableEarlyIfConversion() const override; std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6237b8f3e7b9b..ba28c01a2effb 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -138,6 +138,9 @@ static cl::opt<int> EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(-1)); +static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -158,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -182,7 +187,7 @@ static std::string computeDataLayout(const Triple &TT, if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) - return "e-m:w-i64:64-i128:128-n32:64-S128"; + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -346,8 +351,12 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); @@ -478,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. 
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorHWPFFixPass()); + } } void AArch64PassConfig::addPreEmitPass() { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index fefa7e26b79f6..85de02e859e0c 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,8 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some, targets is + // deprecated and should not be used. const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e841fb8945191..a79d518205450 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -86,7 +86,7 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands); bool parseDirectiveArch(SMLoc L); bool parseDirectiveCPU(SMLoc L); @@ -3257,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { +std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, + OperandVector &Operands) { switch (ErrCode) { case Match_MissingFeature: return Error(Loc, @@ -3380,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + std::string Suggestion = AArch64MnemonicSpellCheck( + ((AArch64Operand &)*Operands[0]).getToken(), + ComputeAvailableFeatures(STI->getFeatureBits())); + return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); + } default: llvm_unreachable("unexpected error code!"); } @@ -3707,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, Msg); } case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); + return showMatchError(IDLoc, MatchResult, Operands); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -3726,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } case Match_InvalidMemoryIndexed1: case Match_InvalidMemoryIndexed2: @@ -3784,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } } diff --git 
a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 02b12b5e90ca2..f7e0a5c7bed39 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp + AArch64FalkorHWPFFix.cpp AArch64FastISel.cpp AArch64A53Fix835769.cpp AArch64FrameLowering.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a7a7daf4b4a55..2bd0cbf9f7c6a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -218,6 +220,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 7862a03e771c1..31762b9e4cd50 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -27,8 +27,7 @@ namespace { class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: AArch64WinCOFFObjectWriter() - : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { - } + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} ~AArch64WinCOFFObjectWriter() override = default; @@ -36,19 +35,59 @@ public: const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; - bool recordRelocation(const MCFixup &) const override; + bool recordRelocation(const MCFixup &) const override; }; } // end anonymous namespace -unsigned -AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, - const MCValue &Target, - const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend &MAB) const { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + + switch (static_cast<unsigned>(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + + case FK_Data_4: + switch (Modifier) { + default: + return COFF::IMAGE_REL_ARM64_ADDR32; + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM64_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM64_SECREL; + } + + case FK_Data_8: + return COFF::IMAGE_REL_ARM64_ADDR64; + + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM64_SECTION; + + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM64_SECREL; + + case AArch64::fixup_aarch64_add_imm12: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A; + + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; + + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; + + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + return COFF::IMAGE_REL_ARM64_BRANCH26; + } } bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 5a799b2d88d0d..568682899be51 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -56,7 +56,7 @@ extern char &AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 7235d8fae3327..c68e5861ff25b 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -15,8 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -26,26 +28,27 @@ using namespace llvm; namespace { -class AMDGPUAnnotateKernelFeatures : public ModulePass { +class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: + const TargetMachine *TM = nullptr; AMDGPUAS AS; - static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS); - void addAttrToCallers(Function *Intrin, StringRef AttrName); - bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + bool addFeatureAttributes(Function &F); public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {} - bool runOnModule(Module &M) override; + AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} + + bool doInitialization(CallGraph &CG) override; + bool runOnSCC(CallGraphSCC &SCC) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - ModulePass::getAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); } static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS 
AS); @@ -121,16 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( return false; } -// Return true if an addrspacecast is used that requires the queue ptr. -bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, - AMDGPUAS AS) { +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. +static StringRef intrinsicToAttrName(Intrinsic::ID ID, + bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool handleAttr(Function &Parent, const Function &Callee, + StringRef Name) { + if (Callee.hasFnAttribute(Name)) { + Parent.addFnAttr(Name); + return true; + } + + return false; +} + +static void copyFeaturesToFunction(Function &Parent, const Function &Callee, + bool &NeedQueuePtr) { + // X ids unnecessarily propagated to kernels. + static const StringRef AttrNames[] = { + { "amdgpu-work-item-id-x" }, + { "amdgpu-work-item-id-y" }, + { "amdgpu-work-item-id-z" }, + { "amdgpu-work-group-id-x" }, + { "amdgpu-work-group-id-y" }, + { "amdgpu-work-group-id-z" }, + { "amdgpu-dispatch-ptr" }, + { "amdgpu-dispatch-id" }, + { "amdgpu-kernarg-segment-ptr" } + }; + + if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) + NeedQueuePtr = true; + + for (StringRef AttrName : AttrNames) + handleAttr(Parent, Callee, AttrName); +} + +bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + bool HasFlat = ST.hasFlatAddressSpace(); + bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + bool Changed = false; + bool NeedQueuePtr = false; + bool HaveCall = false; + bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (CS) { + Function *Callee = CS.getCalledFunction(); + + // TODO: Do something with indirect calls. 
+ if (!Callee) { + if (!CS.isInlineAsm()) + HaveCall = true; + continue; + } + + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID == Intrinsic::not_intrinsic) { + HaveCall = true; + copyFeaturesToFunction(F, *Callee, NeedQueuePtr); + Changed = true; + } else { + bool NonKernelOnly = false; + StringRef AttrName = intrinsicToAttrName(IID, + NonKernelOnly, NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } + } + } + + if (NeedQueuePtr || HasApertureRegs) + continue; + if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC, AS)) - return true; + if (castRequiresQueuePtr(ASC, AS)) { + NeedQueuePtr = true; + continue; + } } for (const Use &U : I.operands()) { @@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) - return true; + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + NeedQueuePtr = true; + break; + } } } } - return false; -} - -void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, - StringRef AttrName) { - SmallPtrSet<Function *, 4> SeenFuncs; - - for (User *U : Intrin->users()) { - // CallInst is the only valid user for an intrinsic. - CallInst *CI = cast<CallInst>(U); - - Function *CallingFunction = CI->getParent()->getParent(); - if (SeenFuncs.insert(CallingFunction).second) - CallingFunction->addFnAttr(AttrName); + if (NeedQueuePtr) { + F.addFnAttr("amdgpu-queue-ptr"); + Changed = true; } -} - -bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( - Module &M, - ArrayRef<StringRef[2]> IntrinsicToAttr) { - bool Changed = false; - for (const StringRef *Arr : IntrinsicToAttr) { - if (Function *Fn = M.getFunction(Arr[0])) { - addAttrToCallers(Fn, Arr[1]); - Changed = true; - } + // TODO: We could refine this to captured pointers that could possibly be + // accessed by flat instructions. For now this is mostly a poor way of + // estimating whether there are calls before argument lowering. + if (HasFlat && !IsFunc && HaveCall) { + F.addFnAttr("amdgpu-flat-scratch"); + Changed = true; } return Changed; } -bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { +bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { + Module &M = SCC.getCallGraph().getModule(); Triple TT(M.getTargetTriple()); - AS = AMDGPU::getAMDGPUAS(M); - - static const StringRef IntrinsicToAttr[][2] = { - // .x omitted - { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, - { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, - - { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, - { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, - - { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, - { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, - - // .x omitted - { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, - { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; - static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }, - { "llvm.trap", "amdgpu-queue-ptr" }, - { "llvm.debugtrap", "amdgpu-queue-ptr" } - }; - - // TODO: We should not add the attributes if the known compile time workgroup - // size is 1 for y/z. - - // TODO: Intrinsics that require queue ptr. 
+ bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; - // We do not need to note the x workitem or workgroup id because they are - // always initialized. + Changed |= addFeatureAttributes(*F); + } - bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { - Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); - for (Function &F : M) { - if (F.hasFnAttribute("amdgpu-queue-ptr")) - continue; + return Changed; +} - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>() - .getSubtarget<AMDGPUSubtarget>(F) - .hasApertureRegs(); - if (!HasApertureRegs && hasAddrSpaceCast(F, AS)) - F.addFnAttr("amdgpu-queue-ptr"); - } - } +bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + report_fatal_error("TargetMachine is required"); - return Changed; + AS = AMDGPU::getAMDGPUAS(CG.getModule()); + TM = &TPC->getTM<TargetMachine>(); + return false; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { +Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { return new AMDGPUAnnotateKernelFeatures(); } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 83ad1a5c6ee30..2247814cfe55d 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -268,20 +268,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF)); - OutStreamer->emitRawComment(" codeLenInByte = " + - Twine(getFunctionCodeSize(MF)), false); - OutStreamer->emitRawComment( - " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false); - OutStreamer->emitRawComment( - " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false); - OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); OutStreamer->emitRawComment( " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); OutStreamer->emitRawComment( - " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false); - OutStreamer->emitRawComment( " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + " bytes/workgroup (compile time only)", false); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2553cf4da0feb..258b1737deb38 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FABS); + setTargetDAGCombine(ISD::AssertZext); + setTargetDAGCombine(ISD::AssertSext); } //===----------------------------------------------------------------------===// @@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting /// input values across multiple registers. Each item in the Ins array -/// represents a single value that will be stored in regsters. Ins[x].VT is +/// represents a single value that will be stored in registers. Ins[x].VT is /// the value type of the value that will be stored in the register, so /// whatever SDNode we lower the argument to needs to be this type. 
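runOnSCC relies on CallGraphSCCPass visiting strongly connected components bottom-up, so a callee's attributes are already final when its callers are processed. A rough standalone model of that ordering, assuming a hypothetical three-function chain in which only the leaf touches the queue pointer:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Hypothetical call chain: kernel -> helper -> leaf.
  std::map<std::string, std::vector<std::string>> Callees = {
      {"leaf", {}}, {"helper", {"leaf"}}, {"kernel", {"helper"}}};
  std::map<std::string, std::set<std::string>> Attrs = {
      {"leaf", {"amdgpu-queue-ptr"}}, {"helper", {}}, {"kernel", {}}};

  // Bottom-up order, as a CallGraphSCCPass would see these single-node SCCs:
  // each visit copies the already-final attributes of the callees.
  for (const char *F : {"leaf", "helper", "kernel"})
    for (const std::string &C : Callees[F])
      Attrs[F].insert(Attrs[C].begin(), Attrs[C].end());

  std::cout << "kernel needs queue ptr: "
            << Attrs["kernel"].count("amdgpu-queue-ptr") << "\n"; // prints 1
  return 0;
}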
/// @@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +// FIXME: This should go in generic DAG combiner with an isTruncateFree check, +// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU +// issues. +SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + // (vt2 (assertzext (truncate vt0:x), vt1)) -> + // (vt2 (truncate (assertzext vt0:x, vt1))) + if (N0.getOpcode() == ISD::TRUNCATE) { + SDValue N1 = N->getOperand(1); + EVT ExtVT = cast<VTSDNode>(N1)->getVT(); + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.bitsGE(ExtVT)) { + SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); + return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); + } + } + + return SDValue(); +} /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::AssertZext: + case ISD::AssertSext: + return performAssertSZExtCombine(N, DCI); } return SDValue(); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a45234e2b39f2..d85aada6053a1 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -76,6 +76,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1bc5a52053ecb..7796176290108 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -277,7 +277,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // Make sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. 
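The combine above rewrites (assertzext (truncate x), vt1) into (truncate (assertzext x, vt1)), which is sound whenever the asserted width fits in the source type. A small standalone check of that equivalence on concrete bit patterns (the assertZext helpers below model the node's guarantee and are not LLVM APIs):

#include <cassert>
#include <cstdint>

// Model AssertZext: the producer promises the value already fits in
// 'Bits' zero-extended bits, so the node itself is a pass-through.
static uint32_t assertZext32(uint32_t V, unsigned Bits) {
  assert((Bits >= 32 || (V >> Bits) == 0) && "assertion would be violated");
  return V;
}

static uint16_t assertZext16(uint16_t V, unsigned Bits) {
  assert((Bits >= 16 || (V >> Bits) == 0) && "assertion would be violated");
  return V;
}

int main() {
  uint32_t X = 0x000000AB;          // a 32-bit value known to fit in 8 bits
  unsigned AssertedBits = 8;

  // Original form: truncate to 16 bits, then assert the 8-bit guarantee.
  uint16_t A = assertZext16(static_cast<uint16_t>(X), AssertedBits);

  // Combined form: assert on the wider value first, then truncate.
  uint16_t B = static_cast<uint16_t>(assertZext32(X, AssertedBits));

  // Both orders observe the same 16-bit value.
  return A == B ? 0 : 1;
}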
if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 22cede59086ab..d4b6a5fe8020b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ public: return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e3c90f250600a..b37c274102bc8 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f26e49295e690..966c6fec20c63 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) @@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3d71db909e20d..4c755be099995 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -70,6 +70,7 @@ public: MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 3af242d9ea660..0aad8f0843d62 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 08a64de385018..7334781916d81 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // No replacement necessary. 
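The getWavesPerEU change flips the comparison so a requested waves-per-EU range is rejected when its minimum falls below the minimum implied by the flat work-group size. A simplified standalone sketch of the corrected check, with made-up numbers and ignoring the other validations the real function performs:

#include <iostream>
#include <utility>

using WaveRange = std::pair<unsigned, unsigned>; // {min, max} waves per EU

// Simplified version of the corrected validation: a requested range is only
// honoured if its minimum is not below the minimum number of waves implied
// by the flat work-group size.
static WaveRange pickWavesPerEU(WaveRange Requested, WaveRange Default,
                                unsigned MinImpliedByFlatWorkGroupSize) {
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;               // incompatible request: fall back
  return Requested;
}

int main() {
  WaveRange Default{4, 10};
  std::cout << pickWavesPerEU({2, 8}, Default, 3).first << "\n"; // 4 (fallback)
  std::cout << pickWavesPerEU({6, 8}, Default, 3).first << "\n"; // 6 (honoured)
  return 0;
}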
if (ScratchWaveOffsetReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); } @@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // this point it appears we need the setup. This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::NoRegister) { + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + DebugLoc DL; - int64_t StackSize = MF.getFrameInfo().getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); if (StackSize == 0) { BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba570b9ebbbc..2356405f09199 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info, - bool NeedSP) { + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); - assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), - Info.getStackPtrOffsetReg())); - } } SDValue SITargetLowering::LowerFormalArguments( @@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); @@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - - // TODO: Could maybe omit SP if only tail calls? - bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); - - if (NeedSP) { - unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); - CCInfo.AllocateReg(StackPtrReg); - Info->setStackPtrOffsetReg(StackPtrReg); - } } return Chains.empty() ? Chain : @@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. 
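With this hunk, an i8/i16 argument that arrives promoted in a 32-bit register is recovered by inserting an AssertSext/AssertZext followed by a truncate, the pattern other targets already use. A standalone illustration of why that round-trips the original value (plain integer arithmetic rather than SelectionDAG nodes; wrap-around conversions assume the usual two's-complement hosts):

#include <cassert>
#include <cstdint>

// Caller side: promote an i16 argument into a 32-bit register, either
// sign- or zero-extended depending on the calling convention.
static uint32_t promoteSExt(int16_t V) {
  return static_cast<uint32_t>(static_cast<int32_t>(V));
}
static uint32_t promoteZExt(uint16_t V) { return static_cast<uint32_t>(V); }

// Callee side: the AssertSext/AssertZext node only records the guarantee;
// recovering the argument is then a plain truncate of the 32-bit register.
static int16_t recoverSExt(uint32_t Reg) { return static_cast<int16_t>(Reg); }
static uint16_t recoverZExt(uint32_t Reg) { return static_cast<uint16_t>(Reg); }

int main() {
  int16_t S = -1234;
  uint16_t U = 0xBEEF;
  assert(recoverSExt(promoteSExt(S)) == S);
  assert(recoverZExt(promoteZExt(U)) == U);
  return 0;
}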
case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); case ISD::ConstantFP: { auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); @@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine( if (!CFP) { SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType().getScalarType(); + auto ST = getSubtarget(); + + if (((VT == MVT::f32 && ST->hasFP32Denormals()) || + (VT == MVT::f64 && ST->hasFP64Denormals()) || + (VT == MVT::f16 && ST->hasFP16Denormals())) && + DAG.isKnownNeverNaN(N0)) + return N0; bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(N0, getSubtarget())) + isCanonicalized(DAG, N0, ST)) return N0; return SDValue(); @@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { } return TargetLowering::getConstraintType(Constraint); } + +// Figure out which registers should be reserved for stack access. Only after +// the function is legalized do we know all of the non-spill stack objects or if +// calls are present. +void SITargetLowering::finalizeLowering(MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + if (Info->isEntryFunction()) { + // Callable functions have fixed registers used for stack access. + reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + } + + // We have to assume the SP is needed in case there are calls in the function + // during lowering. Calls are only detected after the function is + // lowered. We're about to reserve registers, so don't bother using it if we + // aren't really going to use it. 
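isCanonicalized now takes the SelectionDAG so the min/max cases can return early on subtargets with supportsMinMaxDenormModes() when both operands are known non-NaN; otherwise it keeps recursing into the operands under a small depth limit. A rough standalone sketch of that depth-limited recursive check, using a hypothetical expression type:

#include <iostream>
#include <memory>
#include <vector>

// Hypothetical expression node: a leaf is "canonical" iff Canonical is set;
// an interior node is canonical iff all of its operands are.
struct Expr {
  bool Canonical = false;
  std::vector<std::shared_ptr<Expr>> Ops;
};

// Depth-limited recursion: once the fuel runs out, conservatively answer
// "no", mirroring the MaxDepth parameter of isCanonicalized().
static bool isCanonicalized(const Expr &E, unsigned MaxDepth = 5) {
  if (E.Ops.empty())
    return E.Canonical;
  if (MaxDepth == 0)
    return false;
  for (const auto &Op : E.Ops)
    if (!isCanonicalized(*Op, MaxDepth - 1))
      return false;
  return true;
}

int main() {
  auto Leaf = std::make_shared<Expr>();
  Leaf->Canonical = true;
  Expr Node;
  Node.Ops.push_back(Leaf);
  std::cout << isCanonicalized(Node) << "\n"; // 1
  return 0;
}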
+ bool NeedSP = !Info->isEntryFunction() || + MFI.hasVarSizedObjects() || + MFI.hasCalls(); + + if (NeedSP) { + unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); + Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); + } + + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + + TargetLoweringBase::finalizeLowering(MF); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 83392a7ab1b21..e6bb3d6cd4191 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -232,6 +232,8 @@ public: ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; + + void finalizeLowering(MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 160f8837d49c8..a7e0feb10b9f1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector<MachineInstr *, 128> Worklist; - Worklist.push_back(&TopInst); + SetVectorType Worklist; + Worklist.insert(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); @@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } -void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp( } void SIInstrInfo::splitScalar64BitBCNT( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { + SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = 
*Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const { + SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); + Worklist.insert(&UseMI); do { ++I; @@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { + MachineInstr &SCCDefInst, SetVectorType &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : @@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) - Worklist.push_back(&MI); + Worklist.insert(&MI); } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index d00c0d4a7f4ea..3dd5bc89e6c77 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -19,6 +19,7 @@ #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" namespace llvm { @@ -38,6 +39,8 @@ private: EXECZ = 3 }; + typedef SmallSetVector<MachineInstr *, 32> SetVectorType; + static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); @@ -56,30 +59,30 @@ private: void swapOperands(MachineInstr &Inst) const; - void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; - void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, + void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; diff --git 
a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index ffb01363e1313..088173680fa89 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1436,7 +1436,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit IsPacked = isPackedType<Src0VT>.ret; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret); field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bcc685015cf5e..ba69e42d9125f 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1060,7 +1060,7 @@ def : Pat < class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3203c38dae344..a7c8166ff6d27 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -23,10 +23,10 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), - FrameOffsetReg(AMDGPU::NoRegister), - StackPtrOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -42,6 +42,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - return; + + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = true; + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (ST.debuggerEmitPrologue()) { // Enable everything. 
+ WorkGroupIDX = true; WorkGroupIDY = true; WorkGroupIDZ = true; + WorkItemIDX = true; WorkItemIDY = true; WorkItemIDZ = true; } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; if (F->hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + if (F->hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; @@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); + bool HasStackObjects = FrameInfo.hasStackObjects(); + + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } } - if (ST.isAmdCodeObjectV2(MF)) { + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. + if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 05aa249584bf1..4c7f38a09a484 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. 
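SIMachineFunctionInfo now starts the scratch, frame, and stack registers out as dedicated placeholder registers, and finalizeLowering() rewrites every use of a placeholder to the concrete register once frame objects and calls are known. A minimal standalone model of that two-phase emit-then-patch approach (the Instr type and register numbering are made up for illustration):

#include <cassert>
#include <vector>

// Hypothetical register ids: negative values are placeholders that must not
// survive past "finalize", mirroring SP_REG/FP_REG in the patch.
constexpr int SP_PLACEHOLDER = -1;
constexpr int FP_PLACEHOLDER = -2;

struct Instr { int Def; int Use; };

// Phase 1: instruction selection emits code against placeholders.
static std::vector<Instr> select() {
  return {{SP_PLACEHOLDER, FP_PLACEHOLDER}, {FP_PLACEHOLDER, SP_PLACEHOLDER}};
}

// Phase 2: once stack usage is known, replace every placeholder with the
// register actually reserved for it (like MRI.replaceRegWith()).
static void replaceRegWith(std::vector<Instr> &Code, int From, int To) {
  for (Instr &I : Code) {
    if (I.Def == From) I.Def = To;
    if (I.Use == From) I.Use = To;
  }
}

int main() {
  std::vector<Instr> Code = select();
  replaceRegWith(Code, SP_PLACEHOLDER, /*e.g. SGPR32=*/32);
  replaceRegWith(Code, FP_PLACEHOLDER, /*e.g. SGPR5=*/5);
  for (const Instr &I : Code)
    assert(I.Def >= 0 && I.Use >= 0); // no placeholders remain
  return 0;
}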
unsigned PSInputAddr; unsigned PSInputEnable; @@ -377,10 +382,13 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); StackPtrOffsetReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. unsigned getStackPtrOffsetReg() const { return StackPtrOffsetReg; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ef6ad4ad0c8f3..4a3fbb4593bb3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // We have to assume the SP is needed in case there are calls in the function, + // which is detected after the function is lowered. If we aren't really going + // to need SP, don't bother reserving it. unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index fc808011cd889..54ea7805e18d6 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>, def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; +// Pseudo-registers: Used as placeholders during isel and immediately +// replaced, never seeing the verifier. +def PRIVATE_RSRC_REG : SIReg<"", 0>; +def FP_REG : SIReg<"", 0>; +def SP_REG : SIReg<"", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; + // VCC for 64-bit instructions def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias<VCC_LO> { @@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, + FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let AllocationPriority = 7; } @@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, + (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_128 : RegisterOperand<VReg_128>; +def VSrc_128 : RegisterOperand<VReg_128> { + let DecoderMethod = "DecodeVS_128RegisterClass"; +} //===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26515b27bb77d..67ad904ca9723 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { } bool isRegIntersect(unsigned 
Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - - if (Reg0 == Reg1) { - return true; + for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { + if (*R == Reg1) return true; } - - unsigned SubReg0 = TRI->getSubReg(Reg0, 1); - if (SubReg0 == 0) { - return TRI->getSubRegIndex(Reg1, Reg0) > 0; - } - - for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { - if (isRegIntersect(Reg1, SubReg0, TRI)) { - return true; - } - SubReg0 = TRI->getSubReg(Reg0, Idx); - } - return false; } diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 7b9bc71ad4c77..d5acb49b4f39c 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. -class SI2_VI3Alias <string name, Instruction inst> : InstAlias < +class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index a8ca593f14ed0..92ed0706dc011 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,21 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -92,6 +96,7 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> { class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
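The isRegIntersect rewrite earlier in this hunk reduces the overlap test to walking every alias of Reg0, including Reg0 itself, and comparing against Reg1, instead of chasing sub-register indices by hand. A standalone sketch of the same overlap-by-shared-alias idea over a toy alias table (the register names and table are illustrative, not a real register file):

#include <iostream>
#include <map>
#include <set>
#include <string>

// Toy alias table: each register maps to every register it overlaps,
// including itself (what MCRegAliasIterator enumerates with IncludeSelf).
static const std::map<std::string, std::set<std::string>> Aliases = {
    {"S0_S1", {"S0_S1", "S0", "S1"}},
    {"S0",    {"S0", "S0_S1"}},
    {"S1",    {"S1", "S0_S1"}},
    {"S2",    {"S2"}},
};

static bool isRegIntersect(const std::string &Reg0, const std::string &Reg1) {
  for (const std::string &A : Aliases.at(Reg0))
    if (A == Reg1)
      return true;
  return false;
}

int main() {
  std::cout << isRegIntersect("S0_S1", "S1") << "\n"; // 1: the pair covers S1
  std::cout << isRegIntersect("S0", "S2") << "\n";    // 0: disjoint registers
  return 0;
}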
let HasModifiers = 0; + let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index f2de1f9957260..3becf758aaa3e 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -34,6 +34,9 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_ let isCommutable = 1 in { def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; +def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; + def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; @@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16> def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; @@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16> def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; } +def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; + def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; @@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi<bits<10> op> { } } +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; @@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index f3482a22d5dcd..b636fc9be431b 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -148,6 +148,19 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : let SubtargetPredicate = AssemblerPredicate; } +class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies { + list<dag> ret = !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); +} + + multiclass VOPC_Pseudos <string opName, VOPC_Profile P, PatLeaf cond = COND_NULL, @@ -163,14 +176,7 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - def _e64 : VOP3_Pseudo<opName, P, - !if(P.HasModifiers, - [(set i1:$sdst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>, + def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -634,7 +640,7 @@ class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE, DSTOMOD.NONE) + DSTCLAMP.NONE) >; def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 77b7952b22a88..b47538ba0349a 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -136,6 +136,8 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + + VOPProfile Pfl = ps.Pfl; } // XXX - Is there any reason to distingusih this from regular VOP3 diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index c40b4450a5b5b..e49c1babac210 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -17,144 +17,172 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// -// ARM Helper classes. +// ARM Subtarget state. // -class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", + "true", "Thumb mode">; + +def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", + "true", "Use software floating " + "point features.">; -class Architecture<string fname, string aname, list<SubtargetFeature> features > - : SubtargetFeature<fname, "ARMArch", aname, - !strconcat(aname, " architecture"), features>; //===----------------------------------------------------------------------===// -// ARM Subtarget state. +// ARM Subtarget features. // -def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", - "Thumb mode">; +// Floating Point, HW Division and Neon Support +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; -def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; -//===----------------------------------------------------------------------===// -// ARM Subtarget features. 
-// +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; + +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision " + "floating point">; + +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3, FeatureFP16]>; + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision " + "floating point", + [FeatureFPARMv8]>; + +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports " + "single precision only">; + +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict FP to 16 double registers">; + +def FeatureHWDivThumb : SubtargetFeature<"hwdiv", + "HasHardwareDivideInThumb", "true", + "Enable divide instructions in Thumb">; + +def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", + "HasHardwareDivideInARM", "true", + "Enable divide instructions in ARM mode">; + +// Atomic Support +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb/dsb) instructions">; + +def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", + "Has v7 clrex instruction">; -def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", - "Enable VFP2 instructions">; -def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", - "Enable VFP3 instructions", - [FeatureVFP2]>; -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable NEON instructions", - [FeatureVFP3]>; -def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", - "Enable Thumb2 instructions">; -def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", - "Does not support ARM mode execution", - [ModeThumb]>; -def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", - "Enable half-precision floating point">; -def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", - "Enable VFP4 instructions", - [FeatureVFP3, FeatureFP16]>; -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", - "true", "Enable ARMv8 FP", - [FeatureVFP4]>; -def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", - "Enable full half-precision floating point", - [FeatureFPARMv8]>; -def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict FP to 16 double registers">; -def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", - "true", - "Enable divide instructions in Thumb">; -def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", - "HasHardwareDivideInARM", "true", - "Enable divide instructions in ARM mode">; -def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", - "Has data barrier (dmb / dsb) instructions">; -def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", - "Has v7 clrex instruction">; def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", - "Has v8 acquire/release (lda/ldaex etc) instructions">; -def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", - "FP compare + branch is slow">; -def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", - "Floating point unit supports single precision only">; -def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", - "Enable support for Performance Monitor extensions">; -def FeatureTrustZone : 
SubtargetFeature<"trustzone", "HasTrustZone", "true", - "Enable support for TrustZone security extensions">; -def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", - "Enable support for ARMv8-M Security Extensions">; -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable support for Cryptography extensions", - [FeatureNEON]>; -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable support for CRC instructions">; + "Has v8 acquire/release (lda/ldaex " + " etc) instructions">; + + +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance " + "Monitor extensions">; + + +// TrustZone Security Extensions +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone " + "security extensions">; + +def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", + "Enable support for ARMv8-M " + "Security Extensions">; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for " + "Cryptography extensions", + [FeatureNEON]>; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; + + // Not to be confused with FeatureHasRetAddrStack (return address stack) -def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", - "Enable Reliability, Availability and Serviceability extensions">; -def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", - "Enable fast computation of positive address offsets">; -def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", - "CPU fuses AES crypto operations">; - -// Cyclone has preferred instructions for zeroing VFP registers, which can -// execute in 0 cycles. -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -// Whether or not it may be profitable to unpredicate certain instructions -// during if conversion. +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable Reliability, Availability " + "and Serviceability extensions">; + +// Fast computation of non-negative address offsets +def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", + "Enable fast computation of " + "positive address offsets">; + +// Fast execution of AES crypto operations +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +// Cyclone can zero VFP registers in 0 cycles. +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +// Whether it is profitable to unpredicate certain instructions during if-conversion def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", - "IsProfitableToUnpredicate", - "true", + "IsProfitableToUnpredicate", "true", "Is profitable to unpredicate">; // Some targets (e.g. Swift) have microcoded VGETLNi32. -def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", - "HasSlowVGETLNi32", "true", - "Has slow VGETLNi32 - prefer VMOV">; +def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", + "HasSlowVGETLNi32", "true", + "Has slow VGETLNi32 - prefer VMOV">; // Some targets (e.g. Swift) have microcoded VDUP32. 
-def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", - "Has slow VDUP32 - prefer VMOV">; +def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", + "true", + "Has slow VDUP32 - prefer VMOV">; // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON // for scalar FP, as this allows more effective execution domain optimization. -def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", - "true", "Prefer VMOVSR">; +def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", + "true", "Prefer VMOVSR">; // Swift has ISHST barriers compatible with Atomic Release semantics but weaker // than ISH def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", - "true", "Prefer ISHST barriers">; + "true", "Prefer ISHST barriers">; // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. -def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", - "Has muxed AGU and NEON/FPU">; +def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", + "true", + "Has muxed AGU and NEON/FPU">; -// On some targets, a VLDM/VSTM starting with an odd register number needs more -// microops than single VLDRS. +// Whether VLDM/VSTM starting with odd register number need more microops +// than single VLDRS def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", - "true", "VLDM/VSTM starting with an odd register is slow">; + "true", "VLDM/VSTM starting " + "with an odd register is slow">; // Some targets have a renaming dependency when loading into D subregisters. def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. -def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", - "Expand VFP/NEON MLA/MLS instructions">; +def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", + "ExpandMLx", "true", + "Expand VFP/NEON MLA/MLS instructions">; // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", @@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from // VFP to NEON, as an execution domain optimization. -def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", - "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">; +def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", + "UseNEONForFPMovs", "true", + "Convert VMOVSR, VMOVRS, " + "VMOVS to NEON">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. This affects instruction selection and should // only be enabled if the handling of denormals is not important. -def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", - "true", - "Use NEON for single precision FP">; +def FeatureNEONForFP : SubtargetFeature<"neonfp", + "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; // On some processors, VLDn instructions that access unaligned data take one // extra cycle. 
Take that into account when computing operand latencies. @@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", // Some processors have a nonpipelined VFP coprocessor. def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", "NonpipelinedVFP", "true", - "VFP instructions are not pipelined">; + "VFP instructions are not pipelined">; // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. -def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", - "Disable VFP / NEON MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", - "HasVMLxForwarding", "true", - "Has multiplier accumulator forwarding">; + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", @@ -213,14 +244,16 @@ def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", "true", "Disable +1 predication cost for instructions updating CPSR">; -def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", - "AvoidMOVsShifterOperand", "true", - "Avoid movs instructions with shifter operand">; +def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", + "AvoidMOVsShifterOperand", "true", + "Avoid movs instructions with " + "shifter operand">; // Some processors perform return stack prediction. CodeGen should avoid issue // "normal" call instructions to callees which do not return. -def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", - "Has return address stack">; +def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", + "HasRetAddrStack", "true", + "Has return address stack">; // Some processors have no branch predictor, which changes the expected cost of // taking a branch which affects the choice of whether to use predicated @@ -230,63 +263,80 @@ def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", "Has no branch predictor">; /// DSP extension. -def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", - "Supports DSP instructions in ARM and/or Thumb2">; +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in " + "ARM and/or Thumb2">; // Multiprocessing extension. -def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", - "Supports Multiprocessing extension">; +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; // Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). def FeatureVirtualization : SubtargetFeature<"virtualization", - "HasVirtualization", "true", - "Supports Virtualization extension", - [FeatureHWDivThumb, FeatureHWDivARM]>; + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDivThumb, FeatureHWDivARM]>; -// M-series ISA -def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", - "Is microcontroller profile ('M' series)">; +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. 
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; -// R-series ISA -def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", - "Is realtime profile ('R' series)">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + +def FeatureExecuteOnly : SubtargetFeature<"execute-only", + "GenExecuteOnly", "true", + "Enable the generation of " + "execute only code.">; + +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable" + " as GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for " + "32-bit imms">; + +def FeatureNoNegativeImmediates + : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + + +//===----------------------------------------------------------------------===// +// ARM architecture class +// // A-series ISA def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", "Is application profile ('A' series)">; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. -def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - -def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", - "Disallow all unaligned memory " - "access">; +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; -def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", - "Generate calls via indirect call " - "instructions">; +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", + "Is microcontroller profile ('M' series)">; -def FeatureExecuteOnly - : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", - "Enable the generation of execute only code.">; -def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", - "Reserve R9, making it unavailable as " - "GPR">; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; -def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", - "Don't use movt/movw pairs for 32-bit " - "imms">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution", + [ModeThumb]>; -def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", - "NegativeImmediates", "false", - "Convert immediates and instructions " - "to their negated or complemented " - "equivalent when the immediate does " - "not fit in the encoding.">; //===----------------------------------------------------------------------===// // ARM ISAa. 
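Editorial note on the ARM.td hunks above: apart from re-wrapping long description strings, the patch groups the profile-class features (A/R/M), Thumb2 and NoARM into an "ARM architecture class" section and moves the NaCl-trap and code-generation options next to them. Below is a minimal TableGen sketch of the pattern these records follow — a SubtargetFeature that flips a boolean on the subtarget and may imply other features, plus a processor definition that enables it. FeatureExample and example-cpu are hypothetical names used only for illustration and are not part of the patch.

// Hypothetical feature: sets the subtarget boolean HasExample and, like
// FeatureVirtualization above, lists implied features that are enabled with it.
def FeatureExample : SubtargetFeature<"example", "HasExample", "true",
                                      "Enable the hypothetical Example extension",
                                      [FeatureThumb2]>;

// Hypothetical processor: an architecture record plus tuning features,
// following the ProcNoItin pattern used for the real CPUs later in the file.
def : ProcNoItin<"example-cpu", [ARMv7a, FeatureExample]>;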
@@ -294,43 +344,57 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; + def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", "Support ARM v5T instructions", [HasV4TOps]>; + def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", - "Support ARM v5TE, v5TEj, and v5TExp instructions", + "Support ARM v5TE, v5TEj, and " + "v5TExp instructions", [HasV5TOps]>; + def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", "Support ARM v6 instructions", [HasV5TEOps]>; + def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; + def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true", "Support ARM v8M Baseline instructions", [HasV6MOps]>; + def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", "Support ARM v6k instructions", [HasV6Ops]>; + def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>; + def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", [HasV6T2Ops, FeaturePerfMon, FeatureV7Clrex]>; + +def HasV8MMainlineOps : + SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", + "Support ARM v8M Mainline instructions", + [HasV7Ops]>; + def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", [HasV7Ops, FeatureAcquireRelease]>; + def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; -def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps]>; -def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", - "Support ARM v8M Mainline instructions", - [HasV7Ops]>; //===----------------------------------------------------------------------===// @@ -386,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", "Cortex-M3 ARM processors", []>; + //===----------------------------------------------------------------------===// -// ARM schedules. +// ARM Helper classes. // -include "ARMSchedule.td" +class Architecture<string fname, string aname, list<SubtargetFeature> features> + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; //===----------------------------------------------------------------------===// @@ -547,12 +617,21 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; //===----------------------------------------------------------------------===// +// ARM schedules. +//===----------------------------------------------------------------------===// +// +include "ARMSchedule.td" + +//===----------------------------------------------------------------------===// // ARM processors // // Dummy CPU, used to target architectures def : ProcessorModel<"generic", CortexA8Model, []>; +// FIXME: Several processors below are not using their own scheduler +// model, but one of similar/previous processor. These should be fixed. 
+ def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; def : ProcNoItin<"strongarm", [ARMv4]>; @@ -612,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, FeatureVFP2, FeatureHasSlowFPVMLx]>; -// FIXME: A5 has currently the same Schedule model as A8 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureHasRetAddrStack, FeatureTrustZone, @@ -656,7 +734,6 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, FeatureCheckVLDnAlign, FeatureMP]>; -// FIXME: A12 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureHasRetAddrStack, FeatureTrustZone, @@ -666,7 +743,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureVirtualization, FeatureMP]>; -// FIXME: A15 has currently the same Schedule model as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureDontWidenVMOVS, FeatureHasRetAddrStack, @@ -678,7 +754,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: A17 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureHasRetAddrStack, FeatureTrustZone, @@ -688,9 +763,7 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: krait has currently the same Schedule model as A9 -// FIXME: krait has currently the same features as A9 plus VFP4 and hardware -// division features. +// FIXME: krait has currently the same features as A9 plus VFP4 and HWDiv def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureHasRetAddrStack, FeatureMuxedUnits, @@ -720,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureSlowVGETLNi32, FeatureSlowVDUP32]>; -// FIXME: R4 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureAvoidPartialCPSR]>; -// FIXME: R4F has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, @@ -734,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureD16, FeatureAvoidPartialCPSR]>; -// FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, FeatureVFP3, @@ -744,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. 
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, FeatureVFP3, @@ -814,14 +883,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO, - FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, @@ -835,7 +904,6 @@ def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureCrypto, FeatureCRC]>; -// Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -881,9 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, //===----------------------------------------------------------------------===// include "ARMRegisterInfo.td" - include "ARMRegisterBanks.td" - include "ARMCallingConv.td" //===----------------------------------------------------------------------===// @@ -891,7 +957,6 @@ include "ARMCallingConv.td" //===----------------------------------------------------------------------===// include "ARMInstrInfo.td" - def ARMInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -912,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant { } def ARM : Target { - // Pull in Instruction Info: + // Pull in Instruction Info. let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; let AssemblyParserVariants = [ARMAsmParserVariant]; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e97a7ce5067f9..370c0a7f5c537 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -117,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && @@ -163,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // both or otherwise does not want to enable this optimization, the function // should return NULL if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return nullptr; return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 384f80356cc84..bf00ef61c2d1b 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -250,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return false; // Look to see if our OptionalDef is defining CPSR or CCR. 
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (MO.getReg() == ARM::CPSR) *CPSR = true; @@ -267,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { AFI->isThumb2Function()) return MI->isPredicable(); - for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.OpInfo[i].isPredicate()) + for (const MCOperandInfo &opInfo : MCID.operands()) + if (opInfo.isPredicate()) return true; return false; @@ -1972,7 +1971,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, break; } case CCValAssign::AExt: - // Intentional fall-through. Handle AExt and ZExt. + // Intentional fall-through. Handle AExt and ZExt. case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); @@ -2001,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, assert(VA.getLocVT() == MVT::f64 && "Custom lowering for v2f64 args not available"); + // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size() CCValAssign &NextVA = ArgLocs[++i]; assert(VA.isRegLoc() && NextVA.isRegLoc() && @@ -2172,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); AddOptionalDefs(MIB); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned R : RetRegs) + MIB.addReg(R, RegState::Implicit); return true; } @@ -2233,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { ArgRegs.reserve(I->getNumOperands()); ArgVTs.reserve(I->getNumOperands()); ArgFlags.reserve(I->getNumOperands()); - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { unsigned Arg = getRegForValue(Op); if (Arg == 0) return false; @@ -2278,8 +2277,8 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2423,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). 
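Editorial note: the ARMFastISel.cpp hunks in this file are mostly mechanical conversions of index-based loops into range-based for loops over the same containers (MI->operands(), MCID.operands(), RetRegs, RegArgs, I->operands(), F->args()). A self-contained C++ sketch of the idiom, using an ordinary std::vector<unsigned> as a stand-in for the register lists (the function names here are hypothetical):

#include <vector>

// Index-based form: the induction variable exists only to fetch the element.
unsigned sumIndexed(const std::vector<unsigned> &Regs) {
  unsigned Sum = 0;
  for (unsigned i = 0, e = Regs.size(); i != e; ++i)
    Sum += Regs[i];
  return Sum;
}

// Range-based form, as the patch uses for RetRegs and RegArgs: iterate the
// elements directly, removing the index bookkeeping without changing behaviour.
unsigned sumRanged(const std::vector<unsigned> &Regs) {
  unsigned Sum = 0;
  for (unsigned R : Regs)
    Sum += R;
  return Sum;
}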
@@ -2932,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, bool Found = false; bool isZExt; - for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); - i != e; ++i) { - if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && - (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && - MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) { + if (FLE.Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FLE.ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) { Found = true; - isZExt = FoldableLoadExtends[i].isZExt; + isZExt = FLE.isZExt; } } if (!Found) return false; @@ -3057,9 +3055,8 @@ bool ARMFastISel::fastLowerArguments() { }; const TargetRegisterClass *RC = &ARM::rGPRRegClass; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - unsigned ArgNo = I->getArgNo(); + for (const Argument &Arg : F->args()) { + unsigned ArgNo = Arg.getArgNo(); unsigned SrcReg = GPRArgRegs[ArgNo]; unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. @@ -3069,7 +3066,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(&*I, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 7f9fe55a5c38b..f75dd4de3f96c 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2682,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDNode *ResNode; if (Subtarget->isThumb()) { - SDValue Pred = getAL(CurDAG, dl); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + SDValue Ops[] = { + CPIdx, + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops); } else { @@ -2698,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, Ops); } + // Annotate the Node with memory operand information so that MachineInstr + // queries work properly. This e.g. gives the register allocation the + // required information for rematerialization. + MachineFunction& MF = CurDAG->getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, 4); + + cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1); + ReplaceNode(N, ResNode); return; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 29ef69ad0010f..faed6b867e2bc 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -722,6 +722,29 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { return false; break; } + case G_BRCOND: { + if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) { + DEBUG(dbgs() << "Unsupported condition register for G_BRCOND"); + return false; + } + + // Set the flags. 
+ auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri)) + .addReg(I.getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI)) + return false; + + // Branch conditionally. + auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc)) + .add(I.getOperand(1)) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index f23e62595d2e9..1c17c07e4cb00 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -66,14 +66,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - // FIXME: Support s8 and s16 as well - for (unsigned Op : {G_SREM, G_UREM}) + for (unsigned Op : {G_SREM, G_UREM}) { + for (auto Ty : {s8, s16}) + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Lower); else if (AEABI(ST)) setAction({Op, s32}, Custom); else setAction({Op, s32}, Libcall); + } for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); @@ -88,6 +90,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_SELECT, p0}, Legal); setAction({G_SELECT, 1, s1}, Legal); + setAction({G_BRCOND, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); for (auto Ty : {s1, s8, s16}) setAction({G_CONSTANT, Ty}, WidenScalar); diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 13acea3c28a9e..48b02d40b2466 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -153,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, break; } - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; if (AP.lowerOperand(MO, MCOp)) { if (MCOp.isImm() && EncodeImms) { diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index c0c09e8c15afd..8449302358948 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -259,6 +259,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; case G_SELECT: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); (void)Ty2; assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); @@ -282,6 +283,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case G_FCMP: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty1 = MRI.getType(MI.getOperand(2).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(3).getReg()); (void)Ty2; @@ -329,6 +331,13 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &ARM::ValueMappings[ARM::DPR3OpsIdx]}); break; } + case G_BR: + OperandsMapping = getOperandsMapping({nullptr}); + break; + case G_BRCOND: + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); + break; default: return getInvalidInstructionMapping(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f41da3e8e2232..22ce949367f34 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -47,6 +47,8 @@ 
public: ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index cc7a7c3849bca..81b0aa7f8b98f 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -515,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + bool isSelectOp = MI.getOpcode() == BPF::Select; - assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert"); + assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert"); // To "insert" a SELECT instruction, we actually have to insert the diamond // control-flow pattern. The incoming instruction knows the destination vreg @@ -548,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Insert Branch if Flag unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); int CC = MI.getOperand(3).getImm(); + int NewCC; switch (CC) { case ISD::SETGT: - BuildMI(BB, DL, TII.get(BPF::JSGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri; break; case ISD::SETUGT: - BuildMI(BB, DL, TII.get(BPF::JUGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri; break; case ISD::SETGE: - BuildMI(BB, DL, TII.get(BPF::JSGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri; break; case ISD::SETUGE: - BuildMI(BB, DL, TII.get(BPF::JUGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri; break; case ISD::SETEQ: - BuildMI(BB, DL, TII.get(BPF::JEQ_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri; break; case ISD::SETNE: - BuildMI(BB, DL, TII.get(BPF::JNE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri; break; default: report_fatal_error("unimplemented select CondCode " + Twine(CC)); } + if (isSelectOp) + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addReg(MI.getOperand(2).getReg()) + .addMBB(Copy1MBB); + else + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addImm(MI.getOperand(2).getImm()) + .addMBB(Copy1MBB); // Copy0MBB: // %FalseValue = ... diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 5ad7772682084..f68357809add2 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -460,6 +460,11 @@ let usesCustomInserter = 1 in { "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2", [(set i64:$dst, (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>; + def Select_Ri : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2), + "# Select PSEUDO $dst = $lhs $imm $rhs ? 
$src : $src2", + [(set i64:$dst, + (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>; } // load 64-bit global addr into register diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index b064778c4bbd3..d75d95a6baeae 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexbit" - #include "HexagonBitTracker.h" #include "HexagonTargetMachine.h" #include "llvm/ADT/BitVector.h" @@ -42,6 +40,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexbit" + using namespace llvm; static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden, diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 2dc74632e9be2..30ebf89c98083 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -45863,6 +45863,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000000; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dccleaninva : HInst< (outs), @@ -45872,6 +45873,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000010; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dcfetch : HInst< (outs), @@ -45900,6 +45902,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000001; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dczeroa : HInst< (outs), @@ -45909,6 +45912,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000110; let isSoloAin1 = 1; +let hasSideEffects = 1; let mayStore = 1; } def Y2_icinva : HInst< diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 03c4a83594b33..80361015e6499 100644 --- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -59,8 +59,6 @@ // J2_jump <BB#6>, %PC<imp-def,dead> // Successors according to CFG: BB#6 BB#3 -#define DEBUG_TYPE "hexagon-eif" - #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" @@ -90,6 +88,8 @@ #include <cassert> #include <iterator> +#define DEBUG_TYPE "hexagon-eif" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 734f3c6658d92..a2f6dd68c1a13 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -86,8 +86,6 @@ // however, is that finding the locations where the implicit uses need // to be added, and updating the live ranges will be more involved. 
-#define DEBUG_TYPE "expand-condsets" - #include "HexagonInstrInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -116,6 +114,8 @@ #include <set> #include <utility> +#define DEBUG_TYPE "expand-condsets" + using namespace llvm; static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit", diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index c790579ccebc0..e5e75198b2d18 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -8,8 +8,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-pei" - #include "HexagonFrameLowering.h" #include "HexagonBlockRanges.h" #include "HexagonInstrInfo.h" @@ -63,6 +61,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexagon-pei" + // Hexagon stack frame layout as defined by the ABI: // // Incoming arguments diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index bf31e16992840..0a955aedaf1a8 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexinsert" - #include "BitTracker.h" #include "HexagonBitTracker.h" #include "HexagonInstrInfo.h" @@ -44,6 +42,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexinsert" + using namespace llvm; static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index 3470480d607dc..2da211563e0a1 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gen-pred" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +33,8 @@ #include <set> #include <utility> +#define DEBUG_TYPE "gen-pred" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 67242764d4531..3997702bc962d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1286,16 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } -// Creates a SPLAT instruction for a constant value VAL. -static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT, - SDValue Val) { - EVT T = VT.getVectorElementType(); - if (T == MVT::i8 || T == MVT::i16) - return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val); - - return SDValue(); -} - static bool isSExtFree(SDValue N) { // A sign-extend of a truncate of a sign-extend is free. if (N.getOpcode() == ISD::TRUNCATE && @@ -1374,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// Handle only specific vector loads. 
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast<LoadSDNode>(Op); - SDValue Chain = LoadNode->getChain(); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - SDValue Result; - SDValue Base = LoadNode->getBasePtr(); - ISD::LoadExtType Ext = LoadNode->getExtensionType(); - unsigned Alignment = LoadNode->getAlignment(); - SDValue LoadChain; - - if(Ext == ISD::NON_EXTLOAD) - Ext = ISD::ZEXTLOAD; - - if (VT == MVT::v4i16) { - if (Alignment == 2) { - SDValue Loads[4]; - // Base load. - Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base+2 load. - SDValue Increment = DAG.getConstant(2, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base and base+2. - SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32); - SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount); - SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]); - // Base + 4. - Increment = DAG.getConstant(4, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base + 6. - Increment = DAG.getConstant(6, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base+4 and base+6. - Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount); - SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]); - // Combine to i64. This could be optimised out later if we can - // affect reg allocation of this code. - Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2); - LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - Loads[0].getValue(1), Loads[1].getValue(1), - Loads[2].getValue(1), Loads[3].getValue(1)); - } else { - // Perform default type expansion. - Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), - LoadNode->getAlignment(), - LoadNode->getMemOperand()->getFlags()); - LoadChain = Result.getValue(1); - } - } else - llvm_unreachable("Custom lowering unsupported load"); - - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - // Since we pretend to lower a load, we need the original chain - // info attached to the result. - SDValue Ops[] = { Result, LoadChain }; - - return DAG.getMergeValues(Ops, DL); -} - SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1971,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Handling of vector operations. // - // Custom lower v4i16 load only. Let v4i16 store to be - // promoted for now. 
promoteLdStType(MVT::v4i8, MVT::i32); promoteLdStType(MVT::v2i16, MVT::i32); promoteLdStType(MVT::v8i8, MVT::i64); + promoteLdStType(MVT::v4i16, MVT::i64); promoteLdStType(MVT::v2i32, MVT::i64); - setOperationAction(ISD::LOAD, MVT::v4i16, Custom); - setOperationAction(ISD::STORE, MVT::v4i16, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64); - AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64); - // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. static const unsigned VectExpOps[] = { @@ -2301,7 +2212,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; - case HexagonISD::VPACK: return "HexagonISD::VPACK"; + case HexagonISD::VPACKE: return "HexagonISD::VPACKE"; + case HexagonISD::VPACKO: return "HexagonISD::VPACKO"; case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; @@ -2394,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization @@ -2409,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) } } if (IsScalarToVector) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); } - return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(Lane, dl, MVT::i32)); } if (UseHVX) { ArrayRef<int> Mask = SVN->getMask(); size_t MaskLen = Mask.size(); - int ElemSizeInBits = VT.getScalarSizeInBits(); - if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) || - (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) { - // Return 1 for odd and 2 of even - StridedLoadKind Pattern = isStridedLoad(Mask); + unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen; + if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) || + (Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) { + StridedLoadKind Pattern = isStridedLoad(Mask); if (Pattern == StridedLoadKind::NoPattern) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32); - SDValue Ops[] = { Vec1, Vec0, StridePattern }; - return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops); + unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE + : HexagonISD::VPACKO; + return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)}); } // We used to assert in the "else" part here, but that is bad for Halide // Halide creates intermediate double registers by interleaving two @@ -2531,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Size > 64) return SDValue(); - APInt APSplatBits, APSplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; unsigned NElts = BVN->getNumOperands(); // Try to generate a SPLAT instruction. 
- if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) && - (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, - HasAnyUndefs, 0, true) && SplatBitSize <= 16)) { - unsigned SplatBits = APSplatBits.getZExtValue(); - int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >> - (32 - SplatBitSize)); - return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32)); + if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, false)) { + if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) { + unsigned ZV = APSplatBits.getZExtValue(); + assert(SplatBitSize <= 32 && "Can only handle up to i32"); + // Sign-extend the splat value from SplatBitSize to 32. + int32_t SV = SplatBitSize < 32 + ? int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize) + : int32_t(ZV); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(SV, dl, MVT::i32)); + } + } } // Try to generate COMBINE to build v2i32 vectors. @@ -2974,8 +2891,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - // Custom lower some vector loads. - case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index bfd2c94eeabaa..d66cbc95e9188 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -62,7 +62,8 @@ namespace HexagonISD { EXTRACTU, EXTRACTURP, VCOMBINE, - VPACK, + VPACKE, + VPACKO, TC_RETURN, EH_RETURN, DCFETCH, @@ -164,7 +165,6 @@ namespace HexagonISD { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index c611857ec26af..104a28654dd50 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -1366,6 +1366,18 @@ defm : MaskedStore <V6_vS32b_nqpred_ai, int_hexagon_V6_vmaskedstorenq>; defm : MaskedStore <V6_vS32b_nt_qpred_ai, int_hexagon_V6_vmaskedstorentq>; defm : MaskedStore <V6_vS32b_nt_nqpred_ai, int_hexagon_V6_vmaskedstorentnq>; +//******************************************************************* +// SYSTEM +//******************************************************************* + +def: T_R_pat<Y2_dccleana, int_hexagon_Y2_dccleana>; +def: T_R_pat<Y2_dccleaninva, int_hexagon_Y2_dccleaninva>; +def: T_R_pat<Y2_dcinva, int_hexagon_Y2_dcinva>; +def: T_R_pat<Y2_dczeroa, int_hexagon_Y2_dczeroa>; + +def: T_RR_pat<Y4_l2fetch, int_hexagon_Y4_l2fetch>; +def: T_RP_pat<Y5_l2fetch, int_hexagon_Y5_l2fetch>; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp 
index a331c978f59d2..374ffa3799b03 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -10,8 +10,6 @@ // load/store instructions. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "opt-addr-mode" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" @@ -36,6 +34,8 @@ #include <cassert> #include <cstdint> +#define DEBUG_TYPE "opt-addr-mode" + static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ba98b8994937f..804a547d5b339 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2250,6 +2250,12 @@ def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, PS_storerhabs>; def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, PS_storeriabs>; def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, PS_storerdabs>; +// Prefer this pattern to S2_asl_i_p_or for the special case of joining +// two 32-bit words into a 64-bit word. +let AddedComplexity = 200 in +def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)), + (A2_combinew I32:$a, I32:$b)>; + def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)), (i64 (zext (i32 (and I32:$a, (i32 65535)))))), (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))), @@ -2971,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, Requires<[UseHVXDbl]>; -def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>, - SDTCisInt<3>]>; - -def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>; - -// 0 as the last argument denotes vpacke. 
1 denotes vpacko -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 0))), - (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 1))), - (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 0))), - (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 1))), - (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; - -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 1))), - (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 1))), - (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; +def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>; + +def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>; +def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>; + +let Predicates = [UseHVXSgl] in { + def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>; +} + +let Predicates = [UseHVXDbl] in { + def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; +} def V2I1: PatLeaf<(v2i1 PredRegs:$R)>; def V4I1: PatLeaf<(v4i1 PredRegs:$R)>; @@ -3073,6 +3074,10 @@ def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // four halfwords of 64-bits destination register. 
def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)), + (A2_combineii imm:$s8, imm:$s8)>; +def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>; + class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type> : Pat <(Op Type:$Rss, Type:$Rtt), @@ -3099,14 +3104,11 @@ def: VArith_pat <A2_xorp, xor, V8I8>; def: VArith_pat <A2_xorp, xor, V4I16>; def: VArith_pat <A2_xorp, xor, V2I32>; -def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_lsr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asl_i_vw V2I32:$b, imm:$c)>; def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 34df2ebcc5206..ea86c9c42f478 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -53,6 +53,10 @@ static cl::opt<bool> EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), cl::desc("Emit hexagon jump tables in function section")); +static cl::opt<bool> + EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon lookup tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -136,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal( << (Kind.isBSS() ? "kind_bss " : "" ) << (Kind.isBSSLocal() ? "kind_bss_local " : "" )); + // If the lookup table is used by more than one function, do not place + // it in text section. + if (EmitLutInText && GO->getName().startswith("switch.table")) { + if (const Function *Fn = getLutUsedFunction(GO)) + return selectSectionForLookupTable(GO, TM, Fn); + } + if (isGlobalInSmallSection(GO, TM)) return selectSmallSectionForGlobal(GO, Kind, TM); @@ -402,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +// Return the function that uses the lookup table. If there are more +// than one live function that uses this look table, bail out and place +// the lookup table in default section. +const Function * +HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const { + const Function *ReturnFn = nullptr; + for (auto U : GO->users()) { + // validate each instance of user to be a live function. 
+ auto *I = dyn_cast<Instruction>(U); + if (!I) + continue; + auto *Bb = I->getParent(); + if (!Bb) + continue; + auto *UserFn = Bb->getParent(); + if (!ReturnFn) + ReturnFn = UserFn; + else if (ReturnFn != UserFn) + return nullptr; + } + return ReturnFn; +} + +MCSection *HexagonTargetObjectFile::selectSectionForLookupTable( + const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const { + + SectionKind Kind = SectionKind::getText(); + // If the function has explicit section, place the lookup table in this + // explicit section. + if (Fn->hasSection()) + return getExplicitSectionGlobal(Fn, Kind, TM); + + const auto *FuncObj = dyn_cast<GlobalObject>(Fn); + return SelectSectionForGlobal(FuncObj, Kind, TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 373d850b53be7..eff44f097e03f 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -36,6 +36,8 @@ namespace llvm { bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, const Function &F) const override; + const Function *getLutUsedFunction(const GlobalObject *GO) const; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; @@ -46,6 +48,10 @@ namespace llvm { MCSection *selectSmallSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const; + + MCSection *selectSectionForLookupTable(const GlobalObject *GO, + const TargetMachine &TM, + const Function *Fn) const; }; } // namespace llvm diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b72c9d5344787..e12188e706025 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,9 +304,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); - bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -2514,16 +2511,6 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; - case Mips::MFTC0: case Mips::MTTC0: - case Mips::MFTGPR: case Mips::MTTGPR: - case Mips::MFTLO: case Mips::MTTLO: - case Mips::MFTHI: case Mips::MTTHI: - case Mips::MFTACX: case Mips::MTTACX: - case Mips::MFTDSP: case Mips::MTTDSP: - case Mips::MFTC1: case Mips::MTTC1: - case Mips::MFTHC1: case Mips::MTTHC1: - case Mips::CFTC1: case Mips::CTTC1: - return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4895,212 +4882,6 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -// Map the DSP accumulator and control register to the corresponding gpr -// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions -// do not map the DSP registers contigously to gpr registers. -static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { - switch (Inst.getOpcode()) { - case Mips::MFTLO: - case Mips::MTTLO: - switch (Inst.getOperand(IsMFDSP ? 
1 : 0).getReg()) { - case Mips::AC0: - return Mips::ZERO; - case Mips::AC1: - return Mips::A0; - case Mips::AC2: - return Mips::T0; - case Mips::AC3: - return Mips::T4; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTHI: - case Mips::MTTHI: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::AT; - case Mips::AC1: - return Mips::A1; - case Mips::AC2: - return Mips::T1; - case Mips::AC3: - return Mips::T5; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTACX: - case Mips::MTTACX: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::V0; - case Mips::AC1: - return Mips::A2; - case Mips::AC2: - return Mips::T2; - case Mips::AC3: - return Mips::T6; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTDSP: - case Mips::MTTDSP: - return Mips::S0; - default: - llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); - } -} - -// Map the floating point register operand to the corresponding register -// operand. -static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { - switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { - case Mips::F0: return Mips::ZERO; - case Mips::F1: return Mips::AT; - case Mips::F2: return Mips::V0; - case Mips::F3: return Mips::V1; - case Mips::F4: return Mips::A0; - case Mips::F5: return Mips::A1; - case Mips::F6: return Mips::A2; - case Mips::F7: return Mips::A3; - case Mips::F8: return Mips::T0; - case Mips::F9: return Mips::T1; - case Mips::F10: return Mips::T2; - case Mips::F11: return Mips::T3; - case Mips::F12: return Mips::T4; - case Mips::F13: return Mips::T5; - case Mips::F14: return Mips::T6; - case Mips::F15: return Mips::T7; - case Mips::F16: return Mips::S0; - case Mips::F17: return Mips::S1; - case Mips::F18: return Mips::S2; - case Mips::F19: return Mips::S3; - case Mips::F20: return Mips::S4; - case Mips::F21: return Mips::S5; - case Mips::F22: return Mips::S6; - case Mips::F23: return Mips::S7; - case Mips::F24: return Mips::T8; - case Mips::F25: return Mips::T9; - case Mips::F26: return Mips::K0; - case Mips::F27: return Mips::K1; - case Mips::F28: return Mips::GP; - case Mips::F29: return Mips::SP; - case Mips::F30: return Mips::FP; - case Mips::F31: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc1 alias!"); - } -} - -// Map the coprocessor operand the corresponding gpr register operand. -static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { - switch (Inst.getOperand(IsMFTC0 ? 
1 : 0).getReg()) { - case Mips::COP00: return Mips::ZERO; - case Mips::COP01: return Mips::AT; - case Mips::COP02: return Mips::V0; - case Mips::COP03: return Mips::V1; - case Mips::COP04: return Mips::A0; - case Mips::COP05: return Mips::A1; - case Mips::COP06: return Mips::A2; - case Mips::COP07: return Mips::A3; - case Mips::COP08: return Mips::T0; - case Mips::COP09: return Mips::T1; - case Mips::COP010: return Mips::T2; - case Mips::COP011: return Mips::T3; - case Mips::COP012: return Mips::T4; - case Mips::COP013: return Mips::T5; - case Mips::COP014: return Mips::T6; - case Mips::COP015: return Mips::T7; - case Mips::COP016: return Mips::S0; - case Mips::COP017: return Mips::S1; - case Mips::COP018: return Mips::S2; - case Mips::COP019: return Mips::S3; - case Mips::COP020: return Mips::S4; - case Mips::COP021: return Mips::S5; - case Mips::COP022: return Mips::S6; - case Mips::COP023: return Mips::S7; - case Mips::COP024: return Mips::T8; - case Mips::COP025: return Mips::T9; - case Mips::COP026: return Mips::K0; - case Mips::COP027: return Mips::K1; - case Mips::COP028: return Mips::GP; - case Mips::COP029: return Mips::SP; - case Mips::COP030: return Mips::FP; - case Mips::COP031: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc0 alias!"); - } -} - -/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing -/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. -bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI) { - MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned rd = 0; - unsigned u = 1; - unsigned sel = 0; - unsigned h = 0; - bool IsMFTR = false; - switch (Inst.getOpcode()) { - case Mips::MFTC0: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTC0: - u = 0; - rd = getRegisterForMxtrC0(Inst, IsMFTR); - sel = Inst.getOperand(2).getImm(); - break; - case Mips::MFTGPR: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTGPR: - rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); - break; - case Mips::MFTLO: - case Mips::MFTHI: - case Mips::MFTACX: - case Mips::MFTDSP: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTLO: - case Mips::MTTHI: - case Mips::MTTACX: - case Mips::MTTDSP: - rd = getRegisterForMxtrDSP(Inst, IsMFTR); - sel = 1; - break; - case Mips::MFTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MFTC1: - IsMFTR = true; - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::MTTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::CFTC1: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::CTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 3; - break; - } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd - : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() - : Inst.getOperand(0).getReg()); - - TOut.emitRRIII(IsMFTR ? 
Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, - STI); - return false; -} - unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 7caeb08589af6..2907b77158575 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -193,21 +193,6 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } -void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, - unsigned Reg1, int16_t Imm0, int16_t Imm1, - int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(MCOperand::createReg(Reg0)); - TmpInst.addOperand(MCOperand::createReg(Reg1)); - TmpInst.addOperand(MCOperand::createImm(Imm0)); - TmpInst.addOperand(MCOperand::createImm(Imm1)); - TmpInst.addOperand(MCOperand::createImm(Imm2)); - TmpInst.setLoc(IDLoc); - getStreamer().EmitInstruction(TmpInst, *STI); -} - void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index d2f0fdcc6cc11..6ceb055775387 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -190,6 +190,9 @@ def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", + "Disable use of the jal instruction">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a6ec9fb2e5987..20319f85696cd 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + } + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -923,14 +936,127 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } +static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, + const MipsSubtarget &Subtarget) { + // ROOTNode must have a multiplication as an operand for the match to be + // successful. 
+ if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL &&
+ ROOTNode->getOperand(1).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // We don't handle vector types here.
+ if (ROOTNode->getValueType(0).isVector())
+ return SDValue();
+
+ // For MIPS64, madd / msub instructions are inefficient to use with 64 bit
+ // arithmetic. E.g.
+ // (add (mul a b) c) =>
+ // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in
+ // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32)
+ // or
+ // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
+ //
+ // The overhead of setting up the Hi/Lo registers and reassembling the
+ // result makes this a dubious optimization for MIPS64. The core of the
+ // problem is that Hi/Lo contain the upper and lower 32 bits of the
+ // operand and result.
+ //
+ // It requires a chain of 4 add/mul for MIPS64R2 to get better code
+ // density than doing it naively, 5 for MIPS64. Additionally, using
+ // madd/msub on MIPS64 requires the operands actually be 32 bit sign
+ // extended operands, not true 64 bit values.
+ //
+ // FIXME: For the moment, disable this completely for MIPS64.
+ if (Subtarget.hasMips64())
+ return SDValue();
+
+ SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(0)
+ : ROOTNode->getOperand(1);
+
+ SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(1)
+ : ROOTNode->getOperand(0);
+
+ // Transform this to a MADD only if the user of this node is the add.
+ // If there are other users of the mul, this function returns here.
+ if (!Mult.hasOneUse())
+ return SDValue();
+
+ // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
+ // must be in canonical form, i.e. sign extended. For MIPS32, the operands
+ // of the multiply must have 32 or more sign bits, otherwise we cannot
+ // perform this optimization. We have to check this here as we're performing
+ // this optimization pre-legalization.
+ SDValue MultLHS = Mult->getOperand(0);
+ SDValue MultRHS = Mult->getOperand(1);
+
+ bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND &&
+ MultRHS->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND &&
+ MultRHS->getOpcode() == ISD::ZERO_EXTEND;
+
+ if (!IsSigned && !IsUnsigned)
+ return SDValue();
+
+ // Initialize accumulator.
+ SDLoc DL(ROOTNode);
+ SDValue TopHalf;
+ SDValue BottomHalf;
+ BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(0, DL));
+
+ TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(1, DL));
+ SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ BottomHalf,
+ TopHalf);
+
+ // Create MipsMAdd(u) / MipsMSub(u) node.
+ bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
+ unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
+ : (IsUnsigned ?
MipsISD::MSubu : MipsISD::MSub); + SDValue MAddOps[3] = { + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; + EVT VTs[2] = {MVT::i32, MVT::i32}; + SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); + + SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + SDValue Combined = + CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); + return Combined; +} + +static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); + + return SDValue(); + } + + return SDValue(); +} + static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + // (add v0 (mul v1, v2)) => (madd v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); - if (DCI.isBeforeLegalizeOps()) return SDValue(); + } + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1058,6 +1184,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); + case ISD::SUB: + return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -3021,6 +3149,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT Ty = Callee.getValueType(); bool GlobalOrExternal = false, IsCallReloc = false; + // The long-calls feature is ignored in case of PIC. + // While we do not support -mshared / -mno-shared properly, + // ignore long-calls in case of -mabicalls too. + if (Subtarget.useLongCalls() && !Subtarget.isABICalls() && !IsPIC) { + // Get the address of the callee into a register to prevent + // using of the `jal` instruction for the direct call. + if (auto *N = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + else if (auto *N = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? 
getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + } + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { if (IsPIC) { const GlobalValue *Val = G->getGlobal(); diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 94f3a74be98bc..0333fe6520fab 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in { } def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; +def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; +def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} + let AdditionalPredicates = [NotInMicroMips] in { def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2, FGR_32; diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index edc0981e6278a..64bee5bfba18e 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -35,8 +35,6 @@ class FIELD5<bits<5> Val> { def FIELD5_1_DMT_EMT : FIELD5<0b00001>; def FIELD5_2_DMT_EMT : FIELD5<0b01111>; def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; -def FIELD5_MFTR : FIELD5<0b01000>; -def FIELD5_MTTR : FIELD5<0b01100>; class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst { bits<32> Inst; @@ -52,25 +50,6 @@ class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst { let Inst{2-0} = 0b001; } -class COP0_MFTTR_MT<FIELD5 Op> : MipsMTInst { - bits<32> Inst; - - bits<5> rt; - bits<5> rd; - bits<1> u; - bits<1> h; - bits<3> sel; - let Inst{31-26} = 0b010000; // COP0 - let Inst{25-21} = Op.Value; // MFMC0 - let Inst{20-16} = rt; - let Inst{15-11} = rd; - let Inst{10-6} = 0b00000; // rx - currently unsupported. - let Inst{5} = u; - let Inst{4} = h; - let Inst{3} = 0b0; - let Inst{2-0} = sel; -} - class SPECIAL3_MT_FORK : MipsMTInst { bits<32> Inst; diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td index 72e626cbec40a..ab6693f60fd94 100644 --- a/lib/Target/Mips/MipsMTInstrInfo.td +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -6,13 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the MIPS MT ASE as defined by MD00378 1.12. -// -// TODO: Add support for the microMIPS encodings for the MT ASE and add the -// instruction mappings. 
-// -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // MIPS MT Instruction Encodings @@ -34,10 +27,6 @@ class FORK_ENC : SPECIAL3_MT_FORK; class YIELD_ENC : SPECIAL3_MT_YIELD; -class MFTR_ENC : COP0_MFTTR_MT<FIELD5_MFTR>; - -class MTTR_ENC : COP0_MFTTR_MT<FIELD5_MTTR>; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Descriptions //===----------------------------------------------------------------------===// @@ -50,22 +39,6 @@ class MT_1R_DESC_BASE<string instr_asm, InstrItinClass Itin = NoItinerary> { InstrItinClass Itinerary = Itin; } -class MFTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; - list<dag> Pattern = []; - InstrItinClass Itinerary = II_MFTR; -} - -class MTTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; - list<dag> Pattern = []; - InstrItinClass Itinerary = II_MTTR; -} - class FORK_DESC { dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); dag InOperandList = (ins GPR32Opnd:$rt); @@ -106,74 +79,9 @@ let hasSideEffects = 1, isNotDuplicable = 1, def FORK : FORK_ENC, FORK_DESC, ASE_MT; def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; - - def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; - - def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; } //===----------------------------------------------------------------------===// -// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. -//===----------------------------------------------------------------------===// -def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, - uimm3:$sel), - "mftc0 $rd, $rt, $sel">, ASE_MT; - -def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mftgpr $rd, $rt">, ASE_MT; - -def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftlo $rt, $ac">, ASE_MT; - -def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mfthi $rt, $ac">, ASE_MT; - -def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftacx $rt, $ac">, ASE_MT; - -def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), - "mftdsp $rt">, ASE_MT; - -def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mftc1 $rt, $ft">, ASE_MT; - -def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mfthc1 $rt, $ft">, ASE_MT; - -def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), - "cftc1 $rt, $ft">, ASE_MT; - - -def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mttc0 $rt, $rd, $sel">, ASE_MT; - -def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), - "mttgpr $rd, $rt">, ASE_MT; - -def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttlo $rt, $ac">, ASE_MT; - -def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mtthi $rt, $ac">, ASE_MT; - -def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttacx $rt, $ac">, ASE_MT; - -def MTTDSP : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rt), - "mttdsp $rt">, ASE_MT; - -def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mttc1 $rt, $ft">, ASE_MT; - 
-def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mtthc1 $rt, $ft">, ASE_MT; - -def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), - "cttc1 $rt, $ft">, ASE_MT; - -//===----------------------------------------------------------------------===// // MIPS MT Instruction Definitions //===----------------------------------------------------------------------===// @@ -187,22 +95,4 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; - - def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 49ae6dd4cd399..4be26dd25dc04 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, - SDValue CmpLHS, const SDLoc &DL, - SDNode *Node) const { - unsigned Opc = InFlag.getOpcode(); (void)Opc; - - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == ISD::SUBE)) && - "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); - - unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; - if (Subtarget->isGP64bit()) { - SLTuOp = Mips::SLTu64; - ADDuOp = Mips::DADDu; - } - - SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; +void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = InFlag.getOpcode(); SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); - - if (Subtarget->isGP64bit()) { - // On 64-bit targets, sltu produces an i64 but our backend currently says - // that SLTu64 produces an i32. We need to fix this in the long run but for - // now, just make the DAG type-correct by asserting the upper bits are zero. - Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, - CurDAG->getTargetConstant(0, DL, VT), - SDValue(Carry, 0), - CurDAG->getTargetConstant(Mips::sub_32, DL, - VT)); + // In the base case, we can rely on the carry bit from the addsc + // instruction. + if (Opc == ISD::ADDC) { + SDValue Ops[3] = {LHS, RHS, InFlag}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); + return; } - // Generate a second addition only if we know that RHS is not a - // constant-zero node. 
- SDNode *AddCarry = Carry;
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (!C || C->getZExtValue())
- AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+ assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!");
+
+ // The more complex case is when there is a chain of ISD::ADDE nodes like:
+ // (adde (adde (adde (addc a b) c) d) e).
+ //
+ // The addwc instruction does not write to the carry bit; instead it writes
+ // to bit 20 of the dsp control register. To match this series of nodes, each
+ // intermediate adde node must be expanded to write the carry bit before the
+ // addition.
+
+ // Start by reading the overflow field for addsc and moving the value to the
+ // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP
+ // corresponds to reading/writing the entire control register to/from a GPR.
+
+ SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32);
+
+ SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32);
+
+ SDNode *DSPCtrlField =
+ CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag);
+
+ SDNode *Carry = CurDAG->getMachineNode(
+ Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne);
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+ SDValue Ops[4] = {SDValue(DSPCtrlField, 0),
+ CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne,
+ SDValue(Carry, 0)};
+ SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
+
+ // My reading of the MIPS DSP 3.01 specification isn't as clear as I
+ // would like about whether bit 20 always gets overwritten by addwc.
+ // Hence take an extremely conservative view and presume it's sticky. We
+ // therefore need to clear it.
+
+ SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)};
+ SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps);
+
+ SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue,
+ SDValue(DSPCtrlFinal, 0), CstOne);
+
+ SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands);
}
/// Match frameindex
@@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
switch(Opcode) {
default: break;
- case ISD::SUBE: {
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
- selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
- return true;
- }
- case ISD::ADDE: {
- if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
- break;
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ?
Mips::DADDu : Mips::ADDu; - selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); + selectAddE(Node, DL); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index f89a350cab044..6f38289c5a457 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,8 +41,7 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, - const SDLoc &DL, SDNode *Node) const; + void selectAddE(SDNode *Node, const SDLoc &DL) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 06a97b9d123ec..72b2738bfac44 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setTargetDAGCombine(ISD::ADDE); - setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } -// selectMADD - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc multLo, Lo0), (adde multHi, Hi0), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { - // ADDENode's second operand must be a flag output of an ADDC node in order - // for the matching to be successful. - SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); - - if (ADDCNode->getOpcode() != ISD::ADDC) - return false; - - SDValue MultHi = ADDENode->getOperand(0); - SDValue MultLo = ADDCNode->getOperand(0); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MADD only if ADDENode and ADDCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than ADDENode or ADDCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MADD instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(ADDENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - ADDCNode->getOperand(1), - ADDENode->getOperand(1)); - - // create MipsMAdd(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MAddu : MipsISD::MAdd; - - SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of adde and addc here - if (!SDValue(ADDCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); - } - if (!SDValue(ADDENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); - } - - return true; -} - -// selectMSUB - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc Lo0, multLo), (sube Hi0, multHi), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { - // SUBENode's second operand must be a flag output of an SUBC node in order - // for the matching to be successful. - SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); - - if (SUBCNode->getOpcode() != ISD::SUBC) - return false; - - SDValue MultHi = SUBENode->getOperand(1); - SDValue MultLo = SUBCNode->getOperand(1); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MSUB only if SUBENode and SUBCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than SUBENode or SUBCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MSUB instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(SUBENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - SUBCNode->getOperand(0), - SUBENode->getOperand(0)); - - // create MipsSub(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MSubu : MipsISD::MSub; - - SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of sube and subc here - if (!SDValue(SUBCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); - } - if (!SDValue(SUBENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); - } - - return true; -} - -static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && - selectMSUB(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { - case ISD::ADDE: - return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; - case ISD::SUBE: - return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: @@ -3596,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(RC); + unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) + .addImm(0) + .addReg(Rs) + .addImm(Mips::sub_32); + Rs = Tmp; + } BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::SH : Mips::SH64)) .addReg(Rs) .addReg(Rt) @@ -3649,6 +3481,12 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, for (unsigned i = 1; i < MI.getNumOperands(); i++) MIB.add(MI.getOperand(i)); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); + Rt = Tmp; + } + BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt); MI.eraseFromParent(); @@ -3716,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, assert(Subtarget.hasMSA() && Subtarget.hasMips32r2()); bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64; + bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -3726,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1; + unsigned MFC1Opc = IsFGR64onMips64 + ? Mips::DMFC1 + : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1); unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. @@ -3735,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; - if (!Subtarget.hasMips64() && IsFGR64) { + if (IsFGR64onMips32) { unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); @@ -3829,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1; + unsigned MTC1Opc = IsFGR64onMips64 + ? Mips::DMTC1 + : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); unsigned COPYOpc = IsFGR64onMips64 ? 
Mips::COPY_S_D : Mips::COPY_S_W; unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 8ec55ab6284da..c2947bb44ef58 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -226,7 +226,6 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo -def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -256,7 +255,6 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo -def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -666,14 +664,12 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC2 , [InstrStage<2, [ALU]>]>, - InstrItinData<II_MFTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC2 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>, - InstrItinData<II_MTTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_CACHE , [InstrStage<1, [ALU]>]>, InstrItinData<II_PREF , [InstrStage<1, [ALU]>]>, InstrItinData<II_CACHEE , [InstrStage<1, [ALU]>]>, diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7619e7b08612b..cce3b8c4c8d18 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Disable use of the `jal` instruction. 
+ bool UseLongCalls = false; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -269,6 +272,8 @@ public: bool useSoftFloat() const { return IsSoftFloat; } + bool useLongCalls() const { return UseLongCalls; } + bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index af24838665e1f..7d9f99ce071e8 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -119,9 +119,6 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, - int16_t Imm1, int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a00b56af0490a..92c8c224b71b4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -271,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3aaf7ef2c2a02..901539b682baa 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -178,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -211,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } // Select an address into a single register. 
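For reference, the constraint that the new SelectAddrImmX16 hook and the tightened getMemRIX16Encoding assert encode is that a DQ-form displacement must fit in a signed 16-bit field and be a multiple of 16. A minimal stand-alone C++ sketch of that check, for illustration only (the helper name fitsDQFormDisp is not from the patch; the actual in-tree check is the isOffsetMultipleOf/SelectAddressRegImm code shown below):

static bool fitsDQFormDisp(int64_t Disp) {
  // Signed 16-bit displacement whose low four bits are zero, i.e. a
  // multiple of 16 -- the requirement for DQ-form loads/stores (lxv/stxv).
  return Disp >= -32768 && Disp <= 32767 && (Disp % 16) == 0;
}
// fitsDQFormDisp(48) -> true  (a DQ-form access such as lxv can be used)
// fitsDQFormDisp(20) -> false (falls back to the X-form lxvx/stxvx)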
@@ -305,6 +309,7 @@ private: bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; @@ -2999,6 +3004,25 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val: +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); + StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. + return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0e069ec1665f7..b3a3c73f6df03 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2130,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. @@ -2145,7 +2145,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2169,7 +2169,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2196,7 +2196,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? 
PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2206,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2321,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 821927d3b1576..49d7d8220af16 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -616,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 13b4f9ab962da..e74ba38c351f0 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1635,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons. - for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), - IE = MRI->use_instr_end(); I != IE; ++I) { + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { unsigned Pred = UseMI->getOperand(0).getImm(); @@ -1658,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL; ++I) { bool FoundUse = false; - for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), - JE = MRI->use_instr_end(); J != JE; ++J) + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end(); + J != JE; ++J) if (&*J == &*I) { FoundUse = true; break; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 6d9f55206b6af..dd7fc2659102a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -405,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() < 4; }]>; +// This is a somewhat weaker condition than actually checking for 16-byte +// alignment. 
It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form +// instructions). +def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -815,7 +834,8 @@ def pred : Operand<OtherVT> { def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; -def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43635a8919e2b..942e8b392b82b 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -2606,37 +2606,41 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; - - def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : 
Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; + def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2788,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2881,13 +2885,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr 
(PPCfctidz (fpextend f32:$A)))); @@ -2911,13 +2915,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3088,17 +3092,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 8af7f7e981171..9207165c46a6d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -754,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -852,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -883,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. 
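An illustrative aside on the offsetMinAlign change above: the old usesIXAddr boolean only encoded the DS-form multiple-of-4 rule, while the new hook returns the required alignment itself, so the DQ-form LXV/STXV multiple-of-16 rule falls out of the same modulus test. A minimal C++ sketch of that test follows; the helper name is invented for illustration and is not part of the patch.
#include <cstdint>
// Sketch only: mirrors the "isInt<16>(Offset) && (Offset % offsetMinAlign(MI)) == 0"
// test introduced above, with the required alignment passed in directly
// (1 for D-form, 4 for DS-form such as STD, 16 for DQ-form such as STXV).
static bool isLegalImmediateOffset(int64_t Offset, unsigned MinAlign) {
  const bool FitsSigned16 = Offset >= -32768 && Offset <= 32767;
  return FitsSigned16 && (Offset % MinAlign) == 0;
}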
assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1076,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 2dc3828334ac5..be705507b5347 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,8 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 11004c5a952fc..91cab00b2b651 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -20,6 +20,10 @@ include "llvm/Target/Target.td" // SPARC Subtarget features. // +def FeatureSoftMulDiv + : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true", + "Use software emulation for integer multiply and divide">; + def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -75,7 +79,7 @@ class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, NoItineraries, Features>; def : Proc<"generic", []>; -def : Proc<"v7", []>; +def : Proc<"v7", [FeatureSoftMulDiv]>; def : Proc<"v8", []>; def : Proc<"supersparc", []>; def : Proc<"sparclite", []>; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 9e7e3c6b705a9..6767a59a97571 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1689,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::MUL, MVT::i32, Expand); + if (Subtarget->useSoftMulDiv()) { + // .umul works for both signed and unsigned + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setLibcallName(RTLIB::MUL_I32, ".umul"); + + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SDIV_I32, ".div"); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setLibcallName(RTLIB::UDIV_I32, ".udiv"); + } + if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index ae45c8be67524..3194ad4aeb6b1 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">; // True when generating 64-bit code. This also implies HasV9. 
def Is64Bit : Predicate<"Subtarget->is64Bit()">; +def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">, + AssemblerPredicate<"FeatureSoftMulDiv">; + // HasV9 - This predicate is true when the target processor supports V9 // instructions. Note that the machine may be running in 32-bit mode. def HasV9 : Predicate<"Subtarget->isV9()">, diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 43ddef3cc96e9..daac56add87ce 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + UseSoftMulDiv = false; IsV9 = false; IsLeon = false; V8DeprecatedInsts = false; diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index fa42da425ff2d..d18139984b87f 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -32,6 +32,7 @@ class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { Triple TargetTriple; virtual void anchor(); + bool UseSoftMulDiv; bool IsV9; bool IsLeon; bool V8DeprecatedInsts; @@ -76,6 +77,7 @@ public: bool enableMachineScheduler() const override; + bool useSoftMulDiv() const { return UseSoftMulDiv; } bool isV9() const { return IsV9; } bool isLeon() const { return IsLeon; } bool isVIS() const { return IsVIS; } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ee23692ad1db2..33680789ee082 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -275,6 +275,10 @@ public: SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override; + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + // Used by the TableGen code to add particular types of operand // to an instruction. 
void addRegOperands(MCInst &Inst, unsigned N) const { @@ -1164,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } +std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS); + bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1209,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = SystemZMnemonicSpellCheck( + ((SystemZOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((SystemZOperand &)*Operands[0]).getLocRange()); + } } llvm_unreachable("Unexpected match type"); diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt index 6f8431db7b11c..9b8b141fd52ab 100644 --- a/lib/Target/SystemZ/LLVMBuild.txt +++ b/lib/Target/SystemZ/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = SystemZCodeGen parent = SystemZ -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target +required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index c5faa0d62881d..fda9c30fe3fcc 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -189,6 +189,58 @@ def Arch11NewFeatures : SystemZFeatureList<[ //===----------------------------------------------------------------------===// // +// New features added in the Twelfth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions2 : SystemZFeature< + "miscellaneous-extensions-2", "MiscellaneousExtensions2", + "Assume that the miscellaneous-extensions facility 2 is installed" +>; + +def FeatureGuardedStorage : SystemZFeature< + "guarded-storage", "GuardedStorage", + "Assume that the guarded-storage facility is installed" +>; + +def FeatureMessageSecurityAssist7 : SystemZFeature< + "message-security-assist-extension7", "MessageSecurityAssist7", + "Assume that the message-security-assist extension facility 7 is installed" +>; + +def FeatureMessageSecurityAssist8 : SystemZFeature< + "message-security-assist-extension8", "MessageSecurityAssist8", + "Assume that the message-security-assist extension facility 8 is installed" +>; + +def FeatureVectorEnhancements1 : SystemZFeature< + "vector-enhancements-1", "VectorEnhancements1", + "Assume that the vector enhancements facility 1 is installed" +>; +def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">; + +def FeatureVectorPackedDecimal : SystemZFeature< + "vector-packed-decimal", "VectorPackedDecimal", + "Assume that the vector packed decimal facility is installed" +>; + +def FeatureInsertReferenceBitsMultiple : SystemZFeature< + "insert-reference-bits-multiple", "InsertReferenceBitsMultiple", + "Assume that the insert-reference-bits-multiple facility is installed" +>; + +def Arch12NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions2, + FeatureGuardedStorage, + 
FeatureMessageSecurityAssist7, + FeatureMessageSecurityAssist8, + FeatureVectorEnhancements1, + FeatureVectorPackedDecimal, + FeatureInsertReferenceBitsMultiple +]>; + +//===----------------------------------------------------------------------===// +// // Cumulative supported and unsupported feature sets // //===----------------------------------------------------------------------===// @@ -201,9 +253,13 @@ def Arch10SupportedFeatures : SystemZFeatureAdd<Arch9SupportedFeatures.List, Arch10NewFeatures.List>; def Arch11SupportedFeatures : SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>; +def Arch12SupportedFeatures + : SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>; -def Arch11UnsupportedFeatures +def Arch12UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch11UnsupportedFeatures + : SystemZFeatureAdd<Arch12UnsupportedFeatures.List, Arch12NewFeatures.List>; def Arch10UnsupportedFeatures : SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>; def Arch9UnsupportedFeatures diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2801141cd951f..2d916d2e15214 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -101,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } - addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (Subtarget.hasVectorEnhancements1()) + addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); + else + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); if (Subtarget.hasVector()) { addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); @@ -316,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::CTPOP, VT, Legal); + else + setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); @@ -414,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::v2f64, Legal); } + // The vector enhancements facility 1 has instructions for these. 
+ if (Subtarget.hasVectorEnhancements1()) { + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FABS, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f128, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f128, Legal); + setOperationAction(ISD::FMINNUM, MVT::f128, Legal); + setOperationAction(ISD::FMINNAN, MVT::f128, Legal); + } + // We have fused multiply-addition for f32 and f64 but not f128. setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); - setOperationAction(ISD::FMA, MVT::f128, Expand); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FMA, MVT::f128, Legal); + else + setOperationAction(ISD::FMA, MVT::f128, Expand); + + // We don't have a copysign instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); // Needed so that we don't try to implement f128 constant loads using // a load-and-extend of a f80 constant (in cases where the constant // @@ -425,6 +481,12 @@ for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + // We don't have extending load instructions on vector registers. + if (Subtarget.hasVectorEnhancements1()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + } + // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); @@ -489,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f64: return true; case MVT::f128: - return false; + return Subtarget.hasVectorEnhancements1(); default: break; } @@ -1462,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { return true; case Intrinsic::s390_vfcedbs: + case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchdbs: + case Intrinsic::s390_vfchsbs: Opcode = SystemZISD::VFCMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchedbs: + case Intrinsic::s390_vfchesbs: Opcode = SystemZISD::VFCMPHES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vftcidb: + case Intrinsic::s390_vftcisb: Opcode = SystemZISD::VFTCI; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL, // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, // producing a result of type VT. -static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, - EVT VT, SDValue CmpOp0, SDValue CmpOp1) { - // There is no hardware support for v4f32, so extend the vector into - // two v2f64s and compare those. - if (CmpOp0.getValueType() == MVT::v4f32) { +SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, + SDValue CmpOp1) const { + // There is no hardware support for v4f32 (unless we have the vector + // enhancements facility 1), so extend the vector into two v2f64s + // and compare those. + if (CmpOp0.getValueType() == MVT::v4f32 && + !Subtarget.hasVectorEnhancements1()) { SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); @@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. -static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - ISD::CondCode CC, SDValue CmpOp0, - SDValue CmpOp1) { +SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, + const SDLoc &DL, EVT VT, + ISD::CondCode CC, + SDValue CmpOp0, + SDValue CmpOp1) const { bool IsFP = CmpOp0.getValueType().isFloatingPoint(); bool Invert = false; SDValue Cmp; @@ -2960,6 +3032,12 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, // We define this so that it can be used for constant division. lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + else if (Subtarget.hasMiscellaneousExtensions2()) + // SystemZISD::SMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. 
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // @@ -4658,6 +4736,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); + OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); OPCODE(SDIVREM); OPCODE(UDIVREM); @@ -6118,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: + case SystemZ::SelectVR128: return emitSelect(MI, MBB, 0); case SystemZ::CondStore8Mux: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 6c9c404816f09..abe8b7233e60c 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -88,6 +88,7 @@ enum NodeType : unsigned { // Wrappers around the ISD opcodes of the same name. The output is GR128. // Input operands may be GR64 or GR32, depending on the instruction. + SMUL_LOHI, UMUL_LOHI, SDIVREM, UDIVREM, @@ -479,6 +480,12 @@ private: const SystemZSubtarget &Subtarget; // Implement LowerOperation for individual opcodes. + SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, SDValue CmpOp1) const; + SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, ISD::CondCode CC, + SDValue CmpOp0, SDValue CmpOp1) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 10172bd452034..02aeaadad0d9a 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. -def SelectF32 : SelectWrapper<FP32>; -def SelectF64 : SelectWrapper<FP64>; -def SelectF128 : SelectWrapper<FP128>; +def SelectF32 : SelectWrapper<f32, FP32>; +def SelectF64 : SelectWrapper<f64, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in + def SelectF128 : SelectWrapper<f128, FP128>; +let Predicates = [FeatureVectorEnhancements1] in + def SelectVR128 : SelectWrapper<f128, VR128>; defm CondStoreF32 : CondStores<FP32, nonvolatile_store, nonvolatile_load, bdxaddr20only>; @@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in { let Predicates = [FeatureVector] in { defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>; defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>; - defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; } +let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in + defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; @@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in { } // The sign of an FP128 is in the high register. 
-def : Pat<(fcopysign FP32:$src1, FP128:$src2), - (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP64 result. let isCodeGenOnly = 1 in @@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP64:$src1, FP128:$src2), - (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP128 result. Use "upper" as the high half and leave // the low half as-is. @@ -101,12 +113,14 @@ class CopySign128<RegisterOperand cls, dag upper> : Pat<(fcopysign FP128:$src1, cls:$src2), (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; -def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64), - FP32:$src2)>; -def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), - FP64:$src2)>; -def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), - (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64), + FP32:$src2)>; + def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), + FP64:$src2)>; + def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), + (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +} defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>; defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>; @@ -166,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, Requires<[FeatureFPExtension]>; -def : Pat<(f32 (fpround FP128:$src)), - (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; -def : Pat<(f64 (fpround FP128:$src)), - (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; + def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +} // Extend register floating-point values to wider representations. 
-def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>; +def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; +def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; +def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; +} // Extend memory floating-point values to wider representations. def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>; +def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; +def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + (LXEB bdxaddr12only:$src)>; + def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + (LXDB bdxaddr12only:$src)>; +} // Convert a signed integer register value to a floating-point one. def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; @@ -426,16 +452,18 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)), // f128 multiplication of two FP64 registers. def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), - (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), - FP64:$src1, subreg_h64), FP64:$src2)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), + FP64:$src1, subreg_h64), FP64:$src2)>; // f128 multiplication of an FP64 register and an f64 memory. def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), - (f128 (extloadf64 bdxaddr12only:$addr))), - (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), - bdxaddr12only:$addr)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), + (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), + bdxaddr12only:$addr)>; // Fused multiply-add. 
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>; diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 7620e06ccbc96..033a0a879d37d 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1091,6 +1091,94 @@ class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRIf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<5> V3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-28} = V3{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M5; + let Inst{19-12} = I4; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9} = V3{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<8> I3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = I4; + let Inst{23-20} = M5; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> I2; + bits<4> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = 0; + let Inst{31-16} = I2; + let Inst{15-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<4> R2; + bits<8> I3; + bits<4> M4; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = R2; + let Inst{31-24} = 0; + let Inst{23-20} = M4; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + // Depending on the instruction mnemonic, certain bits may be or-ed into // the M4 value provided as explicit operand. These are passed as m4or. 
class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern, @@ -1259,6 +1347,67 @@ class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRRg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-28} = V2{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9} = V2{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<4> R1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = R1; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1321,6 +1470,25 @@ class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRSd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<4> R3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = R3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1358,6 +1526,24 @@ class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVSI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<8> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-32} = I3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + //===----------------------------------------------------------------------===// // Instruction classes for .insn directives //===----------------------------------------------------------------------===// @@ -1910,6 +2096,25 @@ class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode> let M1 = V.ccmask; } +class CondBranchRXY<string mnemonic, bits<16> opcode> + : InstRXYb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr20only:$XBD2), + !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> { + 
let CCMaskFirst = 1; +} + +class AsmCondBranchRXY<string mnemonic, bits<16> opcode> + : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2), + mnemonic#"\t$M1, $XBD2", []>; + +class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode, + SDPatternOperator operator = null_frag> + : InstRXYb<opcode, (outs), (ins bdxaddr20only:$XBD2), + !subst("#", V.suffix, mnemonic)#"\t$XBD2", + [(operator (load bdxaddr20only:$XBD2))]> { + let isAsmParserOnly = V.alternate; + let M1 = V.ccmask; +} + class CmpBranchRIEa<string mnemonic, bits<16> opcode, RegisterOperand cls, Immediate imm> : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3), @@ -2272,6 +2477,24 @@ class StoreLengthVRSb<string mnemonic, bits<16> opcode, let AccessBytes = bytes; } +class StoreLengthVRSd<string mnemonic, bits<16> opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVRSd<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2), + mnemonic#"\t$V1, $R3, $BD2", + [(operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2)]> { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreLengthVSI<string mnemonic, bits<16> opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVSI<opcode, (outs), (ins VR128:$V1, bdaddr12only:$BD2, imm32zx8:$I3), + mnemonic#"\t$V1, $BD2, $I3", + [(operator VR128:$V1, imm32zx8:$I3, bdaddr12only:$BD2)]> { + let mayStore = 1; + let AccessBytes = bytes; +} + class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2), @@ -2700,6 +2923,11 @@ class SideEffectBinaryRX<string mnemonic, bits<8> opcode, : InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2), mnemonic##"\t$R1, $XBD2", []>; +class SideEffectBinaryRXY<string mnemonic, bits<16> opcode, + RegisterOperand cls> + : InstRXYa<opcode, (outs), (ins cls:$R1, bdxaddr20only:$XBD2), + mnemonic##"\t$R1, $XBD2", []>; + class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode, RegisterOperand cls> : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2), @@ -3188,6 +3416,11 @@ class BinaryVRIeFloatGeneric<string mnemonic, bits<16> opcode> (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>; +class BinaryVRIh<string mnemonic, bits<16> opcode> + : InstVRIh<opcode, (outs VR128:$V1), + (ins imm32zx16:$I2, imm32zx4:$I3), + mnemonic#"\t$V1, $I2, $I3", []>; + class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0> : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5), @@ -3316,6 +3549,10 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator, mnemonic#"\t$V1, $R2, $R3", [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>; +class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3), + mnemonic#"\t$R1, $V2, $M3", []>; + class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> : InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2), @@ -3353,6 +3590,15 @@ class BinaryVRScGeneric<string mnemonic, bits<16> opcode> (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4), mnemonic#"\t$R1, $V3, $BD2, $M4", []>; +class BinaryVRSd<string mnemonic, bits<16> opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVRSd<opcode, (outs VR128:$V1), (ins GR32:$R3, bdaddr12only:$BD2), + 
mnemonic#"\t$V1, $R3, $BD2", + [(set VR128:$V1, (operator GR32:$R3, bdaddr12only:$BD2))]> { + let mayLoad = 1; + let AccessBytes = bytes; +} + class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes> : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3), @@ -3398,6 +3644,15 @@ class StoreBinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls> let mayStore = 1; } +class BinaryVSI<string mnemonic, bits<16> opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVSI<opcode, (outs VR128:$V1), (ins bdaddr12only:$BD2, imm32zx8:$I3), + mnemonic#"\t$V1, $BD2, $I3", + [(set VR128:$V1, (operator imm32zx8:$I3, bdaddr12only:$BD2))]> { + let mayLoad = 1; + let AccessBytes = bytes; +} + class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, Immediate index> : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3), @@ -3625,6 +3880,12 @@ class CompareVRRaFloatGeneric<string mnemonic, bits<16> opcode> let M5 = 0; } +class CompareVRRh<string mnemonic, bits<16> opcode> + : InstVRRh<opcode, (outs), (ins VR128:$V1, VR128:$V2, imm32zx4:$M3), + mnemonic#"\t$V1, $V2, $M3", []> { + let isCompare = 1; +} + class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator, RegisterOperand cls> : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2), @@ -3639,6 +3900,10 @@ class TestRSL<string mnemonic, bits<16> opcode> let mayLoad = 1; } +class TestVRRg<string mnemonic, bits<16> opcode> + : InstVRRg<opcode, (outs), (ins VR128:$V1), + mnemonic#"\t$V1", []>; + class SideEffectTernarySSc<string mnemonic, bits<8> opcode> : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1, shift12only:$BD2, imm32zx4:$I3), @@ -3842,6 +4107,11 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M5 = type; } +class TernaryVRIi<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstVRIi<opcode, (outs VR128:$V1), + (ins cls:$R2, imm32zx8:$I3, imm32zx4:$M4), + mnemonic#"\t$V1, $R2, $I3, $M4", []>; + class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa<opcode, (outs tr1.op:$V1), @@ -3914,6 +4184,25 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcFloat<string mnemonic, bits<16> opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0, bits<4> m5 = 0> + : InstVRRc<opcode, (outs tr1.op:$V1), + (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $M6", + [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2), + (tr2.vt tr2.op:$V3), + imm32zx4:$M6)))]> { + let M4 = type; + let M5 = m5; +} + +class TernaryVRRcFloatGeneric<string mnemonic, bits<16> opcode> + : InstVRRc<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5, + imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>; + class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0> : InstVRRd<opcode, (outs tr1.op:$V1), @@ -4019,20 +4308,38 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode> let DisableEncoding = "$V1src"; } +class QuaternaryVRIf<string mnemonic, bits<16> opcode> + : InstVRIf<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, + imm32zx8:$I4, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>; + +class QuaternaryVRIg<string mnemonic, bits<16> opcode> + : InstVRIg<opcode, 
(outs VR128:$V1), + (ins VR128:$V2, imm32zx8:$I3, + imm32zx8:$I4, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $I3, $I4, $M5", []>; + class QuaternaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, - bits<4> type, SDPatternOperator m6mask, bits<4> m6or> + TypedReg tr3, TypedReg tr4, bits<4> type, + SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> : InstVRRd<opcode, (outs tr1.op:$V1), - (ins tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, m6mask:$M6), + (ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6), mnemonic#"\t$V1, $V2, $V3, $V4, $M6", [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2), - (tr2.vt tr2.op:$V3), - (tr2.vt tr2.op:$V4), + (tr3.vt tr3.op:$V3), + (tr4.vt tr4.op:$V4), m6mask:$M6)))], m6or> { let M5 = type; } +class QuaternaryVRRdGeneric<string mnemonic, bits<16> opcode> + : InstVRRd<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>; + // Declare a pair of instructions, one which sets CC and one which doesn't. // The CC-setting form ends with "S" and sets the low bit of M6. // Also create aliases to make use of M6 operand optional in assembler. @@ -4041,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode, SDPatternOperator operator_cc, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { - def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, type, + def "" : QuaternaryVRRd<mnemonic, opcode, operator, + tr1, tr2, tr2, tr2, type, imm32zx4even, !and (modifier, 14)>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in - def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, + def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, + tr1, tr2, tr2, tr2, type, imm32zx4even, !add (!and (modifier, 14), 1)>; def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2, @@ -4055,10 +4364,7 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode, } multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> { - def "" : InstVRRd<opcode, (outs VR128:$V1), - (ins VR128:$V2, VR128:$V3, VR128:$V4, - imm32zx4:$M5, imm32zx4:$M6), - mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>; + def "" : QuaternaryVRRdGeneric<mnemonic, opcode>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5", (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, 0)>; @@ -4366,10 +4672,10 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2> // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is // the value of the PSW's 2-bit condition code field. 
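To make the masking rule in the SelectWrapper comment above concrete, here is a small C++ sketch (illustrative only, not taken from the patch) of how the 4-bit mask operand picks between the two sources once the PSW condition code is known:
// Sketch only: "selectOnCCMask" is an invented name. CCMask corresponds to the
// 4-bit $cc operand of the pseudo, and CC to the PSW condition code (0-3).
static int selectOnCCMask(unsigned CCMask, unsigned CC, int Src1, int Src2) {
  return (CCMask & (8u >> CC)) != 0 ? Src1 : Src2;
}
// Example: a mask of 8 picks Src1 only when CC is 0 (the "equal" result of an
// integer compare); a mask of 15 picks Src1 for every condition code.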
-class SelectWrapper<RegisterOperand cls> +class SelectWrapper<ValueType vt, RegisterOperand cls> : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), - [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, + [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 66a5ff12be464..4533f4fdf21ae 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -869,6 +869,37 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Move 128-bit floating-point values between VR128 and FP128. + if (SystemZ::VR128BitRegClass.contains(DestReg) && + SystemZ::FP128BitRegClass.contains(SrcReg)) { + unsigned SrcRegHi = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned SrcRegLo = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg) + .addReg(SrcRegHi, getKillRegState(KillSrc)) + .addReg(SrcRegLo, getKillRegState(KillSrc)); + return; + } + if (SystemZ::FP128BitRegClass.contains(DestReg) && + SystemZ::VR128BitRegClass.contains(SrcReg)) { + unsigned DestRegHi = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned DestRegLo = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + if (DestRegHi != SrcReg) + copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false); + BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1); + return; + } + // Everything else needs only one instruction. 
unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) @@ -1434,6 +1465,7 @@ SystemZII::Branch SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { switch (MI.getOpcode()) { case SystemZ::BR: + case SystemZ::BI: case SystemZ::J: case SystemZ::JG: return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 4569be7602e45..f64c0d15ef83b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BC : CondBranchRX<"b#", 0x47>; def BCR : CondBranchRR<"b#r", 0x07>; + def BIC : CondBranchRXY<"bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BCAsm : AsmCondBranchRX<"bc", 0x47>; def BCRAsm : AsmCondBranchRR<"bcr", 0x07>; + def BICAsm : AsmCondBranchRXY<"bic", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } // Define AsmParser extended mnemonics for each general condition-code mask @@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BAsm#V : FixedCondBranchRX <CV<V>, "b#", 0x47>; def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>; + def BIAsm#V : FixedCondBranchRXY<CV<V>, "bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } } @@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isIndirectBranch = 1 in { def B : FixedCondBranchRX<CondAlways, "b", 0x47>; def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>; + def BI : FixedCondBranchRXY<CondAlways, "bi", 0xe347, brind>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -316,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in { // Select instructions //===----------------------------------------------------------------------===// -def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>; -def Select32 : SelectWrapper<GR32>; -def Select64 : SelectWrapper<GR64>; +def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>; +def Select32 : SelectWrapper<i32, GR32>; +def Select64 : SelectWrapper<i64, GR64>; // We don't define 32-bit Mux stores if we don't have STOCFH, because the // low-only STOC should then always be used if possible. @@ -921,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Addition of memory. defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>; defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>; + def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>; def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>; @@ -1006,6 +1016,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Subtraction of memory. 
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; + def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>; def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>; } @@ -1207,6 +1219,15 @@ defm : RMWIByte<xor, bdaddr20pair, XIY>; // Multiplication //===----------------------------------------------------------------------===// +// Multiplication of a register, setting the condition code. We prefer these +// over MS(G)R if available, even though we cannot use the condition code, +// since they are three-operand instructions. +let Predicates = [FeatureMiscellaneousExtensions2], + Defs = [CC], isCommutable = 1 in { + def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>; + def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>; +} + // Multiplication of a register. let isCommutable = 1 in { def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>; @@ -1226,21 +1247,37 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>; // Multiplication of memory. defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>; defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>; +def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; +// Multiplication of memory, setting the condition code. +let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in { + def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>; + def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>; +} + // Multiplication of a register, producing two results. -def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, + Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), + (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. 
def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, + Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; @@ -1765,8 +1802,29 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { GR128, GR128, GR128>; def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; } + let Predicates = [FeatureMessageSecurityAssist5] in - def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in + def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist8] in + def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, + GR128, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureGuardedStorage] in { + def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; + def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; + + let mayLoad = 1 in + def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>; + let mayStore = 1 in + def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index a9803c2d83e9f..0112ebf1eb10c 100644 --- a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -126,6 +126,10 @@ let hasSideEffects = 1, Defs = [CC] in let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; +// Insert reference bits multiple. +let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in + def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>; + // Perform frame management function. let hasSideEffects = 1 in def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 0158fe6aec08d..c9a02d9c80821 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,7 +14,7 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; - def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>; + def VLR32 : UnaryAliasVRR<null_frag, v32sb, v32sb>; def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>; // Load GR from VR element. @@ -141,7 +141,7 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>; + def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>; def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>; // Load logical element and zero. 
@@ -154,6 +154,11 @@ let Predicates = [FeatureVector] in { (VLLEZF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)), (VLLEZG bdxaddr12only:$addr)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>; + def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)), + (VLLEZLF bdxaddr12only:$addr)>; + } // Load element. def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; @@ -170,6 +175,13 @@ let Predicates = [FeatureVector] in { def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Load rightmost with length. The number of loaded bytes is only known + // at run time. + def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>; + def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>; +} + // Use replicating loads if we're inserting a single element into an // undefined vector. This avoids a false dependency on the previous // register contents. @@ -219,7 +231,7 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>; + def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>; def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>; // Scatter element. @@ -227,6 +239,13 @@ let Predicates = [FeatureVector] in { def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Store rightmost with length. The number of stored bytes is only known + // at run time. + def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>; + def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>; +} + //===----------------------------------------------------------------------===// // Selects and permutes //===----------------------------------------------------------------------===// @@ -256,6 +275,10 @@ let Predicates = [FeatureVector] in { // Permute doubleword immediate. def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; + // Bit Permute. + let Predicates = [FeatureVectorEnhancements1] in + def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>; + // Replicate. def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>; def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; @@ -424,6 +447,10 @@ let Predicates = [FeatureVector] in { def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; + // Not exclusive or. + let Predicates = [FeatureVectorEnhancements1] in + def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>; + // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -567,6 +594,17 @@ let Predicates = [FeatureVector] in { def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; + // Multiply sum logical. + let Predicates = [FeatureVectorEnhancements1] in { + def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>; + def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg, + v128q, v128g, v128g, v128q, 3>; + } + + // Nand. + let Predicates = [FeatureVectorEnhancements1] in + def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>; + // Nor. 
def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>; def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>; @@ -574,9 +612,19 @@ let Predicates = [FeatureVector] in { // Or. def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>; + // Or with complement. + let Predicates = [FeatureVectorEnhancements1] in + def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>; + // Population count. def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>; def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>; + def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>; + def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>; + def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>; + } // Element rotate left logical (with vector shift amount). def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; @@ -724,6 +772,14 @@ multiclass BitwiseVectorOps<ValueType type> { (VNO VR128:$x, VR128:$y)>; def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; } + let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))), + (VNX VR128:$x, VR128:$y)>; + def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))), + (VNN VR128:$x, VR128:$y)>; + def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))), + (VOC VR128:$x, VR128:$y)>; + } } defm : BitwiseVectorOps<v16i8>; @@ -879,6 +935,11 @@ let Predicates = [FeatureVector] in { def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>; + def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>; + def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>; + } // Convert from fixed 64-bit. def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; @@ -910,6 +971,11 @@ let Predicates = [FeatureVector] in { def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>; + def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>; + def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>; + } // Load FP integer. def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; @@ -917,66 +983,213 @@ let Predicates = [FeatureVector] in { def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; defm : VectorRounding<VFIDB, v128db>; defm : VectorRounding<WFIDB, v64db>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; + def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; + def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + defm : VectorRounding<VFISB, v128sb>; + defm : VectorRounding<WFISB, v32sb>; + defm : VectorRounding<WFIXB, v128xb>; + } // Load lengthened. 
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; - def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>; + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; + def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; + def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + } + def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>; + def : Pat<(f128 (fpextend (f32 VR32:$src))), + (WFLLD (WLDEB VR32:$src))>; + } - // Load rounded, + // Load rounded. def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; - def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; - def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; - def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>; + def : FPConversion<WLEDB, fpround, v32sb, v64db, 0, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; + def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } + def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; + def : FPConversion<WFLRX, fpround, v64db, v128xb, 0, 0>; + def : Pat<(f32 (fpround (f128 VR128:$src))), + (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>; + } + + // Maximum. + multiclass VectorMax<Instruction insn, TypedReg tr> { + def : FPMinMax<insn, fmaxnum, tr, 4>; + def : FPMinMax<insn, fmaxnan, tr, 1>; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; + def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, + v128db, v128db, 3, 0>; + def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, + v64db, v64db, 3, 8>; + def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, + v128sb, v128sb, 2, 0>; + def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, + v32sb, v32sb, 2, 8>; + def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMax<VFMAXDB, v128db>; + defm : VectorMax<WFMAXDB, v64db>; + defm : VectorMax<VFMAXSB, v128sb>; + defm : VectorMax<WFMAXSB, v32sb>; + defm : VectorMax<WFMAXXB, v128xb>; + } + + // Minimum. 
+ multiclass VectorMin<Instruction insn, TypedReg tr> { + def : FPMinMax<insn, fminnum, tr, 4>; + def : FPMinMax<insn, fminnan, tr, 1>; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; + def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, + v128db, v128db, 3, 0>; + def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, + v64db, v64db, 3, 8>; + def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, + v128sb, v128sb, 2, 0>; + def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, + v32sb, v32sb, 2, 8>; + def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMin<VFMINDB, v128db>; + defm : VectorMin<WFMINDB, v64db>; + defm : VectorMin<VFMINSB, v128sb>; + defm : VectorMin<WFMINSB, v32sb>; + defm : VectorMin<WFMINXB, v128xb>; + } // Multiply. def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + } // Multiply and add. def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + } // Multiply and subtract. def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and add. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and subtract. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + } // Perform sign operation. 
def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>; def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>; def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>; + def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>; + def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>; + } // Load complement. def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>; + def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>; + def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>; + } // Load negative. def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>; + def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>; + def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>; + } // Load positive. def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>; + def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>; + def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>; + } // Square root. def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + } // Subtract. def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + } // Test data class immediate. 
let Defs = [CC] in { def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>; def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>; + def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>; + def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>; + } } } @@ -989,12 +1202,20 @@ let Predicates = [FeatureVector] in { let Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>; + def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>; + } } // Compare and signal scalar. let Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>; + def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>; + } } // Compare equal. @@ -1003,6 +1224,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high. def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; @@ -1010,6 +1253,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high or equal. 
def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; @@ -1017,6 +1282,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high or equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } } //===----------------------------------------------------------------------===// @@ -1028,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 
VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; //===----------------------------------------------------------------------===// // Replicating scalars @@ -1134,6 +1434,20 @@ let AddedComplexity = 4 in { } //===----------------------------------------------------------------------===// +// Support for 128-bit floating-point values in vector registers +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(f128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + + def : Pat<(f128 fpimm0), (VZERO)>; + def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>; +} + +//===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// @@ -1202,3 +1516,37 @@ let Predicates = [FeatureVector] in { defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf, z_vstrcz_cc, v128f, v128f, 2, 2>; } + +//===----------------------------------------------------------------------===// +// Packed-decimal instructions +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorPackedDecimal] in { + def VLIP : BinaryVRIh<"vlip", 0xE649>; + + def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>; + def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; + + let Defs = [CC] in { + def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; + def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; + def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>; + + def VAP : QuaternaryVRIf<"vap", 0xE671>; + def VSP : QuaternaryVRIf<"vsp", 0xE673>; + + def VMP : QuaternaryVRIf<"vmp", 0xE678>; + def VMSP : QuaternaryVRIf<"vmsp", 0xE679>; + + def VDP : QuaternaryVRIf<"vdp", 0xE67A>; + def VRP : QuaternaryVRIf<"vrp", 0xE67B>; + def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>; + + def VSRP : QuaternaryVRIg<"vsrp", 0xE659>; + def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>; + + def VTP : TestVRRg<"vtp", 0xE65F>; + def VCP : CompareVRRh<"vcp", 0xE677>; + } +} diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 9c6d5819f8a7e..759a8bb0ce14d 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -181,6 +181,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; +def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def 
z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; @@ -549,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), (fma node:$src2, node:$src3, (fneg node:$src1))>; +// Negative fused multiply-add and multiply-subtract. +def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fma node:$src1, node:$src2, node:$src3))>; +def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fms node:$src1, node:$src2, node:$src3))>; + // Floating-point negative absolute. def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>; @@ -624,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr), (scalar_to_vector (f64 (load node:$addr))), (z_vzero))>; +// Similarly for the high element of a zeroed vector. +def z_vllezli32 : z_vllez<i32, load, 0>; +def z_vllezlf32 : PatFrag<(ops node:$addr), + (bitconvert + (z_merge_high + (v2i64 + (bitconvert + (z_merge_high + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))), + (v4f32 (z_vzero))))), + (v2i64 (z_vzero))))>; + // Store one element of a vector. class z_vste<ValueType scalartype, SDPatternOperator store> : PatFrag<(ops node:$vec, node:$addr, node:$index), diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index 16a7ed784d709..152521fb66a8d 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -167,3 +167,10 @@ class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<3> suppress, bits<4> mode> : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))), (insn tr2.op:$vec, suppress, mode)>; + +// Use INSN to perform minimum/maximum operation OPERATOR on type TR. +// FUNCTION is the type of minimum/maximum function to perform. +class FPMinMax<Instruction insn, SDPatternOperator operator, TypedReg tr, + bits<4> function> + : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))), + (insn tr.op:$vec1, tr.op:$vec2, function)>; diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index 1cdc0949ff4ae..0dca4582dc0d6 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>; def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; +def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; +def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 36809ea81dc1c..52ba1a584017a 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128", // All vector registers. defm VR128 : SystemZRegClass<"VR128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (add (sequence "V%u", 0, 7), - (sequence "V%u", 16, 31), - (sequence "V%u", 8, 15))>; + [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add (sequence "V%u", 0, 7), + (sequence "V%u", 16, 31), + (sequence "V%u", 8, 15))>; // Attaches a ValueType to a register operand, to make the instruction // definitions easier.
@@ -272,7 +272,8 @@ class TypedReg<ValueType vtin, RegisterOperand opin> { RegisterOperand op = opin; } -def v32eb : TypedReg<f32, VR32>; +def v32f : TypedReg<i32, VR32>; +def v32sb : TypedReg<f32, VR32>; def v64g : TypedReg<i64, VR64>; def v64db : TypedReg<f64, VR64>; def v128b : TypedReg<v16i8, VR128>; @@ -280,8 +281,9 @@ def v128h : TypedReg<v8i16, VR128>; def v128f : TypedReg<v4i32, VR128>; def v128g : TypedReg<v2i64, VR128>; def v128q : TypedReg<v16i8, VR128>; -def v128eb : TypedReg<v4f32, VR128>; +def v128sb : TypedReg<v4f32, VR128>; def v128db : TypedReg<v2f64, VR128>; +def v128xb : TypedReg<f128, VR128>; def v128any : TypedReg<untyped, VR128>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 1ce0168f95e95..8dba89f70a422 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def FPU2 : SchedWrite; def DFU : SchedWrite; def DFU2 : SchedWrite; -// Vector sub units (z13) +// Vector sub units (z13 and later) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; @@ -75,6 +75,7 @@ def VecXsPm : SchedWrite; def VBU : SchedWrite; +include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" include "SystemZScheduleZ196.td" diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td new file mode 100644 index 0000000000000..f11177af91a53 --- /dev/null +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -0,0 +1,1611 @@ +//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z14 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Z14Model : SchedMachineModel { + + let UnsupportedFeatures = Arch12UnsupportedFeatures.List; + + let IssueWidth = 8; + let MicroOpBufferSize = 60; // Issue queues + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 20; +} + +let SchedModel = Z14Model in { + +// These definitions could be put in a subtarget common include file, +// but it seems the include system in Tablegen currently rejects +// multiple includes of same file. 
+def : WriteRes<GroupAlone, []> { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes<BeginGroup, []> { + let NumMicroOps = 0; + let BeginGroup = 1; +} +def : WriteRes<EndGroup, []> { + let NumMicroOps = 0; + let EndGroup = 1; +} +def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;} +def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;} +def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;} +def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;} +def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;} +def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;} +def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;} +def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;} +def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;} +def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;} +def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;} +def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;} +def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;} +def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;} + +// Execution units. +def Z14_FXaUnit : ProcResource<2>; +def Z14_FXbUnit : ProcResource<2>; +def Z14_LSUnit : ProcResource<2>; +def Z14_VecUnit : ProcResource<2>; +def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z14_VBUnit : ProcResource<2>; + +// Subtarget specific definitions of scheduling resources. +def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; } +def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; } +def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; } +def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; } +def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; } +def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; } +def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; } +def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; } +def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; } +def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; } +def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]> + { let Latency = 30; } +def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; } +def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; } +def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; } +def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
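As a reading aid, a representative entry of this form might look as follows (illustrative only; "SOMEOP" is a placeholder regex, not an opcode from this file, and the resources named are the ones defined above):

def : InstRW<[FXa, LSU, Lat8, GroupAlone], (instregex "SOMEOP(R)?$")>;
// One FXa micro-op plus one LSU micro-op; Lat8 is a zero-uop pseudo-write
// that only contributes extra latency, and GroupAlone requires the
// instruction to begin and end its own decoder group.

The real z14 entries that follow in the rest of this file all have the same shape.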
+ +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[VBU], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[FXb], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + +// Select pseudo +def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; + +// CondStore pseudos +def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"MVCL(E|U)?$")>; + +// Pseudo -> reg move +def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>; + +// Loads +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "L128$")>; + +def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[FXa], (instregex "LG(F|H)I$")>; +def : InstRW<[FXa], (instregex "LHI(Mux)?$")>; +def : InstRW<[FXa], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>; +def : InstRW<[FXa], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "L(B|H|G)R$")>; +def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>; +def : InstRW<[FXa], (instregex "LTGFR$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLH(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>; +def : InstRW<[LSU], (instregex "LLHRL$")>; +def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex 
"STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + +// Store multiple (estimated average of ceil(5/2) FXb ops) +def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, + GroupAlone], (instregex "STM(G|H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LRV(G)?R$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[FXa], (instregex "GOT$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LP(G)?R$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXa], (instregex "LN(R|GR)$")>; +def : InstRW<[FXa], (instregex "LC(R|GR)$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>; +def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXa], (instregex "IIHF(64)?$")>; +def : InstRW<[FXa], (instregex "IIHH(64)?$")>; +def : InstRW<[FXa], (instregex "IIHL(64)?$")>; +def : InstRW<[FXa], (instregex "IILF(64)?$")>; +def : InstRW<[FXa], (instregex "IILH(64)?$")>; +def : InstRW<[FXa], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>; +def : InstRW<[FXa], (instregex "AIH$")>; +def : InstRW<[FXa], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>; +def : InstRW<[FXa], (instregex "AGFI$")>; +def : InstRW<[FXa], (instregex "AGHI(K)?$")>; +def : InstRW<[FXa], (instregex "AGR(K)?$")>; +def : InstRW<[FXa], (instregex "AHI(K)?$")>; +def : InstRW<[FXa], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>; +def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXa], (instregex "ALGHSIK$")>; +def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; +def : InstRW<[FXa], (instregex "ALGR(K)?$")>; +def : 
InstRW<[FXa], (instregex "ALR(K)?$")>; +def : InstRW<[FXa], (instregex "AR(K)?$")>; +def : InstRW<[FXa], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXa], (instregex "ALSIH(N)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>; +def : InstRW<[FXa], (instregex "SGR(K)?$")>; +def : InstRW<[FXa], (instregex "SLFI$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[FXa], (instregex "SLGF(I|R)$")>; +def : InstRW<[FXa], (instregex "SLGR(K)?$")>; +def : InstRW<[FXa], (instregex "SLR(K)?$")>; +def : InstRW<[FXa], (instregex "SR(K)?$")>; +def : InstRW<[FXa], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>; +def : InstRW<[FXa], (instregex "NGR(K)?$")>; +def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>; +def : InstRW<[FXa], (instregex "NIHF(64)?$")>; +def : InstRW<[FXa], (instregex "NIHH(64)?$")>; +def : InstRW<[FXa], (instregex "NIHL(64)?$")>; +def : InstRW<[FXa], (instregex "NILF(64)?$")>; +def : InstRW<[FXa], (instregex "NILH(64)?$")>; +def : InstRW<[FXa], (instregex "NILL(64)?$")>; +def : InstRW<[FXa], (instregex "NR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>; +def : InstRW<[FXa], (instregex "OGR(K)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>; +def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[FXa], (instregex "OIHF(64)?$")>; +def : InstRW<[FXa], (instregex "OIHH(64)?$")>; +def : InstRW<[FXa], (instregex "OIHL(64)?$")>; +def : InstRW<[FXa], (instregex "OILF(64)?$")>; +def : InstRW<[FXa], (instregex "OILH(64)?$")>; +def : InstRW<[FXa], (instregex "OILL(64)?$")>; +def : InstRW<[FXa], (instregex "OR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>; +def : 
InstRW<[FXa], (instregex "XIFMux$")>; +def : InstRW<[FXa], (instregex "XGR(K)?$")>; +def : InstRW<[FXa], (instregex "XIHF(64)?$")>; +def : InstRW<[FXa], (instregex "XILF(64)?$")>; +def : InstRW<[FXa], (instregex "XR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>; +def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>; +def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGR$")>; +def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>; +def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXa, Lat4], (instregex "MGHI$")>; +def : InstRW<[FXa, Lat4], (instregex "MHI$")>; +def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>; +def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>; +def : InstRW<[FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>; +def : InstRW<[FXa, Lat8, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[FXa, LSU, Lat9, GroupAlone], (instregex "MSC$")>; +def : InstRW<[FXa, LSU, Lat11, GroupAlone], (instregex "MSGC$")>; +def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>; + +//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; +def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>; +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; +def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; +def : InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>; +def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>; +def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[FXb], (instregex "CG(F|H)I$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>; +def : InstRW<[FXb], (instregex 
"C(G)?R$")>; +def : InstRW<[FXb], (instregex "CIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>; +def : InstRW<[FXb], (instregex "CLGF(I|R)$")>; +def : InstRW<[FXb], (instregex "CLGR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXb], (instregex "CLIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>; +def : InstRW<[FXb], (instregex "CLR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXb], (instregex "C(L)?HHR$")>; +def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>; +def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>; +def : InstRW<[FXb, Lat2], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>; +def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>; +def : InstRW<[FXb], (instregex "TMHH(64)?$")>; +def : InstRW<[FXb], (instregex "TMHL(64)?$")>; +def : InstRW<[FXb], (instregex "TMLH(64)?$")>; +def : InstRW<[FXb], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "PFD(RL)?$")>; +def : InstRW<[FXb, Lat2], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[FXb], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone], + (instregex "CDS(Y)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone], + (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex 
"STPQ$")>; + +// Load pair disjoint +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>; +def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>; + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "LGG$")>; +def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone], + (instregex "CVBG$")>; +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers +//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + 
+// Insert Program Mask +def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[FXb], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. +def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone], + (instregex "TBEGIN(C|_nofloat)?$")>; + +// Transaction end +def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[FXa], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Find leftmost one +def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>; + +// Extend +def : InstRW<[FXa], (instregex "AEXT128$")>; +def : InstRW<[FXa], (instregex "ZEXT128$")>; + +// String instructions +def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>; + +// Execute +def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. 
+def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Select instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>; +def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>; + +// Load +def : InstRW<[VecXsPm], (instregex "LER$")>; +def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[FXb, Lat3], (instregex "LGDR$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Copy sign +def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>; +def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>; +def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>; + +// Load lengthened +def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>; +def : InstRW<[VecBF], (instregex "LDEBR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex 
"CF(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>; +def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>; +def : InstRW<[VecBF], (instregex "A(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>; +def : InstRW<[VecBF], (instregex "S(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>; + 
+//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[FXa, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>; +def : InstRW<[VecBF], (instregex "LEXR$")>; +def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSU], (instregex "LDE$")>; +def : InstRW<[FXb], (instregex "LDER$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>; + +// Convert from fixed +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>; +def : InstRW<[VecBF], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[VecBF], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIER$")>; +def : InstRW<[VecBF], (instregex "FIDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>; +def : InstRW<[VecBF], (instregex "C(E|D)R$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions 
+//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecDF], (instregex "LTDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[VecDF], (instregex "LDETR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[FXb, Lat30], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer +def : InstRW<[VecDF], (instregex "FIDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecDF], (instregex "ADTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>; 
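// --- Editorial note, not part of the original diff ---
// Read as a pair, the two DFP addition entries above illustrate the pattern
// used throughout this file: ADTR(A) issues to a single VecDF resource with
// its default latency, while the 128-bit AXTR(A) occupies two VecDF2
// resources, carries an explicit Lat10 latency marker, and is tagged
// GroupAlone, i.e. it is modeled as needing a decoder group to itself.
// The LatN and Begin/End/GroupAlone markers adjust latency and decode
// grouping only; they do not add extra execution-unit usage.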
+ +// Subtraction +def : InstRW<[VecDF], (instregex "SDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[VecDF], (instregex "QADTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecDF], (instregex "(K|C)DTR$")>; +def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[VecDF], (instregex "CEDTR$")>; +def : InstRW<[VecDF], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "VLR(32|64)?$")>; +def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VZERO$")>; +def : InstRW<[VecXsPm], (instregex "VONE$")>; +def : InstRW<[VecXsPm], (instregex "VGBM$")>; +def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>; +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone], + (instregex "VSTM$")>; +def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPERM$")>; +def : InstRW<[VecXsPm], (instregex "VPDI$")>; +def : InstRW<[VecXsPm], (instregex "VBPERM$")>; +def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[VecXsPm], (instregex "VO(C)?$")>; +def : InstRW<[VecMul], (instregex "VCKSM$")>; +def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VX$")>; +def : InstRW<[VecMul], (instregex "VGFM?$")>; +def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>; +def : InstRW<[VecMul], 
(instregex "VMH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VML(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>; + +def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>; +def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>; + +def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[VecBF], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GD$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "VFL(L|R)$")>; +def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>; +def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>; +def : InstRW<[VecBF2], (instregex "WFLLD$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; +def : InstRW<[VecBF2], (instregex "VFISB$")>; +def : InstRW<[VecBF], (instregex "WFISB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], 
(instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFMSB$")>; +def : InstRW<[VecBF], (instregex "WFMSB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[VecFPd], (instregex "VFD$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>; +def : InstRW<[VecFPd], (instregex "WFDXB$")>; +def : InstRW<[VecFPd], (instregex "VFSQ$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>; +def : InstRW<[VecFPd], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: 
Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LEFR$")>; +def : InstRW<[FXb, Lat4], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecStr], (instregex "VFAE(B)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>; +def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecDF, VecDF, Lat10, GroupAlone], (instregex "VLIP$")>; +def : InstRW<[VecDFX, LSU, Lat12, GroupAlone], (instregex "VPKZ$")>; +def : InstRW<[VecDFX, FXb, LSU, Lat12, GroupAlone], (instregex "VUPKZ$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>; +def : InstRW<[VecDFX], (instregex "V(A|S)P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>; +def : InstRW<[VecDFX], (instregex "VPSOP$")>; +def : InstRW<[VecDFX], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, 
Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "IRBM$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index e3e1999d8ad8d..4d986e8391cf5 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -311,7 +311,7 @@ def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; 
-def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>; def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; @@ -337,7 +337,7 @@ def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -403,13 +403,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -436,7 +436,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -474,7 +475,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; def : InstRW<[FXU], (instregex "C(L)?HHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -499,7 +500,7 @@ def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch @@ -532,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -548,36 +549,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex 
"TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -621,7 +630,7 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : 
InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -632,14 +641,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -780,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -791,7 +800,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -813,9 +822,9 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -900,16 +909,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex 
"M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -949,16 +962,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -967,7 +985,7 @@ def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>; def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -979,7 +997,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1010,15 +1028,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex 
"RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1027,15 +1045,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1046,19 +1064,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1103,16 +1122,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions 
//===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1124,7 +1144,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1161,9 +1181,9 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; -def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; @@ -1176,7 +1196,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 59f37205f4127..a0f2115eb9d72 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -69,7 +69,7 @@ def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; } def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; } def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; } def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; } -def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_FPUnit]> { let Latency = 3; } +def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; } def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -251,7 +251,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], @@ -413,13 +413,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex 
"MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -446,7 +446,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -544,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -560,36 +561,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex 
"MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -659,7 +668,7 @@ def : InstRW<[FXU], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -670,14 +679,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -818,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -829,7 +838,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -851,10 +860,10 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], 
(instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; -def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>; // --------------------- Hexadecimal floating point ------------------------- // @@ -938,16 +947,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -987,16 +1000,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, 
Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -1007,11 +1025,11 @@ def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$") // Convert from / to zoned def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>; def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>; -def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; +def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1023,7 +1041,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1054,15 +1072,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1071,15 +1089,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1090,19 +1108,20 
@@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1148,16 +1167,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1169,7 +1189,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1206,7 +1226,7 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; @@ -1221,7 +1241,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : 
InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 7391df8342efd..13ceb371a425e 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; + case SystemZ::WFASB: + Changed |= shortenOn001AddCC(MI, SystemZ::AEBR); + break; + case SystemZ::WFDDB: Changed |= shortenOn001(MI, SystemZ::DDBR); break; + case SystemZ::WFDSB: + Changed |= shortenOn001(MI, SystemZ::DEBR); + break; + case SystemZ::WFIDB: Changed |= shortenFPConv(MI, SystemZ::FIDBRA); break; + case SystemZ::WFISB: + Changed |= shortenFPConv(MI, SystemZ::FIEBRA); + break; + case SystemZ::WLDEB: Changed |= shortenOn01(MI, SystemZ::LDEBR); break; @@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MDBR); break; + case SystemZ::WFMSB: + Changed |= shortenOn001(MI, SystemZ::MEEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; + case SystemZ::WFLCSB: + Changed |= shortenOn01(MI, SystemZ::LCDFR_32); + break; + case SystemZ::WFLNDB: Changed |= shortenOn01(MI, SystemZ::LNDFR); break; + case SystemZ::WFLNSB: + Changed |= shortenOn01(MI, SystemZ::LNDFR_32); + break; + case SystemZ::WFLPDB: Changed |= shortenOn01(MI, SystemZ::LPDFR); break; + case SystemZ::WFLPSB: + Changed |= shortenOn01(MI, SystemZ::LPDFR_32); + break; + case SystemZ::WFSQDB: Changed |= shortenOn01(MI, SystemZ::SQDBR); break; + case SystemZ::WFSQSB: + Changed |= shortenOn01(MI, SystemZ::SQEBR); + break; + case SystemZ::WFSDB: Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; + case SystemZ::WFSSB: + Changed |= shortenOn001AddCC(MI, SystemZ::SEBR); + break; + case SystemZ::WFCDB: Changed |= shortenOn01(MI, SystemZ::CDBR); break; + case SystemZ::WFCSB: + Changed |= shortenOn01(MI, SystemZ::CEBR); + break; + case SystemZ::VL32: // For z13 we prefer LDE over LE to avoid partial register dependencies. 
Changed |= shortenOn0(MI, SystemZ::LDE32); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb4a0962f7eb2..9cd09b0f911e0 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -47,6 +47,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), + HasMiscellaneousExtensions2(false), HasGuardedStorage(false), + HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), + HasVectorEnhancements1(false), HasVectorPackedDecimal(false), + HasInsertReferenceBitsMultiple(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index b05a1bb6cafd0..4829f73e080e2 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,6 +56,13 @@ protected: bool HasLoadAndZeroRightmostByte; bool HasMessageSecurityAssist5; bool HasDFPPackedConversion; + bool HasMiscellaneousExtensions2; + bool HasGuardedStorage; + bool HasMessageSecurityAssist7; + bool HasMessageSecurityAssist8; + bool HasVectorEnhancements1; + bool HasVectorPackedDecimal; + bool HasInsertReferenceBitsMultiple; private: Triple TargetTriple; @@ -168,6 +175,33 @@ public: // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } + // Return true if the target has the miscellaneous-extensions facility 2. + bool hasMiscellaneousExtensions2() const { + return HasMiscellaneousExtensions2; + } + + // Return true if the target has the guarded-storage facility. + bool hasGuardedStorage() const { return HasGuardedStorage; } + + // Return true if the target has the message-security-assist + // extension facility 7. + bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; } + + // Return true if the target has the message-security-assist + // extension facility 8. + bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; } + + // Return true if the target has the vector-enhancements facility 1. + bool hasVectorEnhancements1() const { return HasVectorEnhancements1; } + + // Return true if the target has the vector-packed-decimal facility. + bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; } + + // Return true if the target has the insert-reference-bits-multiple facility. + bool hasInsertReferenceBitsMultiple() const { + return HasInsertReferenceBitsMultiple; + } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. 
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index cb81c0e5276e7..025bf73d2df0d 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -143,8 +143,10 @@ public: } // end anonymous namespace void SystemZPassConfig::addIRPasses() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { addPass(createSystemZTDCPass()); + addPass(createLoopDataPrefetchPass()); + } TargetPassConfig::addIRPasses(); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 9ac768b2189d7..506dc74279932 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -372,6 +372,9 @@ int SystemZTTIImpl::getArithmeticInstrCost( Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { switch (ScalarBits) { case 32: { + // The vector enhancements facility 1 provides v4f32 instructions. + if (ST->hasVectorEnhancements1()) + return NumVectors; // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6923fc6fc9104..a0c6fa94f8c1e 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,6 +56,10 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; + unsigned getCacheLineSize() { return 256; } + unsigned getPrefetchDistance() { return 2000; } + unsigned getMinPrefetchStride() { return 2048; } + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149ba..6e08d4cff6eaf 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -37,6 +37,7 @@ endif() set(sources X86AsmPrinter.cpp X86CallFrameOptimization.cpp + X86CmovConversion.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 19c93cfff0fe9..91201d1fec85a 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -83,6 +83,9 @@ FunctionPass *createX86WinEHStatePass(); /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); +/// This pass converts X86 cmov instructions into branch when profitable. +FunctionPass *createX86CmovConverterPass(); + /// Return a Machine IR pass that selectively replaces /// certain byte and word instructions by equivalent 32 bit instructions, /// in order to eliminate partial register usage, false dependences on diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 4ca57fe9fb00f..54eabeac51264 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -814,10 +814,8 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; -// TODO: The scheduler model falls to BTVER2 model. -// The znver1 model has to be put in place. 
-// Zen -def: ProcessorModel<"znver1", BtVer2Model, [ +// Znver1 +def: ProcessorModel<"znver1", Znver1Model, [ FeatureADX, FeatureAES, FeatureAVX2, diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6decb550ad5f8..26461986427d2 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -448,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>, // Handle explicit CC selection - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, // Handle Vectorcall CC @@ -1004,7 +1004,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp new file mode 100644 index 0000000000000..bfc834435de55 --- /dev/null +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -0,0 +1,611 @@ +//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a pass that converts X86 cmov instructions into +/// branches when profitable. This pass is conservative, i.e., it applies the +/// transformation if and only if it can guarantee a gain with high confidence. +/// +/// Thus, the optimization applies under the following conditions: +/// 1. Consider as candidates only CMOVs in innermost loops, assuming that +/// most hotspots are represented by these loops. +/// 2. Given a group of CMOV instructions that use the same EFLAGS def +/// instruction: +/// a. Consider them as candidates only if all have the same condition code +/// or the opposite one, to prevent generating more than one conditional +/// jump per EFLAGS def instruction. +/// b. Consider them as candidates only if all are profitable to be +/// converted, assuming that one bad conversion may cause a degradation. +/// 3. Apply the conversion only to loops that are found profitable and only +/// to CMOV candidates that were found profitable. +/// a. A loop is considered profitable only if the conversion will reduce +/// its depth cost by some threshold. +/// b. A CMOV is considered profitable if the cost of its condition is +/// higher than the average cost of its true-value and false-value by +/// 25% of the branch-misprediction penalty; this ensures no degradation +/// even with 25% branch misprediction. +/// +/// Note: This pass is assumed to run on SSA machine code.
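(Aside, not part of the patch: a conceptual, source-level sketch of the kind of code this pass targets. The function and names below are illustrative only; the pass itself identifies candidates on machine IR, not on C++ source.)

    #include <cstddef>
    #include <vector>

    // Hypothetical innermost hot loop. The select typically lowers to a CMOV on
    // x86, whose result waits on both A[I] and B[I]. Converted to a branch, a
    // well-predicted Pred[I] lets execution continue with only the chosen
    // operand, which is roughly what the depth-based profitability checks in
    // this file try to quantify. Assumes A and B are at least as long as Pred.
    int select_sum(const std::vector<int> &A, const std::vector<int> &B,
                   const std::vector<int> &Pred) {
      int Sum = 0;
      for (std::size_t I = 0; I < Pred.size(); ++I)
        Sum += Pred[I] ? A[I] : B[I];
      return Sum;
    }
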
+//===----------------------------------------------------------------------===// +// +// External interfaces: +// FunctionPass *llvm::createX86CmovConverterPass(); +// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF); +// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-cmov-converter" + +STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups"); +STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates"); +STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops"); +STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups"); + +namespace { +// This internal switch can be used to turn off the cmov/branch optimization. +static cl::opt<bool> + EnableCmovConverter("x86-cmov-converter", cl::desc("Enable the X86 cmov-to-branch optimization."), + cl::init(true), cl::Hidden); + +/// Converts X86 cmov instructions into branches when profitable. +class X86CmovConverterPass : public MachineFunctionPass { +public: + X86CmovConverterPass() : MachineFunctionPass(ID) {} + ~X86CmovConverterPass() {} + + StringRef getPassName() const override { return "X86 cmov Conversion"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + /// Pass identification, replacement for typeid. + static char ID; + + const MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + TargetSchedModel TSchedModel; + + /// List of consecutive CMOV instructions. + typedef SmallVector<MachineInstr *, 2> CmovGroup; + typedef SmallVector<CmovGroup, 2> CmovGroups; + + /// Collect all CMOV-group-candidates in \p CurrLoop and update \p + /// CmovInstGroups accordingly. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff it found any CMOV-group-candidate. + bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups); + + /// Check if it is profitable to transform each CMOV-group-candidate into a + /// branch. Remove all groups that are not profitable from \p CmovInstGroups. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff any CMOV-group-candidate remains. + bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups); + + /// Convert the given list of consecutive CMOV instructions into a branch. + /// + /// \param Group Consecutive CMOV instructions to be converted into a branch.
+ void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const; +}; + +char X86CmovConverterPass::ID = 0; + +void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineLoopInfo>(); +} + +bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + if (!EnableCmovConverter) + return false; + + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << "**********\n"); + + bool Changed = false; + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = STI.getInstrInfo(); + TSchedModel.init(STI.getSchedModel(), &STI, TII); + + //===--------------------------------------------------------------------===// + // Algorithm + // --------- + // For each innermost loop + // collectCmovCandidates() { + // Find all CMOV-group-candidates. + // } + // + // checkForProfitableCmovCandidates() { + // * Calculate both loop-depth and optimized-loop-depth. + // * Use these depths to check for loop transformation profitability. + // * Check for CMOV-group-candidate transformation profitability. + // } + // + // For each profitable CMOV-group-candidate + // convertCmovInstsToBranches() { + // * Create FalseBB, SinkBB, Conditional branch to SinkBB. + // * Replace each CMOV instruction with a PHI instruction in SinkBB. + // } + // + // Note: For more details, see each function description. + //===--------------------------------------------------------------------===// + for (MachineBasicBlock &MBB : MF) { + MachineLoop *CurrLoop = MLI.getLoopFor(&MBB); + + // Optimize only innermost loops. + if (!CurrLoop || CurrLoop->getHeader() != &MBB || + !CurrLoop->getSubLoops().empty()) + continue; + + // List of consecutive CMOV instructions to be processed. + CmovGroups CmovInstGroups; + + if (!collectCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + Changed = true; + for (auto &Group : CmovInstGroups) + convertCmovInstsToBranches(Group); + } + return Changed; +} + +bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups) { + //===--------------------------------------------------------------------===// + // Collect all CMOV-group-candidates and add them into CmovInstGroups. + // + // CMOV-group: + // CMOV instructions, in the same MBB, that use the same EFLAGS def instruction. + // + // CMOV-group-candidate: + // CMOV-group where all the CMOV instructions are + // 1. consecutive. + // 2. have the same condition code or the opposite one. + // 3. have only operand registers (X86::CMOVrr). + //===--------------------------------------------------------------------===// + // List of possible improvements (TODOs): + // -------------------------------------- + // TODO: Add support for X86::CMOVrm instructions. + // TODO: Add support for X86::SETcc instructions. + // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions. + //===--------------------------------------------------------------------===// + + // Currently processed CMOV-Group. + CmovGroup Group; + for (auto *MBB : CurrLoop->getBlocks()) { + Group.clear(); + // Condition code of the first CMOV instruction in the currently processed + // range, and its opposite condition code.
+ X86::CondCode FirstCC, FirstOppCC; + // Indicator of a non-CMOVrr instruction in the currently processed range. + bool FoundNonCMOVInst = false; + // Indicator for whether the currently processed CMOV-group should be skipped. + bool SkipGroup = false; + + for (auto &I : *MBB) { + X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + // Check if we found an X86::CMOVrr instruction. + if (CC != X86::COND_INVALID && !I.mayLoad()) { + if (Group.empty()) { + // We found the first CMOV in the range, reset flags. + FirstCC = CC; + FirstOppCC = X86::GetOppositeBranchCondition(CC); + FoundNonCMOVInst = false; + SkipGroup = false; + } + Group.push_back(&I); + // Check if it is a non-consecutive CMOV instruction or it has a different + // condition code than FirstCC or FirstOppCC. + if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC)) + // Set the SkipGroup indicator to skip the currently processed CMOV-Group. + SkipGroup = true; + continue; + } + // If Group is empty, keep looking for the first CMOV in the range. + if (Group.empty()) + continue; + + // We found a non-X86::CMOVrr instruction. + FoundNonCMOVInst = true; + // Check if this instruction defines EFLAGS, to determine the end of the processed + // range, as there would be no more instructions using the current EFLAGS def. + if (I.definesRegister(X86::EFLAGS)) { + // Check if the currently processed CMOV-group should not be skipped and add + // it as a CMOV-group-candidate. + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + Group.clear(); + } + } + // The end of the basic block is considered the end of the range; check if the + // currently processed CMOV-group should not be skipped and add it as a CMOV-group-candidate. + if (Group.empty()) + continue; + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + } + + NumOfCmovGroupCandidate += CmovInstGroups.size(); + return !CmovInstGroups.empty(); +} + +/// \returns Depth of the CMOV instruction as if it were converted into a branch. +/// \param TrueOpDepth depth cost of CMOV true value operand. +/// \param FalseOpDepth depth cost of CMOV false value operand. +static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) { + //===--------------------------------------------------------------------===// + // With no info about branch weight, we assume 50% for each value operand. + // Thus, the depth of the optimized CMOV instruction is the rounded-up average + // of its True-Operand-Value-Depth and False-Operand-Value-Depth. + //===--------------------------------------------------------------------===// + return (TrueOpDepth + FalseOpDepth + 1) / 2; +} + +bool X86CmovConverterPass::checkForProfitableCmovCandidates( + MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) { + struct DepthInfo { + /// Depth of original loop. + unsigned Depth; + /// Depth of optimized loop. + unsigned OptDepth; + }; + /// Number of loop iterations for which to calculate instruction depth. + static const unsigned LoopIterations = 2; + DenseMap<MachineInstr *, DepthInfo> DepthMap; + DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}}; + enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 }; + /// For each register type, maps the register to its last def instruction. + DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum]; + /// Maps a register operand to its def instruction, which can be nullptr if it + /// is unknown (e.g., the operand is defined outside the loop). + DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap; + + // Set depth of unknown instruction (i.e., nullptr) to zero.
+ DepthMap[nullptr] = {0, 0}; + + SmallPtrSet<MachineInstr *, 4> CmovInstructions; + for (auto &Group : CmovInstGroups) + CmovInstructions.insert(Group.begin(), Group.end()); + + //===--------------------------------------------------------------------===// + // Step 1: Calculate instruction depth and loop depth. + // Optimized-Loop: + // loop with CMOV-group-candidates converted into branches. + // + // Instruction-Depth: + // instruction latency + max operand depth. + // * For a CMOV instruction in the optimized loop, the depth is calculated as: + // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth) + // TODO: Find a better way to estimate the latency of the branch instruction + // rather than using the CMOV latency. + // + // Loop-Depth: + // max instruction depth of all instructions in the loop. + // Note: the instruction with max depth represents the critical path in the loop. + // + // Loop-Depth[i]: + // Loop-Depth calculated for the first `i` iterations. + // Note: it is enough to calculate depth for up to two iterations. + // + // Depth-Diff[i]: + // Number of cycles saved in the first `i` iterations by optimizing the loop. + //===--------------------------------------------------------------------===// + for (unsigned I = 0; I < LoopIterations; ++I) { + DepthInfo &MaxDepth = LoopDepth[I]; + for (auto *MBB : CurrLoop->getBlocks()) { + // Clear physical registers Def map. + RegDefMaps[PhyRegType].clear(); + for (MachineInstr &MI : *MBB) { + unsigned MIDepth = 0; + unsigned MIDepthOpt = 0; + bool IsCMOV = CmovInstructions.count(&MI); + for (auto &MO : MI.uses()) { + // Check for "isUse()" since "uses()" also returns implicit definitions. + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + if (MachineInstr *DefMI = RDM.lookup(Reg)) { + OperandToDefMap[&MO] = DefMI; + DepthInfo Info = DepthMap.lookup(DefMI); + MIDepth = std::max(MIDepth, Info.Depth); + if (!IsCMOV) + MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth); + } + } + + if (IsCMOV) + MIDepthOpt = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth, + DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth); + + // Iterate over all operands to handle implicit definitions as well. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + } + + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency}; + MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth); + MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt); + } + } + } + + unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth, + LoopDepth[1].Depth - LoopDepth[1].OptDepth}; + + //===--------------------------------------------------------------------===// + // Step 2: Check if the loop is worth optimizing. + // Worth-Optimize-Loop: + // case 1: Diff[1] == Diff[0] + // The critical path is iteration independent - there is no dependency + // of critical-path instructions on critical-path instructions of the + // previous iteration. + // Thus, it is enough to check the gain percentage of the 1st iteration - + // To be conservative, the optimized loop needs to have a depth at least + // 12.5% smaller than the original loop, per iteration.
+ // + // case 2: Diff[1] > Diff[0] + // The critical path is iteration dependent - there is a dependency of + // critical-path instructions on critical-path instructions of the + // previous iteration. + // Thus, it is required to check the gradient of the gain - the + // change in Depth-Diff compared to the change in Loop-Depth between the + // 1st and 2nd iterations. + // To be conservative, the gradient needs to be at least 50%. + // + // If the loop is not worth optimizing, remove all CMOV-group-candidates. + //===--------------------------------------------------------------------===// + bool WorthOptLoop = false; + if (Diff[1] == Diff[0]) + WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; + else if (Diff[1] > Diff[0]) + WorthOptLoop = + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + + if (!WorthOptLoop) + return false; + + ++NumOfLoopCandidate; + + //===--------------------------------------------------------------------===// + // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing. + // Worth-Optimize-Group: + // Iff it is worth optimizing all CMOV instructions in the group. + // + // Worth-Optimize-CMOV: + // A predicted branch is faster than a CMOV by the difference between the depth + // of the condition operand and the depth of the taken (predicted) value operand. + // To be conservative, the gain of such a CMOV transformation should cover + // at least 25% of the branch-misprediction penalty. + //===--------------------------------------------------------------------===// + unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; + CmovGroups TempGroups; + std::swap(TempGroups, CmovInstGroups); + for (auto &Group : TempGroups) { + bool WorthOpGroup = true; + for (auto *MI : Group) { + // Avoid a CMOV instruction whose value is used as a pointer to load from. + // This is another conservative check to avoid converting a CMOV instruction + // used in a tree-search-like algorithm, where the branch is unpredictable. + auto UIs = MRI->use_instructions(MI->defs().begin()->getReg()); + if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) { + unsigned Op = UIs.begin()->getOpcode(); + if (Op == X86::MOV64rm || Op == X86::MOV32rm) { + WorthOpGroup = false; + break; + } + } + + unsigned CondCost = + DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + unsigned ValCost = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, + DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); + if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) { + WorthOpGroup = false; + break; + } + } + + if (WorthOpGroup) + CmovInstGroups.push_back(Group); + } + + return !CmovInstGroups.empty(); +} + +static bool checkEFLAGSLive(MachineInstr *MI) { + if (MI->killsRegister(X86::EFLAGS)) + return false; + + // The EFLAGS operand of MI might be missing a kill marker. + // Figure out whether the EFLAGS operand should be live after the MI instruction. + MachineBasicBlock *BB = MI->getParent(); + MachineBasicBlock::iterator ItrMI = MI; + + // Scan forward through BB for a use/def of EFLAGS. + for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) { + if (I->readsRegister(X86::EFLAGS)) + return true; + if (I->definesRegister(X86::EFLAGS)) + return false; + } + + // We hit the end of the block, check whether EFLAGS is live into a successor.
+ for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { + if ((*I)->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + +void X86CmovConverterPass::convertCmovInstsToBranches( + SmallVectorImpl<MachineInstr *> &Group) const { + assert(!Group.empty() && "No CMOV instructions to convert"); + ++NumOfOptimizedCmovGroups; + + // To convert a CMOVcc instruction, we actually have to insert the diamond + // control-flow pattern. The incoming instruction knows the destination vreg + // to set, the condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. + + // Before + // ----- + // MBB: + // cond = cmp ... + // v1 = CMOVge t1, f1, cond + // v2 = CMOVlt t2, f2, cond + // v3 = CMOVge v1, f3, cond + // + // After + // ----- + // MBB: + // cond = cmp ... + // jge %SinkMBB + // + // FalseMBB: + // jmp %SinkMBB + // + // SinkMBB: + // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] + // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch + // ; true-value with false-value + // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use + // ; previous Phi instruction result + + MachineInstr &MI = *Group.front(); + MachineInstr *LastCMOV = Group.back(); + DebugLoc DL = MI.getDebugLoc(); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction::iterator It = ++MBB->getIterator(); + MachineFunction *F = MBB->getParent(); + const BasicBlock *BB = MBB->getBasicBlock(); + + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB); + F->insert(It, FalseMBB); + F->insert(It, SinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + if (checkEFLAGSLive(LastCMOV)) { + FalseMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to SinkMBB. + SinkMBB->splice(SinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Add the false and sink blocks as its successors. + MBB->addSuccessor(FalseMBB); + MBB->addSuccessor(SinkMBB); + + // Create the conditional branch instruction. + BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + + // Add the sink block to the false block successors. + FalseMBB->addSuccessor(SinkMBB); + + MachineInstrBuilder MIB; + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. 
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are processing is the opposite condition from the jump we + // generated, then we have to swap the operands for the PHI that is going to + // be generated. + if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + std::swap(Op1Reg, Op2Reg); + + auto Op1Itr = RegRewriteTable.find(Op1Reg); + if (Op1Itr != RegRewriteTable.end()) + Op1Reg = Op1Itr->second.first; + + auto Op2Itr = RegRewriteTable.find(Op2Reg); + if (Op2Itr != RegRewriteTable.end()) + Op2Reg = Op2Itr->second.second; + + // SinkMBB: + // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ] + // ... + MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(MBB); + (void)MIB; + DEBUG(dbgs() << "\tFrom: "; MIIt->dump()); + DEBUG(dbgs() << "\tTo: "; MIB->dump()); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // Now remove the CMOV(s). + MBB->erase(MIItBegin, MIItEnd); +} + +} // End anonymous namespace. + +FunctionPass *llvm::createX86CmovConverterPass() { + return new X86CmovConverterPass(); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ee9e78146305d..527e5d568ac6f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1187,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && CC != CallingConv::X86_64_SysV && - CC != CallingConv::X86_64_Win64) + CC != CallingConv::Win64) return false; // Don't handle popping bytes if they don't fit the ret's immediate. @@ -3171,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: break; } diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index c28746f96439b..95c6f2a3fa342 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -22,7 +22,7 @@ /// instructions and register-to-register moves. It would /// seem like cmov(s) would also be affected, but because of the way cmov is /// really implemented by most machines as reading both the destination and -/// and source regsters, and then "merging" the two based on a condition, +/// and source registers, and then "merging" the two based on a condition, /// it really already should be considered as having a true dependence on the /// destination register as well. /// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 65486cf7f529e..44eecd664714a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Custom); } + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. @@ -1663,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -2661,7 +2669,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -20188,7 +20196,10 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); @@ -20210,7 +20221,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -20235,7 +20249,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -20254,7 +20271,10 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. 
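The gather/scatter/prefetch changes above all replace an asserting cast<> with dyn_cast<> plus an early return, so a non-constant scale operand simply falls back to generic handling instead of crashing. A small stand-alone sketch of that guard pattern (plain C++ with dynamic_cast and stub types standing in for the LLVM RTTI helpers and SDValue; the names here are hypothetical):

#include <cassert>

struct SDNodeStub { virtual ~SDNodeStub() = default; };
struct ConstantStub : SDNodeStub { long Value = 0; };

// Returns 0 (an "empty" result) when the scale operand is not a constant,
// mirroring the new `if (!C) return SDValue();` early exit; the old cast<>
// form would have asserted on this input instead.
long lowerWithConstantScale(const SDNodeStub *ScaleOp) {
  const auto *C = dynamic_cast<const ConstantStub *>(ScaleOp);
  if (!C)
    return 0;              // bail out, let generic lowering handle it
  return C->Value * 8;     // pretend we emit a node using the constant
}

int main() {
  ConstantStub K; K.Value = 2;
  SDNodeStub NotConstant;
  assert(lowerWithConstantScale(&K) == 16);
  assert(lowerWithConstantScale(&NotConstant) == 0);
  return 0;
}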
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -22665,10 +22685,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. + return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. @@ -22683,7 +22724,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -24030,7 +24071,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cc5c09cbf0e5e..705d0f7a5cf7d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1759,29 +1759,29 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))))), (i64 
0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, _.RC:$src1, addr:$src2), NewInf.KRC)>; } @@ -1798,7 +1798,7 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -1879,7 +1879,7 @@ defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm, defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm, "VPCMPEQQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, "VPCMPEQQZ", [HasAVX512]>; defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm, "VPCMPEQQZ", [HasAVX512]>; @@ -2127,17 +2127,17 @@ multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo Ne list<Predicate> Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2145,37 +2145,37 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, + _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc), NewInf.KRC)>; } } - + multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, SDNode OpNode, string InstrStr, - list<Predicate> Preds> + list<Predicate> Preds> : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), @@ -2187,7 +2187,7 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -2447,17 +2447,17 @@ multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo Ne string InstrStr, list<Predicate> Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + 
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.KVT (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2477,19 +2477,19 @@ let Predicates = Preds in { NewInf.KRC)>; } } - + multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - string InstrStr, list<Predicate> Preds> + string InstrStr, list<Predicate> Preds> : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { let Predicates = Preds in def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc, (i32 FROUND_NO_EXC))), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; @@ -2817,16 +2817,16 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; } @@ -3036,7 +3036,7 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr) @@ -3044,8 +3044,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk) @@ -3063,7 +3063,7 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri) @@ -3072,8 +3072,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), imm:$cc), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik) @@ 
-3379,35 +3379,35 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, "VMOVDQA32">, + HasAVX512, "VMOVDQA32">, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, "VMOVDQA64">, + HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16">, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512, "VMOVDQU32">, + HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512, "VMOVDQU64">, + HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need @@ -3964,49 +3964,49 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in { - def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrr">; let Constraints = "$src0 = $dst" in - def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrk">; - - def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrkz">; - def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrr">; let Constraints = "$src0 = $dst" in - def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrk">; + VEX_W, 
FoldGenData<"VMOVSDZrrk">; - def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.KRCWM:$mask, VR128X:$src1, + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrrkz">; } @@ -5676,6 +5676,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 7e4cba1c8345f..343da2573b55c 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -224,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; @@ -334,7 +334,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; @@ -450,7 +450,7 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index a12fa68faf4f1..d831a7974359a 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -663,5 +663,6 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" 
include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index ed53893b779ce..9dcc968a1a7af 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -371,6 +371,22 @@ def : WriteRes<WriteFence, [JSAGU]>; def : WriteRes<WriteNop, []>; //////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteEXTRQ: SchedWriteRes<[JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>; + +def WriteINSERTQ: SchedWriteRes<[JFPU01]> { + let Latency = 2; + let ResourceCycles = [4]; +} +def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; + +//////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 0000000000000..d5b4cfe2ddee0 --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
+ +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance<ReadAfterLd, 4>; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. 
+ def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes<WriteRMW, [ZnAGU]>; + +def : WriteRes<WriteStore, [ZnAGU]>; +def : WriteRes<WriteMove, [ZnALU]>; +def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; } + +def : WriteRes<WriteZero, []>; +def : WriteRes<WriteLEA, [ZnALU]>; +defm : ZnWriteResPair<WriteALU, ZnALU, 1>; +defm : ZnWriteResPair<WriteShift, ZnALU, 1>; +defm : ZnWriteResPair<WriteJump, ZnALU, 1>; + +// IDIV +def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ + let Latency = 4; +} +def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> { + let Latency = 4; +} + +def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>; +defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; +defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; +defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; +defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>; +defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>; + +// Vector Shift Operations +defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>; + +// AES Instructions. +defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>; + +def : WriteRes<WriteFence, [ZnAGU]>; +def : WriteRes<WriteNop, []>; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. 
+defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes<WriteMicrocoded, []>; + def : WriteRes<WriteSystem, []>; + def : WriteRes<WriteMPSAD, []>; + def : WriteRes<WriteMPSADLd, []>; + def : WriteRes<WriteCLMul, []>; + def : WriteRes<WriteCLMulLd, []>; + def : WriteRes<WritePCmpIStrM, []>; + def : WriteRes<WritePCmpIStrMLd, []>; + def : WriteRes<WritePCmpEStrI, []>; + def : WriteRes<WritePCmpEStrILd, []>; + def : WriteRes<WritePCmpEStrM, []>; + def : WriteRes<WritePCmpEStrMLd, []>; + def : WriteRes<WritePCmpIStrI, []>; + def : WriteRes<WritePCmpIStrILd, []>; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fa0afe29586b4..427a0001bef98 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -597,7 +597,7 @@ public: case CallingConv::Intel_OCL_BI: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return true; // This convention allows using the SysV convention on Windows targets. case CallingConv::X86_64_SysV: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8d891c983fab0..08c2cdaefe71d 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -375,6 +375,7 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); + addPass(createX86CmovConverterPass()); return true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index aaa6d58bd1340..c16207973b393 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,8 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; |
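The X86TargetMachine.h hunk above documents why the no-argument getSubtargetImpl is declared deleted: any remaining caller becomes a compile error and is forced onto the per-Function overload. A simplified sketch of that idiom (class and member names are stand-ins, not the real X86TargetMachine interface):

struct Function {};
struct Subtarget {};

class TargetMachineStub {
public:
  const Subtarget *getSubtargetImpl(const Function &F) const {
    (void)F;
    return &ST;
  }
  // Deprecated form: kept visible in the interface, but unusable.
  const Subtarget *getSubtargetImpl() const = delete;

private:
  Subtarget ST;
};

int main() {
  TargetMachineStub TM;
  Function F;
  const Subtarget *S = TM.getSubtargetImpl(F); // OK
  // TM.getSubtargetImpl();                    // error: use of deleted function
  (void)S;
  return 0;
}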