35 files changed, 1133 insertions, 341 deletions
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 33680789ee08..bde067d6c129 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -425,7 +425,7 @@ public:
   SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
                    const MCInstrInfo &MII,
                    const MCTargetOptions &Options)
-    : MCTargetAsmParser(Options, sti), Parser(parser) {
+    : MCTargetAsmParser(Options, sti, MII), Parser(parser) {
     MCAsmParserExtension::Initialize(Parser);
 
     // Alias the .word directive to .short.
@@ -543,6 +543,7 @@ public:
 #define GET_REGISTER_MATCHER
 #define GET_SUBTARGET_FEATURE_NAME
 #define GET_MATCHER_IMPLEMENTATION
+#define GET_MNEMONIC_SPELL_CHECKER
 #include "SystemZGenAsmMatcher.inc"
 
 // Used for the .insn directives; contains information needed to parse the
@@ -1168,7 +1169,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
   return false;
 }
 
-std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS);
+static std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS,
+                                             unsigned VariantID = 0);
 
 bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                                OperandVector &Operands,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 51ac410a9c81..e035c3b87a40 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -66,7 +66,8 @@ public:
     llvm_unreachable("SystemZ does do not have assembler relaxation");
   }
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
-  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+  std::unique_ptr<MCObjectWriter>
+  createObjectWriter(raw_pwrite_stream &OS) const override {
     return createSystemZObjectWriter(OS, OSABI);
   }
 };
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index df0a8161e6e7..238926d6c8e0 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cassert>
@@ -160,8 +161,8 @@ unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
   }
 }
 
-MCObjectWriter *llvm::createSystemZObjectWriter(raw_pwrite_stream &OS,
-                                                uint8_t OSABI) {
-  MCELFObjectTargetWriter *MOTW = new SystemZObjectWriter(OSABI);
-  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+std::unique_ptr<MCObjectWriter>
+llvm::createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
+  return createELFObjectWriter(llvm::make_unique<SystemZObjectWriter>(OSABI),
+                               OS, /*IsLittleEndian=*/false);
 }
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 727ab921daf9..05688ed8efbb 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -173,43 +173,6 @@ createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
   return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
 }
 
-static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
-                              CodeModel::Model &CM) {
-  // For SystemZ we define the models as follows:
-  //
-  // Small:  BRASL can call any function and will use a stub if necessary.
-  //         Locally-binding symbols will always be in range of LARL.
-  //
-  // Medium: BRASL can call any function and will use a stub if necessary.
-  //         GOT slots and locally-defined text will always be in range
-  //         of LARL, but other symbols might not be.
-  //
-  // Large:  Equivalent to Medium for now.
-  //
-  // Kernel: Equivalent to Medium for now.
-  //
-  // This means that any PIC module smaller than 4GB meets the
-  // requirements of Small, so Small seems like the best default there.
-  //
-  // All symbols bind locally in a non-PIC module, so the choice is less
-  // obvious.  There are two cases:
-  //
-  // - When creating an executable, PLTs and copy relocations allow
-  //   us to treat external symbols as part of the executable.
-  //   Any executable smaller than 4GB meets the requirements of Small,
-  //   so that seems like the best default.
-  //
-  // - When creating JIT code, stubs will be in range of BRASL if the
-  //   image is less than 4GB in size.  GOT entries will likewise be
-  //   in range of LARL.  However, the JIT environment has no equivalent
-  //   of copy relocs, so locally-binding data symbols might not be in
-  //   the range of LARL.  We need the Medium model in that case.
-  if (CM == CodeModel::Default)
-    CM = CodeModel::Small;
-  else if (CM == CodeModel::JITDefault)
-    CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
-}
-
 static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
                                                  unsigned SyntaxVariant,
                                                  const MCAsmInfo &MAI,
@@ -223,10 +186,6 @@ extern "C" void LLVMInitializeSystemZTargetMC() {
   TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(),
                                     createSystemZMCAsmInfo);
 
-  // Register the adjustCodeGenOpts.
-  TargetRegistry::registerMCAdjustCodeGenOpts(getTheSystemZTarget(),
-                                              adjustCodeGenOpts);
-
   // Register the MCCodeEmitter.
   TargetRegistry::RegisterMCCodeEmitter(getTheSystemZTarget(),
                                         createSystemZMCCodeEmitter);
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index dbca3485290a..99b157e37275 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -12,6 +12,8 @@
 
 #include "llvm/Support/DataTypes.h"
 
+#include <memory>
+
 namespace llvm {
 
 class MCAsmBackend;
@@ -91,7 +93,8 @@ MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
                                         const Triple &TT, StringRef CPU,
                                         const MCTargetOptions &Options);
 
-MCObjectWriter *createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+std::unique_ptr<MCObjectWriter> createSystemZObjectWriter(raw_pwrite_stream &OS,
+                                                          uint8_t OSABI);
 } // end namespace llvm
 
 // Defines symbolic names for SystemZ registers.
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index 41300a1b6295..06905fb41e44 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -58,7 +58,7 @@ include "SystemZInstrHFP.td"
 include "SystemZInstrDFP.td"
 include "SystemZInstrSystem.td"
 
-def SystemZInstrInfo : InstrInfo {}
+def SystemZInstrInfo : InstrInfo { let guessInstructionProperties = 0; }
 
 //===----------------------------------------------------------------------===//
 // Assembly parser
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index d70f9e90cd3e..55f7a7b8d0d1 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -25,9 +25,9 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdint>
 
@@ -110,12 +110,8 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) {
   return false;
 }
 
-// Return true if any CC result of MI would reflect the value of Reg.
-static bool resultTests(MachineInstr &MI, unsigned Reg) {
-  if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() &&
-      MI.getOperand(0).isDef() && MI.getOperand(0).getReg() == Reg)
-    return true;
-
+// Returns true if MI is an instruction whose output equals the value in Reg.
+static bool preservesValueOf(MachineInstr &MI, unsigned Reg) {
   switch (MI.getOpcode()) {
   case SystemZ::LR:
   case SystemZ::LGR:
@@ -136,6 +132,16 @@ static bool resultTests(MachineInstr &MI, unsigned Reg) {
   return false;
 }
 
+// Return true if a CC result of MI would (perhaps after conversion) reflect the
+// value of Reg: MI defines Reg in operand 0 or preservesValueOf() holds.
+static bool resultTests(MachineInstr &MI, unsigned Reg) {
+  // Operand 0 is a register def of Reg itself.
+  bool DefinesReg = MI.getNumOperands() > 0 && MI.getOperand(0).isReg() &&
+                    MI.getOperand(0).isDef() &&
+                    MI.getOperand(0).getReg() == Reg;
+  return DefinesReg || preservesValueOf(MI, Reg);
+}
+
 // Describe the references to Reg or any of its aliases in MI.
 Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
   Reference Ref;
@@ -421,11 +427,34 @@ bool SystemZElimCompare::optimizeCompareZero(
     }
     SrcRefs |= getRegReferences(MI, SrcReg);
     if (SrcRefs.Def)
-      return false;
+      break;
     CCRefs |= getRegReferences(MI, SystemZ::CC);
     if (CCRefs.Use && CCRefs.Def)
+      break;
+  }
+
+  // Also do a forward search to handle cases where an instruction after the
+  // compare can be converted like
+  //
+  // LTEBRCompare %f0s, %f0s, implicit-def %cc
+  // %f2s = LER %f0s
+  //
+  MBBI = Compare, MBBE = MBB.end();
+  while (++MBBI != MBBE) {
+    MachineInstr &MI = *MBBI;
+    if (preservesValueOf(MI, SrcReg)) {
+      // Try to eliminate Compare by reusing a CC result from MI.
+      if (convertToLoadAndTest(MI)) {
+        EliminatedComparisons += 1;
+        return true;
+      }
+    }
+    if (getRegReferences(MI, SrcReg).Def)
+      return false;
+    if (getRegReferences(MI, SystemZ::CC))
       return false;
   }
+
   return false;
 }
 
@@ -564,7 +593,7 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
 }
 
 bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
-  if (skipFunction(*F.getFunction()))
+  if (skipFunction(F.getFunction()))
     return false;
 
   TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 0cb2b5a14ce7..b600aa61cd0b 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -71,7 +71,7 @@ void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   bool HasFP = hasFP(MF);
   SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
-  bool IsVarArg = MF.getFunction()->isVarArg();
+  bool IsVarArg = MF.getFunction().isVarArg();
 
   // va_start stores incoming FPR varargs in the normal way, but delegates
   // the saving of incoming GPR varargs to spillCalleeSavedRegisters().
@@ -139,7 +139,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
-  bool IsVarArg = MF.getFunction()->isVarArg();
+  bool IsVarArg = MF.getFunction().isVarArg();
   DebugLoc DL;
 
   // Scan the call-saved GPRs and find the bounds of the register spill area.
@@ -220,7 +220,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 bool SystemZFrameLowering::
 restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
-                            const std::vector<CalleeSavedInfo> &CSI,
+                            std::vector<CalleeSavedInfo> &CSI,
                             const TargetRegisterInfo *TRI) const {
   if (CSI.empty())
     return false;
@@ -374,7 +374,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
   uint64_t StackSize = getAllocatedStackSize(MF);
   if (StackSize) {
     // Determine if we want to store a backchain.
-    bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+    bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
 
     // If we need backchain, save current stack pointer.  R1 is free at this
     // point.
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index d43a176ad874..a75d111b0294 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -11,7 +11,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
 
 #include "llvm/ADT/IndexedMap.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class SystemZTargetMachine;
@@ -35,7 +35,7 @@ public:
                                  const TargetRegisterInfo *TRI) const override;
   bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBII,
-                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   std::vector<CalleeSavedInfo> &CSI,
                                    const TargetRegisterInfo *TRI) const
     override;
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 73a1036f88e0..f37216022762 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -19,6 +19,13 @@
 // * Processor resources usage. It is beneficial to balance the use of
 // resources.
 //
+// A goal is to consider all instructions, also those outside of any
+// scheduling region. Such instructions are "advanced" past and include
+// single instructions before a scheduling region, branches etc.
+//
+// A block that has only one predecessor continues scheduling with that
+// predecessor's state (which may be updated by emitting branches).
+//
 // ===---------------------------------------------------------------------===//
 
 #include "SystemZHazardRecognizer.h"
@@ -36,13 +43,9 @@ static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
                                             "resources during scheduling."),
                                    cl::init(8));
 
-SystemZHazardRecognizer::
-SystemZHazardRecognizer(const MachineSchedContext *C) : DAG(nullptr),
-                                                        SchedModel(nullptr) {}
-
 unsigned SystemZHazardRecognizer::
 getNumDecoderSlots(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   if (!SC->isValid())
     return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
 
@@ -73,12 +76,13 @@ void SystemZHazardRecognizer::Reset() {
   clearProcResCounters();
   GrpCount = 0;
   LastFPdOpCycleIdx = UINT_MAX;
+  LastEmittedMI = nullptr;
   DEBUG(CurGroupDbg = "";);
 }
 
 bool
 SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   if (!SC->isValid())
     return true;
 
@@ -125,9 +129,9 @@ void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
 #ifndef NDEBUG // Debug output
 void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
   OS << "SU(" << SU->NodeNum << "):";
-  OS << SchedModel->getInstrInfo()->getName(SU->getInstr()->getOpcode());
+  OS << TII->getName(SU->getInstr()->getOpcode());
 
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   if (!SC->isValid())
     return;
   
@@ -200,10 +204,15 @@ void SystemZHazardRecognizer::clearProcResCounters() {
   CriticalResourceIdx = UINT_MAX;
 }
 
+static inline bool isBranchRetTrap(MachineInstr *MI) {
+  const bool IsCondTrap = MI->getOpcode() == SystemZ::CondTrap;
+  return MI->isBranch() || MI->isReturn() || IsCondTrap;
+}
+
 // Update state with SU as the next scheduled unit.
 void SystemZHazardRecognizer::
 EmitInstruction(SUnit *SU) {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   DEBUG( dumpCurrGroup("Decode group before emission"););
 
   // If scheduling an SU that must begin a new decoder group, move on
@@ -218,8 +227,10 @@ EmitInstruction(SUnit *SU) {
            cgd << ", ";
          dumpSU(SU, cgd););
 
+  LastEmittedMI = SU->getInstr();
+
   // After returning from a call, we don't know much about the state.
-  if (SU->getInstr()->isCall()) {
+  if (SU->isCall) {
     DEBUG (dbgs() << "+++ Clearing state after call.\n";);
     clearProcResCounters();
     LastFPdOpCycleIdx = UINT_MAX;
@@ -259,6 +270,9 @@ EmitInstruction(SUnit *SU) {
            << LastFPdOpCycleIdx << "\n";);
   }
 
+  bool GroupEndingBranch =
+    (CurrGroupSize >= 1 && isBranchRetTrap(SU->getInstr()));
+
   // Insert SU into current group by increasing number of slots used
   // in current group.
   CurrGroupSize += getNumDecoderSlots(SU);
@@ -266,12 +280,12 @@ EmitInstruction(SUnit *SU) {
 
   // Check if current group is now full/ended. If so, move on to next
   // group to be ready to evaluate more candidates.
-  if (CurrGroupSize == 3 || SC->EndGroup)
+  if (CurrGroupSize == 3 || SC->EndGroup || GroupEndingBranch)
     nextGroup();
 }
 
 int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   if (!SC->isValid())
     return 0;
   
@@ -315,7 +329,7 @@ int SystemZHazardRecognizer::
 resourcesCost(SUnit *SU) {
   int Cost = 0;
 
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = getSchedClass(SU);
   if (!SC->isValid())
     return 0;
 
@@ -335,3 +349,50 @@ resourcesCost(SUnit *SU) {
   return Cost;
 }
 
+// Emit MI, an instruction outside of any scheduling region, by wrapping it in
+// a temporary SUnit that is passed to EmitInstruction(). If TakenBranch is set
+// and the current decoder group is not empty, the next group is started.
+void SystemZHazardRecognizer::emitInstruction(MachineInstr *MI,
+                                              bool TakenBranch) {
+  // Make a temporary SUnit for MI and record whether it is a call.
+  SUnit TmpSU(MI, 0);
+  TmpSU.isCall = MI->isCall();
+
+  // Flag reserved (BufferSize 0) and unbuffered (BufferSize 1) resources among
+  // the processor resources written by MI.
+  const MCSchedClassDesc *SC = SchedModel->resolveSchedClass(MI);
+  for (const MCWriteProcResEntry &Entry :
+         make_range(SchedModel->getWriteProcResBegin(SC),
+                    SchedModel->getWriteProcResEnd(SC))) {
+    int BufSize = SchedModel->getProcResource(Entry.ProcResourceIdx)->BufferSize;
+    if (BufSize == 0)
+      TmpSU.hasReservedResource = true;
+    else if (BufSize == 1)
+      TmpSU.isUnbuffered = true;
+  }
+
+  EmitInstruction(&TmpSU);
+
+  // After a taken branch, move on to the next decoder group.
+  if (TakenBranch && CurrGroupSize > 0)
+    nextGroup(false /*DbgOutput*/);
+
+  // Only branches, returns and conditional traps are expected as terminators.
+  assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
+          "Scheduler: unhandled terminator!");
+}
+
+// Copy group, FPd and resource state from Incoming; LastEmittedMI is kept.
+void SystemZHazardRecognizer::
+copyState(SystemZHazardRecognizer *Incoming) {
+  const SystemZHazardRecognizer &Pred = *Incoming;
+  // Decoder group and FPd bookkeeping.
+  CurrGroupSize = Pred.CurrGroupSize;
+  GrpCount = Pred.GrpCount;
+  LastFPdOpCycleIdx = Pred.LastFPdOpCycleIdx;
+  DEBUG (CurGroupDbg = Pred.CurGroupDbg;);
+
+  // Processor resource usage.
+  ProcResourceCounters = Pred.ProcResourceCounters;
+  CriticalResourceIdx = Pred.CriticalResourceIdx;
+}
diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.h b/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 0c755c9ad1b9..7e1b5fb2e4fe 100644
--- a/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -19,6 +19,13 @@
 // * Processor resources usage. It is beneficial to balance the use of
 // resources.
 //
+// A goal is to consider all instructions, also those outside of any
+// scheduling region. Such instructions are "advanced" past and include
+// single instructions before a scheduling region, branches etc.
+//
+// A block that has only one predecessor continues scheduling with that
+// predecessor's state (which may be updated by emitting branches).
+//
 // ===---------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
@@ -35,10 +42,12 @@
 
 namespace llvm {
 
-/// SystemZHazardRecognizer maintains the state during scheduling.
+/// SystemZHazardRecognizer maintains the state for one MBB during scheduling.
 class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
 
-  ScheduleDAGMI *DAG;
+#ifndef NDEBUG
+  const SystemZInstrInfo *TII;
+#endif
   const TargetSchedModel *SchedModel;
 
   /// Keep track of the number of decoder slots used in the current
@@ -88,18 +97,34 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
   /// ops, return true if it seems good to schedule an FPd op next.
   bool isFPdOpPreferred_distance(const SUnit *SU);
 
-public:
-  SystemZHazardRecognizer(const MachineSchedContext *C);
+  /// Last emitted instruction or nullptr.
+  MachineInstr *LastEmittedMI;
 
-  void setDAG(ScheduleDAGMI *dag) {
-    DAG = dag;
-    SchedModel = dag->getSchedModel();
+public:
+  SystemZHazardRecognizer(const SystemZInstrInfo *tii,
+                          const TargetSchedModel *SM)
+      :
+#ifndef NDEBUG
+        TII(tii),
+#endif
+        SchedModel(SM) {
+    Reset();
   }
-  
-  HazardType getHazardType(SUnit *m, int Stalls = 0) override;    
+
+  HazardType getHazardType(SUnit *m, int Stalls = 0) override;
   void Reset() override;
   void EmitInstruction(SUnit *SU) override;
 
+  /// Resolve and cache the scheduling class for an SUnit.
+  const MCSchedClassDesc *getSchedClass(SUnit *SU) const {
+    if (!SU->SchedClass && SchedModel->hasInstrSchedModel())
+      SU->SchedClass = SchedModel->resolveSchedClass(SU->getInstr());
+    return SU->SchedClass;
+  }
+
+  /// Wrap a non-scheduled instruction in an SU and emit it.
+  void emitInstruction(MachineInstr *MI, bool TakenBranch = false);
+
   // Cost functions used by SystemZPostRASchedStrategy while
   // evaluating candidates.
 
@@ -121,6 +146,11 @@ public:
   void dumpCurrGroup(std::string Msg = "") const;
   void dumpProcResourceCounters() const;
 #endif
+
+  MachineBasicBlock::iterator getLastEmittedMI() { return LastEmittedMI; }
+
+  /// Copy counters from end of single predecessor.
+  void copyState(SystemZHazardRecognizer *Incoming);
 };
 
 } // namespace llvm
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index cd2f708458bf..ce6f3d37f5c9 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -838,9 +838,16 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
   case ISD::SIGN_EXTEND: {
     // Check that the extension bits are don't-care (i.e. are masked out
     // by the final mask).
+    unsigned BitSize = N.getValueSizeInBits();
     unsigned InnerBitSize = N.getOperand(0).getValueSizeInBits();
-    if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
-      return false;
+    if (maskMatters(RxSBG, allOnes(BitSize) - allOnes(InnerBitSize))) {
+      // In the case where only the sign bit is active, increase Rotate with
+      // the extension width.
+      if (RxSBG.Mask == 1 && RxSBG.Rotate == 1)
+        RxSBG.Rotate += (BitSize - InnerBitSize);
+      else
+        return false;
+    }
 
     RxSBG.Input = N.getOperand(0);
     return true;
@@ -992,7 +999,15 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
   if (Subtarget->hasMiscellaneousExtensions())
     Opcode = SystemZ::RISBGN;
   EVT OpcodeVT = MVT::i64;
-  if (VT == MVT::i32 && Subtarget->hasHighWord()) {
+  if (VT == MVT::i32 && Subtarget->hasHighWord() &&
+      // We can only use the 32-bit instructions if all source bits are
+      // in the low 32 bits without wrapping, both after rotation (because
+      // of the smaller range for Start and End) and before rotation
+      // (because the input value is truncated).
+      RISBG.Start >= 32 && RISBG.End >= RISBG.Start &&
+      ((RISBG.Start + RISBG.Rotate) & 63) >= 32 &&
+      ((RISBG.End + RISBG.Rotate) & 63) >=
+      ((RISBG.Start + RISBG.Rotate) & 63)) {
     Opcode = SystemZ::RISBMux;
     OpcodeVT = MVT::i32;
     RISBG.Start &= 31;
@@ -1255,8 +1270,10 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
     // Fall through.
   or_xor:
     // If this is a 64-bit operation in which both 32-bit halves are nonzero,
-    // split the operation into two.
-    if (Node->getValueType(0) == MVT::i64)
+    // split the operation into two.  If both operands here happen to be
+    // constant, leave this to common code to optimize.
+    if (Node->getValueType(0) == MVT::i64 &&
+        Node->getOperand(0).getOpcode() != ISD::Constant)
       if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
         uint64_t Val = Op1->getZExtValue();
         if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) {
@@ -1379,8 +1396,11 @@ SelectInlineAsmMemoryOperand(const SDValue &Op,
     break;
   case InlineAsm::Constraint_T:
   case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_o:
     // Accept an address with a long displacement and an index.
     // m works the same as T, as this is the most general case.
+    // We don't really have any special handling of "offsettable"
+    // memory addresses, so just treat o the same as m.
     Form = SystemZAddressingMode::FormBDXNormal;
     DispRange = SystemZAddressingMode::Disp20Only;
     break;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2d916d2e1521..adf368319dc3 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include <cctype>
@@ -220,7 +221,17 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Custom);
+
+  // Even though i128 is not a legal type, we still need to custom lower
+  // the atomic operations in order to exploit SystemZ instructions.
+  setOperationAction(ISD::ATOMIC_LOAD,     MVT::i128, Custom);
+  setOperationAction(ISD::ATOMIC_STORE,    MVT::i128, Custom);
+
+  // We can use the CC result of compare-and-swap to implement
+  // the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 
   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 
@@ -586,9 +597,104 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return true;
 }
 
+// Describes which addressing-mode features a memory access may use.
+struct AddressingMode {
+  // Whether a long displacement is supported.
+  bool LongDisplacement;
+
+  // Whether use of an index register is supported.
+  bool IndexReg;
+
+  AddressingMode(bool LongDispl, bool IdxReg)
+      : LongDisplacement(LongDispl), IndexReg(IdxReg) {}
+};
+
+// Return the addressing mode to use for a Load whose single use is a Store
+// in the same block. Ty is the type of the loaded value.
+static AddressingMode getLoadStoreAddrMode(bool HasVector, Type *Ty) {
+  // Without vector support only the MVC case (an i8 access) is special: it
+  // gets neither a long displacement nor an index register.
+  if (!HasVector) {
+    const bool IsMVC = Ty->isIntegerTy(8);
+    return AddressingMode(/*LongDispl=*/!IsMVC, /*IdxReg=*/!IsMVC);
+  }
+
+  // With vector support a Load->Store pair may become either an MVC or vector
+  // operations, and it seems to work best to allow the vector addressing mode.
+  return AddressingMode(/*LongDispl=*/false, /*IdxReg=*/true);
+}
+
+// Return the addressing mode which seems most desirable for the memory access
+// done by instruction I. HasVector tells whether vector support is available.
+static AddressingMode supportedAddressingMode(Instruction *I, bool HasVector) {
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
+    }
+  }
+
+  if (isa<LoadInst>(I) && I->hasOneUse()) {
+    // Asserting cast<>: the user is dereferenced unconditionally below.
+    auto *SingleUser = cast<Instruction>(*I->user_begin());
+    if (SingleUser->getParent() == I->getParent()) {
+      if (isa<ICmpInst>(SingleUser)) {
+        // Comparison of memory with 16 bit signed / unsigned immediate. Query
+        // the APInt directly so values needing more than 64 bits do not assert.
+        if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
+          if (C->getValue().isSignedIntN(16) || C->getValue().isIntN(16))
+            return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
+      } else if (isa<StoreInst>(SingleUser))
+        // Load->Store
+        return getLoadStoreAddrMode(HasVector, I->getType());
+    }
+  } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
+    if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
+      if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
+        // Load->Store
+        return getLoadStoreAddrMode(HasVector, LoadI->getType());
+  }
+
+  if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {
+    // * Use LDE instead of LE/LEY for z13 to avoid partial register
+    //   dependencies (LDE only supports small offsets).
+    // * Utilize the vector registers to hold floating point
+    //   values (vector load / store instructions only support small
+    //   offsets).
+
+    Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
+                         I->getOperand(0)->getType());
+    bool IsFPAccess = MemAccessTy->isFloatingPointTy();
+    bool IsVectorAccess = MemAccessTy->isVectorTy();
+
+    // A store of an extracted vector element will be combined into a VSTE type
+    // instruction.
+    if (!IsVectorAccess && isa<StoreInst>(I)) {
+      Value *DataOp = I->getOperand(0);
+      if (isa<ExtractElementInst>(DataOp))
+        IsVectorAccess = true;
+    }
+
+    // A load which gets inserted into a vector element will be combined into a
+    // VLE type instruction.
+    if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
+      User *LoadUser = *I->user_begin();
+      if (isa<InsertElementInst>(LoadUser))
+        IsVectorAccess = true;
+    }
+
+    if (IsFPAccess || IsVectorAccess)
+      return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
+  }
+
+  return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
+}
+
 bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
-                                                  const AddrMode &AM, Type *Ty,
-                                                  unsigned AS) const {
+       const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const {
   // Punt on globals for now, although they can be used in limited
   // RELATIVE LONG cases.
   if (AM.BaseGV)
@@ -598,48 +704,19 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
   if (!isInt<20>(AM.BaseOffs))
     return false;
 
-  // Indexing is OK but no scale factor can be applied.
-  return AM.Scale == 0 || AM.Scale == 1;
-}
-
-bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
-                                                      int64_t Offset) const {
-  // This only applies to z13.
-  if (!Subtarget.hasVector())
-    return true;
-
-  // * Use LDE instead of LE/LEY to avoid partial register
-  //   dependencies (LDE only supports small offsets).
-  // * Utilize the vector registers to hold floating point
-  //   values (vector load / store instructions only support small
-  //   offsets).
-
-  assert (isa<LoadInst>(I) || isa<StoreInst>(I));
-  Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
-                       I->getOperand(0)->getType());
-  bool IsFPAccess = MemAccessTy->isFloatingPointTy();
-  bool IsVectorAccess = MemAccessTy->isVectorTy();
-
-  // A store of an extracted vector element will be combined into a VSTE type
-  // instruction.
-  if (!IsVectorAccess && isa<StoreInst>(I)) {
-    Value *DataOp = I->getOperand(0);
-    if (isa<ExtractElementInst>(DataOp))
-      IsVectorAccess = true;
-  }
-
-  // A load which gets inserted into a vector element will be combined into a
-  // VLE type instruction.
-  if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
-    User *LoadUser = *I->user_begin();
-    if (isa<InsertElementInst>(LoadUser))
-      IsVectorAccess = true;
-  }
+  AddressingMode SupportedAM(true, true);
+  if (I != nullptr)
+    SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
 
-  if (!isUInt<12>(Offset) && (IsFPAccess || IsVectorAccess))
+  if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
     return false;
 
-  return true;
+  if (!SupportedAM.IndexReg)
+    // No indexing allowed.
+    return AM.Scale == 0;
+  else
+    // Indexing is OK but no scale factor can be applied.
+    return AM.Scale == 0 || AM.Scale == 1;
 }
 
 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
@@ -1767,11 +1844,14 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
                               ISD::SEXTLOAD :
                               ISD::ZEXTLOAD);
   if (C.Op0.getValueType() != MVT::i32 ||
-      Load->getExtensionType() != ExtType)
+      Load->getExtensionType() != ExtType) {
     C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
                            Load->getBasePtr(), Load->getPointerInfo(),
                            Load->getMemoryVT(), Load->getAlignment(),
                            Load->getMemOperand()->getFlags());
+    // Update the chain uses.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
+  }
 
   // Make sure that the second operand is an i32 with the right value.
   if (C.Op1.getValueType() != MVT::i32 ||
@@ -2121,6 +2201,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
       NewC.Op0.getOpcode() == ISD::SHL &&
       isSimpleShift(NewC.Op0, ShiftVal) &&
       (MaskVal >> ShiftVal != 0) &&
+      ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
       (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
                                         MaskVal >> ShiftVal,
                                         CmpVal >> ShiftVal,
@@ -2131,6 +2212,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
              NewC.Op0.getOpcode() == ISD::SRL &&
              isSimpleShift(NewC.Op0, ShiftVal) &&
              (MaskVal << ShiftVal != 0) &&
+             ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
              (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
                                                MaskVal << ShiftVal,
                                                CmpVal << ShiftVal,
@@ -2863,9 +2945,13 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
   // but we need this case for bitcasts that are created during lowering
   // and which are then lowered themselves.
   if (auto *LoadN = dyn_cast<LoadSDNode>(In))
-    if (ISD::isNormalLoad(LoadN))
-      return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
-                         LoadN->getMemOperand());
+    if (ISD::isNormalLoad(LoadN)) {
+      SDValue NewLoad = DAG.getLoad(ResVT, DL, LoadN->getChain(),
+                                    LoadN->getBasePtr(), LoadN->getMemOperand());
+      // Update the chain uses.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LoadN, 1), NewLoad.getValue(1));
+      return NewLoad;
+    }
 
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
     SDValue In64;
@@ -2953,8 +3039,8 @@ SDValue SystemZTargetLowering::
 lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
   MachineFunction &MF = DAG.getMachineFunction();
-  bool RealignOpt = !MF.getFunction()-> hasFnAttribute("no-realign-stack");
-  bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
+  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
 
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
@@ -3276,28 +3362,28 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
   return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
 }
 
-// Op is an atomic load.  Lower it into a serialization followed
-// by a normal volatile load.
+// Op is an atomic load.  Lower it into a normal volatile load.
 SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                 SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
-  SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
-                                             MVT::Other, Node->getChain()), 0);
   return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
-                        Chain, Node->getBasePtr(),
+                        Node->getChain(), Node->getBasePtr(),
                         Node->getMemoryVT(), Node->getMemOperand());
 }
 
-// Op is an atomic store.  Lower it into a normal volatile store followed
-// by a serialization.
+// Op is an atomic store.  Lower it into a normal volatile store.
 SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
   SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
                                     Node->getBasePtr(), Node->getMemoryVT(),
                                     Node->getMemOperand());
-  return SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), MVT::Other,
-                                    Chain), 0);
+  // We have to enforce sequential consistency by performing a
+  // serialization operation after the store.
+  if (Node->getOrdering() == AtomicOrdering::SequentiallyConsistent)
+    Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
+                                       MVT::Other, Chain), 0);
+  return Chain;
 }
 
 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
@@ -3410,25 +3496,38 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
   return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
 }
 
-// Node is an 8- or 16-bit ATOMIC_CMP_SWAP operation.  Lower the first two
-// into a fullword ATOMIC_CMP_SWAPW operation.
+// Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
 SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
                                                     SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
-
-  // We have native support for 32-bit compare and swap.
-  EVT NarrowVT = Node->getMemoryVT();
-  EVT WideVT = MVT::i32;
-  if (NarrowVT == WideVT)
-    return Op;
-
-  int64_t BitSize = NarrowVT.getSizeInBits();
   SDValue ChainIn = Node->getOperand(0);
   SDValue Addr = Node->getOperand(1);
   SDValue CmpVal = Node->getOperand(2);
   SDValue SwapVal = Node->getOperand(3);
   MachineMemOperand *MMO = Node->getMemOperand();
   SDLoc DL(Node);
+
+  // We have native support for 32-bit and 64-bit compare and swap, but we
+  // still need to expand the extraction of the "success" result from the CC.
+  EVT NarrowVT = Node->getMemoryVT();
+  EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
+  if (NarrowVT == WideVT) {
+    SDVTList Tys = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
+    SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
+    SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
+                                               DL, Tys, Ops, NarrowVT, MMO);
+    SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+                                SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
+
+    DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
+    DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+    DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+    return SDValue();
+  }
+
+  // Convert 8-bit and 16-bit compare and swap to a loop, implemented
+  // via a fullword ATOMIC_CMP_SWAPW operation.
+  int64_t BitSize = NarrowVT.getSizeInBits();
   EVT PtrVT = Addr.getValueType();
 
   // Get the address of the containing word.
@@ -3447,12 +3546,18 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
                                     DAG.getConstant(0, DL, WideVT), BitShift);
 
   // Construct the ATOMIC_CMP_SWAPW node.
-  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other, MVT::Glue);
   SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
                     NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
   SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
                                              VTList, Ops, NarrowVT, MMO);
-  return AtomicOp;
+  SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(2),
+                              SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
+
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(1));
+  return SDValue();
 }
 
 SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
@@ -3467,7 +3572,7 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
-  bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
 
   SDValue Chain = Op.getOperand(0);
   SDValue NewSP = Op.getOperand(1);
@@ -4680,7 +4785,7 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
   case ISD::ATOMIC_LOAD_UMAX:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
-  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
     return lowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STACKSAVE:
     return lowerSTACKSAVE(Op, DAG);
@@ -4717,6 +4822,92 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   }
 }
 
+// Lower operations with invalid operand or result types (currently used
+// only for 128-bit integer types).
+
+static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
+  SDLoc DL(In);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+                           DAG.getIntPtrConstant(0, DL));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+                           DAG.getIntPtrConstant(1, DL));
+  SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
+                                    MVT::Untyped, Hi, Lo);
+  return SDValue(Pair, 0);
+}
+
+static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
+  SDLoc DL(In);
+  SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
+                                          DL, MVT::i64, In);
+  SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
+                                          DL, MVT::i64, In);
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
+}
+
+void
+SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  case ISD::ATOMIC_LOAD: {
+    SDLoc DL(N);
+    SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
+                                          DL, Tys, Ops, MVT::i128, MMO);
+    Results.push_back(lowerGR128ToI128(DAG, Res));
+    Results.push_back(Res.getValue(1));
+    break;
+  }
+  case ISD::ATOMIC_STORE: {
+    SDLoc DL(N);
+    SDVTList Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = { N->getOperand(0),
+                      lowerI128ToGR128(DAG, N->getOperand(2)),
+                      N->getOperand(1) };
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128,
+                                          DL, Tys, Ops, MVT::i128, MMO);
+    // We have to enforce sequential consistency by performing a
+    // serialization operation after the store.
+    if (cast<AtomicSDNode>(N)->getOrdering() ==
+        AtomicOrdering::SequentiallyConsistent)
+      Res = SDValue(DAG.getMachineNode(SystemZ::Serialize, DL,
+                                       MVT::Other, Res), 0);
+    Results.push_back(Res);
+    break;
+  }
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+    SDLoc DL(N);
+    SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other, MVT::Glue);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      lowerI128ToGR128(DAG, N->getOperand(2)),
+                      lowerI128ToGR128(DAG, N->getOperand(3)) };
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
+                                          DL, Tys, Ops, MVT::i128, MMO);
+    SDValue Success = emitSETCC(DAG, DL, Res.getValue(2),
+                                SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
+    Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
+    Results.push_back(lowerGR128ToI128(DAG, Res));
+    Results.push_back(Success);
+    Results.push_back(Res.getValue(1));
+    break;
+  }
+  default:
+    llvm_unreachable("Unexpected node to lower");
+  }
+}
+
+void
+SystemZTargetLowering::ReplaceNodeResults(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG) const {
+  return LowerOperationWrapper(N, Results, DAG);
+}
+
 const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
 #define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
   switch ((SystemZISD::NodeType)Opcode) {
@@ -4817,6 +5008,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(ATOMIC_LOADW_UMIN);
     OPCODE(ATOMIC_LOADW_UMAX);
     OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(ATOMIC_CMP_SWAP);
+    OPCODE(ATOMIC_LOAD_128);
+    OPCODE(ATOMIC_STORE_128);
+    OPCODE(ATOMIC_CMP_SWAP_128);
     OPCODE(LRV);
     OPCODE(STRV);
     OPCODE(PREFETCH);
@@ -5067,7 +5262,8 @@ SDValue SystemZTargetLowering::combineSTORE(
   }
   // Combine STORE (BSWAP) into STRVH/STRV/STRVG
   // See comment in combineBSWAP about volatile accesses.
-  if (!SN->isVolatile() &&
+  if (!SN->isTruncatingStore() &&
+      !SN->isVolatile() &&
       Op1.getOpcode() == ISD::BSWAP &&
       Op1.getNode()->hasOneUse() &&
       (Op1.getValueType() == MVT::i16 ||
@@ -5840,10 +6036,42 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
   MBB->addSuccessor(LoopMBB);
   MBB->addSuccessor(DoneMBB);
 
+  // If the CC def wasn't dead in the ATOMIC_CMP_SWAPW, mark CC as live-in
+  // to the block after the loop.  At this point, CC may have been defined
+  // either by the CR in LoopMBB or by the CS in SetMBB.
+  if (!MI.registerDefIsDead(SystemZ::CC))
+    DoneMBB->addLiveIn(SystemZ::CC);
+
   MI.eraseFromParent();
   return DoneMBB;
 }
 
+// Emit a move from two GR64s to a GR128.
+MachineBasicBlock *
+SystemZTargetLowering::emitPair128(MachineInstr &MI,
+                                   MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Hi = MI.getOperand(1).getReg();
+  unsigned Lo = MI.getOperand(2).getReg();
+  unsigned Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+  unsigned Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1);
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2)
+    .addReg(Tmp1).addReg(Hi).addImm(SystemZ::subreg_h64);
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
+    .addReg(Tmp2).addReg(Lo).addImm(SystemZ::subreg_l64);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
 // Emit an extension from a GR64 to a GR128.  ClearEven is true
 // if the high register of the GR128 value must be cleared or false if
 // it's "don't care".
@@ -6237,6 +6465,8 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
   case SystemZ::CondStoreF64Inv:
     return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
 
+  case SystemZ::PAIR128:
+    return emitPair128(MI, MBB);
   case SystemZ::AEXT128:
     return emitExt128(MI, MBB, false);
   case SystemZ::ZEXT128:
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index abe8b7233e60..2cdc88db5a4d 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -18,7 +18,7 @@
 #include "SystemZ.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
 namespace SystemZISD {
@@ -308,6 +308,22 @@ enum NodeType : unsigned {
   // Operand 5: the width of the field in bits (8 or 16)
   ATOMIC_CMP_SWAPW,
 
+  // Atomic compare-and-swap returning glue (condition code).
+  // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+  ATOMIC_CMP_SWAP,
+
+  // 128-bit atomic load.
+  // Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr)
+  ATOMIC_LOAD_128,
+
+  // 128-bit atomic store.
+  // OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr)
+  ATOMIC_STORE_128,
+
+  // 128-bit atomic compare-and-swap.
+  // Val, OUTCHAIN, glue = ATOMIC_CMP_SWAP_128(INCHAIN, ptr, cmp, swap)
+  ATOMIC_CMP_SWAP_128,
+
   // Byte swapping load.
   //
   // Operand 0: the address to load from
@@ -384,8 +400,8 @@ public:
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
-                             unsigned AS) const override;
-  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const override;
+                             unsigned AS,
+                             Instruction *I = nullptr) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;
@@ -410,6 +426,8 @@ public:
       switch(ConstraintCode[0]) {
       default:
         break;
+      case 'o':
+        return InlineAsm::Constraint_o;
       case 'Q':
         return InlineAsm::Constraint_Q;
       case 'R':
@@ -448,6 +466,10 @@ public:
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                             SelectionDAG &DAG) const override;
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                          SelectionDAG &DAG) const override;
   bool allowTruncateForTailCall(Type *, Type *) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
@@ -565,6 +587,8 @@ private:
   MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned StoreOpcode, unsigned STOCOpcode,
                                    bool Invert) const;
+  MachineBasicBlock *emitPair128(MachineInstr &MI,
+                                 MachineBasicBlock *MBB) const;
   MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB,
                                 bool ClearEven) const;
   MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI,
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 02aeaadad0d9..16edbea87cda 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -7,6 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+// TODO: Most floating-point instructions (except for simple moves and the
+// like) can raise exceptions -- should they have hasSideEffects = 1?
+
 //===----------------------------------------------------------------------===//
 // Select instructions
 //===----------------------------------------------------------------------===//
@@ -29,22 +32,20 @@ defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
 //===----------------------------------------------------------------------===//
 
 // Load zero.
-let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+let isAsCheapAsAMove = 1, isMoveImm = 1 in {
   def LZER : InherentRRE<"lzer", 0xB374, FP32,  fpimm0>;
   def LZDR : InherentRRE<"lzdr", 0xB375, FP64,  fpimm0>;
   def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>;
 }
 
 // Moves between two floating-point registers.
-let hasSideEffects = 0 in {
-  def LER : UnaryRR <"ler", 0x38,   null_frag, FP32,  FP32>;
-  def LDR : UnaryRR <"ldr", 0x28,   null_frag, FP64,  FP64>;
-  def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
+def LER : UnaryRR <"ler", 0x38,   null_frag, FP32,  FP32>;
+def LDR : UnaryRR <"ldr", 0x28,   null_frag, FP64,  FP64>;
+def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
 
-  // For z13 we prefer LDR over LER to avoid partial register dependencies.
-  let isCodeGenOnly = 1 in
-    def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
-}
+// For z13 we prefer LDR over LER to avoid partial register dependencies.
+let isCodeGenOnly = 1 in
+  def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
 
 // Moves between two floating-point registers that also set the condition
 // codes.
@@ -130,7 +131,7 @@ defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
 // Load instructions
 //===----------------------------------------------------------------------===//
 
-let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
   defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
 
@@ -150,7 +151,7 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
 // Store instructions
 //===----------------------------------------------------------------------===//
 
-let SimpleBDXStore = 1 in {
+let SimpleBDXStore = 1, mayStore = 1 in {
   defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
   defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
 
@@ -525,11 +526,14 @@ let Defs = [CC], CCValues = 0xC in {
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 1 in {
-  def EFPC  : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
-  def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
+  let mayLoad = 1, mayStore = 1 in {
+    // TODO: EFPC and SFPC do not touch memory at all
+    def EFPC  : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
+    def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
 
-  def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
-  def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+    def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
+    def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+  }
 
   def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>;
   def LFAS  : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 033a0a879d37..06da66ad8764 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -21,6 +21,10 @@ class InstSystemZ<int size, dag outs, dag ins, string asmstr,
   let Pattern = pattern;
   let AsmString = asmstr;
 
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+
   // Some instructions come in pairs, one having a 12-bit displacement
   // and the other having a 20-bit displacement.  Both instructions in
   // the pair have the same DispKey and their DispSizes are "12" and "20"
@@ -2100,11 +2104,14 @@ class CondBranchRXY<string mnemonic, bits<16> opcode>
   : InstRXYb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr20only:$XBD2),
              !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> {
   let CCMaskFirst = 1;
+  let mayLoad = 1;
 }
 
 class AsmCondBranchRXY<string mnemonic, bits<16> opcode>
   : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
-             mnemonic#"\t$M1, $XBD2", []>;
+             mnemonic#"\t$M1, $XBD2", []> {
+  let mayLoad = 1;
+}
 
 class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode,
                          SDPatternOperator operator = null_frag>
@@ -2113,6 +2120,7 @@ class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode,
              [(operator (load bdxaddr20only:$XBD2))]> {
   let isAsmParserOnly = V.alternate;
   let M1 = V.ccmask;
+  let mayLoad = 1;
 }
 
 class CmpBranchRIEa<string mnemonic, bits<16> opcode,
@@ -2784,7 +2792,6 @@ multiclass CondUnaryRSYPair<string mnemonic, bits<16> opcode,
   def Asm : AsmCondUnaryRSY<mnemonic, opcode, cls, bytes, mode>;
 }
 
-
 class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
               RegisterOperand cls, bits<5> bytes,
               AddressingMode mode = bdxaddr12only>
@@ -4688,7 +4695,8 @@ class SelectWrapper<ValueType vt, RegisterOperand cls>
 // Stores $new to $addr if $cc is true ("" case) or false (Inv case).
 multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
                       SDPatternOperator load, AddressingMode mode> {
-  let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
+  let Defs = [CC], Uses = [CC], usesCustomInserter = 1,
+      mayLoad = 1, mayStore = 1 in {
     def "" : Pseudo<(outs),
                     (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
                     [(store (z_select_ccmask cls:$new, (load mode:$addr),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 4533f4fdf21a..572446c1aa12 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -16,8 +16,9 @@
 #include "SystemZ.h"
 #include "SystemZInstrBuilder.h"
 #include "SystemZSubtarget.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -27,14 +28,14 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -45,6 +46,9 @@ using namespace llvm;
 #define GET_INSTRMAP_INFO
 #include "SystemZGenInstrInfo.inc"
 
+#define DEBUG_TYPE "systemz-II"
+STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)");
+
 // Return a mask with Count low bits set.
 static uint64_t allOnes(unsigned int Count) {
   return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
@@ -209,6 +213,8 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
     MI.setDesc(get(LowOpcode));
   else if (DestIsHigh && SrcIsHigh)
     MI.setDesc(get(HighOpcode));
+  else
+    LOCRMuxJumps++;
 
   // If we were unable to implement the pseudo with a single instruction, we
   // need to convert it back into a branch sequence.  This cannot be done here
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index b8be1f5f3921..216139eb7c79 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -20,7 +20,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include <cstdint>
 
 #define GET_INSTRINFO_HEADER
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index f64c0d15ef83..abb804597f4e 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -11,24 +11,25 @@
 // Stack allocation
 //===----------------------------------------------------------------------===//
 
-let hasNoSchedulingInfo = 1 in {
+// The callseq_start node requires the hasSideEffects flag, even though these
+// instructions are noops on SystemZ.
+let hasNoSchedulingInfo = 1, hasSideEffects = 1 in {
   def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
                                 [(callseq_start timm:$amt1, timm:$amt2)]>;
   def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
                                 [(callseq_end timm:$amt1, timm:$amt2)]>;
 }
 
-let hasSideEffects = 0 in {
-  // Takes as input the value of the stack pointer after a dynamic allocation
-  // has been made.  Sets the output to the address of the dynamically-
-  // allocated area itself, skipping the outgoing arguments.
-  //
-  // This expands to an LA or LAY instruction.  We restrict the offset
-  // to the range of LA and keep the LAY range in reserve for when
-  // the size of the outgoing arguments is added.
-  def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
-                           [(set GR64:$dst, dynalloc12only:$src)]>;
-}
+// Takes as input the value of the stack pointer after a dynamic allocation
+// has been made.  Sets the output to the address of the dynamically-
+// allocated area itself, skipping the outgoing arguments.
+//
+// This expands to an LA or LAY instruction.  We restrict the offset
+// to the range of LA and keep the LAY range in reserve for when
+// the size of the outgoing arguments is added.
+def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
+                         [(set GR64:$dst, dynalloc12only:$src)]>;
+
 
 //===----------------------------------------------------------------------===//
 // Branch instructions
@@ -197,15 +198,15 @@ let isBranch = 1, isTerminator = 1 in {
 //===----------------------------------------------------------------------===//
 
 // Unconditional trap.
-let hasCtrlDep = 1 in
+let hasCtrlDep = 1, hasSideEffects = 1 in
   def Trap : Alias<4, (outs), (ins), [(trap)]>;
 
 // Conditional trap.
-let hasCtrlDep = 1, Uses = [CC] in
+let hasCtrlDep = 1, Uses = [CC], hasSideEffects = 1 in
   def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
 
 // Fused compare-and-trap instructions.
-let hasCtrlDep = 1 in {
+let hasCtrlDep = 1, hasSideEffects = 1 in {
   // These patterns work the same way as for compare-and-branch.
   defm CRT   : CmpBranchRRFcPair<"crt",   0xB972, GR32>;
   defm CGRT  : CmpBranchRRFcPair<"cgrt",  0xB960, GR64>;
@@ -360,21 +361,22 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store,
 //===----------------------------------------------------------------------===//
 
 // Register moves.
-let hasSideEffects = 0 in {
-  // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
-  def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>,
-              Requires<[FeatureHighWord]>;
-  def LR  : UnaryRR <"lr",  0x18,   null_frag, GR32, GR32>;
-  def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
-}
+// Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>,
+            Requires<[FeatureHighWord]>;
+def LR  : UnaryRR <"lr",  0x18,   null_frag, GR32, GR32>;
+def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
+
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
   def LTR  : UnaryRR <"ltr",  0x12,   null_frag, GR32, GR32>;
   def LTGR : UnaryRRE<"ltgr", 0xB902, null_frag, GR64, GR64>;
 }
 
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+  def PAIR128 : Pseudo<(outs GR128:$dst), (ins GR64:$hi, GR64:$lo), []>;
+
 // Immediate moves.
-let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
-    isReMaterializable = 1 in {
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
   // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
   // deopending on the choice of register.
   def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
@@ -395,7 +397,7 @@ let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
 }
 
 // Register loads.
-let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   // Expands to L, LY or LFH, depending on the choice of register.
   def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
              Requires<[FeatureHighWord]>;
@@ -432,14 +434,14 @@ let Predicates = [FeatureLoadAndZeroRightmostByte] in {
 }
 
 // Load and trap.
-let Predicates = [FeatureLoadAndTrap] in {
+let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in {
   def LAT   : UnaryRXY<"lat",   0xE39F, null_frag, GR32, 4>;
   def LFHAT : UnaryRXY<"lfhat", 0xE3C8, null_frag, GRH32, 4>;
   def LGAT  : UnaryRXY<"lgat",  0xE385, null_frag, GR64, 8>;
 }
 
 // Register stores.
-let SimpleBDXStore = 1 in {
+let SimpleBDXStore = 1, mayStore = 1 in {
   // Expands to ST, STY or STFH, depending on the choice of register.
   def STMux : StoreRXYPseudo<store, GRX32, 4>,
               Requires<[FeatureHighWord]>;
@@ -486,17 +488,16 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in
 let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
   // Load immediate on condition.  Matched via DAG pattern and created
   // by the PeepholeOptimizer via FoldImmediate.
-  let hasSideEffects = 0 in {
-    // Expands to LOCHI or LOCHHI, depending on the choice of register.
-    def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>;
-    defm LOCHHI  : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>;
-    defm LOCHI   : CondBinaryRIEPair<"lochi",  0xEC42, GR32, imm32sx16>;
-    defm LOCGHI  : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
-  }
+
+  // Expands to LOCHI or LOCHHI, depending on the choice of register.
+  def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>;
+  defm LOCHHI  : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>;
+  defm LOCHI   : CondBinaryRIEPair<"lochi",  0xEC42, GR32, imm32sx16>;
+  defm LOCGHI  : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
 
   // Move register on condition.  Expanded from Select* pseudos and
   // created by early if-conversion.
-  let hasSideEffects = 0, isCommutable = 1 in {
+  let isCommutable = 1 in {
     // Expands to LOCR or LOCFHR or a branch-and-move sequence,
     // depending on the choice of registers.
     def LOCRMux : CondBinaryRRFPseudo<GRX32, GRX32>;
@@ -531,7 +532,7 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
 let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
   // Move register on condition.  Expanded from Select* pseudos and
   // created by early if-conversion.
-  let hasSideEffects = 0, isCommutable = 1 in {
+  let isCommutable = 1 in {
     defm LOCR  : CondBinaryRRFPair<"locr",  0xB9F2, GR32, GR32>;
     defm LOCGR : CondBinaryRRFPair<"locgr", 0xB9E2, GR64, GR64>;
   }
@@ -567,17 +568,14 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
 //===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
-let hasSideEffects = 0 in {
-  def LBR : UnaryRRE<"lbr", 0xB926, sext8,  GR32, GR32>;
-  def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
-}
+def LBR : UnaryRRE<"lbr", 0xB926, sext8,  GR32, GR32>;
+def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
 
 // 64-bit extensions from registers.
-let hasSideEffects = 0 in {
-  def LGBR : UnaryRRE<"lgbr", 0xB906, sext8,  GR64, GR64>;
-  def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
-  def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
-}
+def LGBR : UnaryRRE<"lgbr", 0xB906, sext8,  GR64, GR64>;
+def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
+def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
+
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
   def LTGFR : UnaryRRE<"ltgfr", 0xB912, null_frag, GR64, GR32>;
 
@@ -617,23 +615,20 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
 //===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
-let hasSideEffects = 0 in {
-  // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
-  def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>,
-                Requires<[FeatureHighWord]>;
-  def LLCR    : UnaryRRE<"llcr", 0xB994, zext8,  GR32, GR32>;
-  // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
-  def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>,
-                Requires<[FeatureHighWord]>;
-  def LLHR    : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
-}
+
+// Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
+def LLCR    : UnaryRRE<"llcr", 0xB994, zext8,  GR32, GR32>;
+// Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
+def LLHR    : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
 
 // 64-bit extensions from registers.
-let hasSideEffects = 0 in {
-  def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8,  GR64, GR64>;
-  def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
-  def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
-}
+def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8,  GR64, GR64>;
+def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
+def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
 
 // Match 32-to-64-bit zero extensions in which the source is already
 // in a 64-bit register.
@@ -680,7 +675,7 @@ let Predicates = [FeatureLoadAndZeroRightmostByte] in {
 }
 
 // Load and trap.
-let Predicates = [FeatureLoadAndTrap] in {
+let Predicates = [FeatureLoadAndTrap], hasSideEffects = 1 in {
   def LLGFAT : UnaryRXY<"llgfat", 0xE39D, null_frag, GR64, 4>;
   def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>;
 }
@@ -757,10 +752,8 @@ def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
 //===----------------------------------------------------------------------===//
 
 // Byte-swapping register moves.
-let hasSideEffects = 0 in {
-  def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
-  def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
-}
+def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
+def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
 
 // Byte-swapping loads.  Unlike normal loads, these instructions are
 // allowed to access storage more than once.
@@ -782,13 +775,12 @@ let mayLoad = 1, mayStore = 1 in
 //===----------------------------------------------------------------------===//
 
 // Load BDX-style addresses.
-let hasSideEffects = 0, isAsCheapAsAMove = 1, isReMaterializable = 1 in
+let isAsCheapAsAMove = 1, isReMaterializable = 1 in
   defm LA : LoadAddressRXPair<"la", 0x41, 0xE371, bitconvert>;
 
 // Load a PC-relative address.  There's no version of this instruction
 // with a 16-bit offset, so there's no relaxation.
-let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
-    isReMaterializable = 1 in
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in
   def LARL : LoadAddressRIL<"larl", 0xC00, bitconvert>;
 
 // Load the Global Offset Table address.  This will be lowered into a
@@ -1264,6 +1256,7 @@ def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>,
            Requires<[FeatureMiscellaneousExtensions2]>;
 def MLR  : BinaryRRE<"mlr",  0xB996, null_frag, GR128, GR32>;
 def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>;
+
 def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2),
           (MGRK GR64:$src1, GR64:$src2)>;
 def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2),
@@ -1276,6 +1269,7 @@ def MG  : BinaryRXY<"mg",  0xE384, null_frag, GR128, load, 8>,
           Requires<[FeatureMiscellaneousExtensions2]>;
 def ML  : BinaryRXY<"ml",  0xE396, null_frag, GR128, load, 4>;
 def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>;
+
 def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
           (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
@@ -1325,11 +1319,9 @@ def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
 //===----------------------------------------------------------------------===//
 
 // Logical shift left.
-let hasSideEffects = 0 in {
-  defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
-  def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
-  def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
-}
+defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
 
 // Arithmetic shift left.
 let Defs = [CC] in {
@@ -1339,11 +1331,9 @@ let Defs = [CC] in {
 }
 
 // Logical shift right.
-let hasSideEffects = 0 in {
-  defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
-  def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
-  def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
-}
+defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
 
 // Arithmetic shift right.
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
@@ -1353,10 +1343,8 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
 }
 
 // Rotate left.
-let hasSideEffects = 0 in {
-  def RLL  : BinaryRSY<"rll",  0xEB1D, rotl, GR32>;
-  def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
-}
+def RLL  : BinaryRSY<"rll",  0xEB1D, rotl, GR32>;
+def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
 
 // Rotate second operand left and inserted selected bits into first operand.
 // These can act like 32-bit operands provided that the constant start and
@@ -1547,10 +1535,12 @@ let Defs = [CC] in {
 // Prefetch and execution hint
 //===----------------------------------------------------------------------===//
 
-def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
-def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+let mayLoad = 1, mayStore = 1 in {
+  def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+  def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+}
 
-let Predicates = [FeatureExecutionHint] in {
+let Predicates = [FeatureExecutionHint], hasSideEffects = 1 in {
   // Branch Prediction Preload
   def BPP : BranchPreloadSMI<"bpp", 0xC7>;
   def BPRP : BranchPreloadMII<"bprp", 0xC5>;
@@ -1714,14 +1704,14 @@ let mayLoad = 1, Defs = [CC] in
 
 // Compare and swap.
 let Defs = [CC] in {
-  defm CS  : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
-  def  CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
+  defm CS  : CmpSwapRSPair<"cs", 0xBA, 0xEB14, z_atomic_cmp_swap, GR32>;
+  def  CSG : CmpSwapRSY<"csg", 0xEB30, z_atomic_cmp_swap, GR64>;
 }
 
 // Compare double and swap.
 let Defs = [CC] in {
   defm CDS  : CmpSwapRSPair<"cds", 0xBB, 0xEB31, null_frag, GR128>;
-  def  CDSG : CmpSwapRSY<"cdsg", 0xEB3E, null_frag, GR128>;
+  def  CDSG : CmpSwapRSY<"cdsg", 0xEB3E, z_atomic_cmp_swap_128, GR128>;
 }
 
 // Compare and swap and store.
@@ -1733,8 +1723,8 @@ let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad =1 in
   def PLO : SideEffectQuaternarySSe<"plo", 0xEE, GR64>;
 
 // Load/store pair from/to quadword.
-def LPQ  : UnaryRXY<"lpq", 0xE38F, null_frag, GR128, 16>;
-def STPQ : StoreRXY<"stpq", 0xE38E, null_frag, GR128, 16>;
+def LPQ  : UnaryRXY<"lpq", 0xE38F, z_atomic_load_128, GR128, 16>;
+def STPQ : StoreRXY<"stpq", 0xE38E, z_atomic_store_128, GR128, 16>;
 
 // Load pair disjoint.
 let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
@@ -1817,7 +1807,10 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
 // Guarded storage
 //===----------------------------------------------------------------------===//
 
-let Predicates = [FeatureGuardedStorage] in {
+// These instructions use and/or modify the guarded storage control
+// registers, which we do not otherwise model, so they should have
+// hasSideEffects.
+let Predicates = [FeatureGuardedStorage], hasSideEffects = 1 in {
   def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>;
   def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>;
 
@@ -1893,7 +1886,7 @@ defm LAE : LoadAddressRXPair<"lae", 0x51, 0xE375, null_frag>;
 // Load access multiple.
 defm LAM : LoadMultipleRSPair<"lam", 0x9A, 0xEB9A, AR32>;
 
-// Load access multiple.
+// Store access multiple.
 defm STAM : StoreMultipleRSPair<"stam", 0x9B, 0xEB9B, AR32>;
 
 //===----------------------------------------------------------------------===//
@@ -1942,7 +1935,6 @@ let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
   let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in {
     def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
     def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
-
     def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561,
                                       int_s390_tbeginc, imm32zx16>;
   }
@@ -1952,7 +1944,8 @@ let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
     def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>;
 
   // Transaction Abort
-  let isTerminator = 1, isBarrier = 1 in
+  let isTerminator = 1, isBarrier = 1, mayStore = 1,
+      hasSideEffects = 1 in
     def TABORT : SideEffectAddressS<"tabort", 0xB2FC, int_s390_tabort>;
 
   // Nontransactional Store
@@ -2028,7 +2021,7 @@ let hasSideEffects = 1 in {
 // .insn directive instructions
 //===----------------------------------------------------------------------===//
 
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
   def InsnE   : DirectiveInsnE<(outs), (ins imm64zx16:$enc), ".insn e,$enc", []>;
   def InsnRI  : DirectiveInsnRI<(outs), (ins imm64zx32:$enc, AnyReg:$R1,
                                              imm32sx16:$I2),
diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td
index 0112ebf1eb10..c351577fa5bd 100644
--- a/lib/Target/SystemZ/SystemZInstrSystem.td
+++ b/lib/Target/SystemZ/SystemZInstrSystem.td
@@ -23,7 +23,7 @@ let hasSideEffects = 1, Uses = [CC] in
   def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
 
 // Load PSW (extended).
-let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in {
+let hasSideEffects = 1, Defs = [CC] in {
   def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>;
   def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>;
 }
@@ -37,7 +37,7 @@ let hasSideEffects = 1 in
   def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>;
 
 // Set system mask.
-let hasSideEffects = 1, mayLoad = 1 in
+let hasSideEffects = 1 in
   def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>;
 
 // Store then AND/OR system mask.
@@ -60,13 +60,15 @@ let hasSideEffects = 1 in {
 // Control Register Instructions.
 //===----------------------------------------------------------------------===//
 
-// Load control.
-def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>;
-def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>;
+let hasSideEffects = 1 in {
+  // Load control.
+  def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>;
+  def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>;
 
-// Store control.
-def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>;
-def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>;
+  // Store control.
+  def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>;
+  def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>;
+}
 
 // Extract primary ASN (and instance).
 let hasSideEffects = 1 in {
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index c9a02d9c8082..92b86575235a 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -56,8 +56,7 @@ def : VectorExtractSubreg<v4i32, VLGVF>;
 //===----------------------------------------------------------------------===//
 
 let Predicates = [FeatureVector] in {
-  let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
-      isReMaterializable = 1 in {
+  let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
 
     // Generate byte mask.
     def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
@@ -141,8 +140,10 @@ let Predicates = [FeatureVector] in {
   // LEY and LDY offer full 20-bit displacement fields.  It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
-  def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+  let mayLoad = 1 in {
+    def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
+    def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+  }
 
   // Load logical element and zero.
   def VLLEZ  : UnaryVRXGeneric<"vllez", 0xE704>;
@@ -231,8 +232,10 @@ let Predicates = [FeatureVector] in {
   // STEY and STDY offer full 20-bit displacement fields.  It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>;
-  def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+  let mayStore = 1 in {
+    def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>;
+    def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+  }
 
   // Scatter element.
   def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
index d4cd89ce590f..f532e9e23b1f 100644
--- a/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -19,9 +19,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
@@ -64,7 +64,7 @@ void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
-  if (skipFunction(*F.getFunction()))
+  if (skipFunction(F.getFunction()))
     return false;
 
   TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 8342463c1086..08eb73fc362e 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -11,7 +11,8 @@
 // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
 // the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
 // implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources.
+// usage of processor resources. Scheduler states are saved for the end
+// region of each MBB, so that a successor block can learn from it.
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMachineScheduler.h"
@@ -34,14 +35,118 @@ dump(SystemZHazardRecognizer &HazardRec) const {
 }
 #endif
 
+// Try to find a single predecessor that would be interesting for the
+// scheduler in the top-most region of MBB.
+static MachineBasicBlock *getSingleSchedPred(MachineBasicBlock *MBB,
+                                             const MachineLoop *Loop) {
+  MachineBasicBlock *PredMBB = nullptr;
+  if (MBB->pred_size() == 1)
+    PredMBB = *MBB->pred_begin();
+
+  // The loop header has two predecessors, return the latch, but not for a
+  // single block loop.
+  if (MBB->pred_size() == 2 && Loop != nullptr && Loop->getHeader() == MBB) {
+    for (auto I = MBB->pred_begin(); I != MBB->pred_end(); ++I)
+      if (Loop->contains(*I))
+        PredMBB = (*I == MBB ? nullptr : *I);
+  }
+
+  assert ((PredMBB == nullptr || !Loop || Loop->contains(PredMBB))
+          && "Loop MBB should not consider predecessor outside of loop.");
+
+  return PredMBB;
+}
+
+void SystemZPostRASchedStrategy::
+advanceTo(MachineBasicBlock::iterator NextBegin) {
+  MachineBasicBlock::iterator LastEmittedMI = HazardRec->getLastEmittedMI();
+  MachineBasicBlock::iterator I =
+    ((LastEmittedMI != nullptr && LastEmittedMI->getParent() == MBB) ?
+     std::next(LastEmittedMI) : MBB->begin());
+
+  for (; I != NextBegin; ++I) {
+    if (I->isPosition() || I->isDebugValue())
+      continue;
+    HazardRec->emitInstruction(&*I);
+  }
+}
+
+void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
+  assert ((SchedStates.find(NextMBB) == SchedStates.end()) &&
+          "Entering MBB twice?");
+  DEBUG(dbgs() << "+++ Entering " << printMBBReference(*NextMBB));
+
+  MBB = NextMBB;
+  /// Create a HazardRec for MBB, save it in SchedStates and set HazardRec to
+  /// point to it.
+  HazardRec = SchedStates[MBB] = new SystemZHazardRecognizer(TII, &SchedModel);
+  DEBUG (const MachineLoop *Loop = MLI->getLoopFor(MBB);
+         if(Loop && Loop->getHeader() == MBB)
+           dbgs() << " (Loop header)";
+         dbgs() << ":\n";);
+
+  // Try to take over the state from a single predecessor, if it has been
+  // scheduled. If this is not possible, we are done.
+  MachineBasicBlock *SinglePredMBB =
+    getSingleSchedPred(MBB, MLI->getLoopFor(MBB));
+  if (SinglePredMBB == nullptr ||
+      SchedStates.find(SinglePredMBB) == SchedStates.end())
+    return;
+
+  DEBUG(dbgs() << "+++ Continued scheduling from "
+               << printMBBReference(*SinglePredMBB) << "\n";);
+
+  HazardRec->copyState(SchedStates[SinglePredMBB]);
+
+  // Emit incoming terminator(s). Be optimistic and assume that branch
+  // prediction will generally do "the right thing".
+  for (MachineBasicBlock::iterator I = SinglePredMBB->getFirstTerminator();
+       I != SinglePredMBB->end(); I++) {
+    DEBUG (dbgs() << "+++ Emitting incoming branch: "; I->dump(););
+    bool TakenBranch = (I->isBranch() &&
+      (TII->getBranchInfo(*I).Target->isReg() || // Relative branch
+       TII->getBranchInfo(*I).Target->getMBB() == MBB));
+    HazardRec->emitInstruction(&*I, TakenBranch);
+    if (TakenBranch)
+      break;
+  }
+}
+
+void SystemZPostRASchedStrategy::leaveMBB() {
+  DEBUG(dbgs() << "+++ Leaving " << printMBBReference(*MBB) << "\n";);
+
+  // Advance to first terminator. The successor block will handle terminators
+  // dependent on CFG layout (T/NT branch etc).
+  advanceTo(MBB->getFirstTerminator());
+}
+
 SystemZPostRASchedStrategy::
 SystemZPostRASchedStrategy(const MachineSchedContext *C)
-  : DAG(nullptr), HazardRec(C) {}
+  : MLI(C->MLI),
+    TII(static_cast<const SystemZInstrInfo *>
+        (C->MF->getSubtarget().getInstrInfo())), 
+    MBB(nullptr), HazardRec(nullptr) {
+  const TargetSubtargetInfo *ST = &C->MF->getSubtarget();
+  SchedModel.init(ST->getSchedModel(), ST, TII);
+}
+
+SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() {
+  // Delete hazard recognizers kept around for each MBB.
+  for (auto I : SchedStates) {
+    SystemZHazardRecognizer *hazrec = I.second;
+    delete hazrec;
+  }
+}
+
+void SystemZPostRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+                                            MachineBasicBlock::iterator End,
+                                            unsigned NumRegionInstrs) {
+  // Don't emit the terminators.
+  if (Begin->isTerminator())
+    return;
 
-void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
-  DAG = dag;
-  HazardRec.setDAG(dag);
-  HazardRec.Reset();
+  // Emit any instructions before start of region.
+  advanceTo(Begin);
 }
 
 // Pick the next node to schedule.
@@ -55,25 +160,25 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
   // If only one choice, return it.
   if (Available.size() == 1) {
     DEBUG (dbgs() << "+++ Only one: ";
-           HazardRec.dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+           HazardRec->dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
     return *Available.begin();
   }
 
   // All nodes that are possible to schedule are stored by in the
   // Available set.
-  DEBUG(dbgs() << "+++ Available: "; Available.dump(HazardRec););
+  DEBUG(dbgs() << "+++ Available: "; Available.dump(*HazardRec););
 
   Candidate Best;
   for (auto *SU : Available) {
 
     // SU is the next candidate to be compared against current Best.
-    Candidate c(SU, HazardRec);
+    Candidate c(SU, *HazardRec);
 
     // Remeber which SU is the best candidate.
     if (Best.SU == nullptr || c < Best) {
       Best = c;
       DEBUG(dbgs() << "+++ Best sofar: ";
-            HazardRec.dumpSU(Best.SU, dbgs());
+            HazardRec->dumpSU(Best.SU, dbgs());
             if (Best.GroupingCost != 0)
               dbgs() << "\tGrouping cost:" << Best.GroupingCost;
             if (Best.ResourcesCost != 0)
@@ -138,13 +243,13 @@ void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
 
   // Remove SU from Available set and update HazardRec.
   Available.erase(SU);
-  HazardRec.EmitInstruction(SU);
+  HazardRec->EmitInstruction(SU);
 }
 
 void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) {
   // Set isScheduleHigh flag on all SUs that we want to consider first in
   // pickNode().
-  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  const MCSchedClassDesc *SC = HazardRec->getSchedClass(SU);
   bool AffectsGrouping = (SC->isValid() && (SC->BeginGroup || SC->EndGroup));
   SU->isScheduleHigh = (AffectsGrouping || SU->isUnbuffered);
 
diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h
index 3dfef388691e..de1bf4655c54 100644
--- a/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -11,7 +11,8 @@
 // SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
 // the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
 // implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources.
+// usage of processor resources. Scheduler states are saved for the end
+// region of each MBB, so that a successor block can learn from it.
 //===----------------------------------------------------------------------===//
 
 #include "SystemZHazardRecognizer.h"
@@ -28,7 +29,14 @@ namespace llvm {
   
 /// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
 class SystemZPostRASchedStrategy : public MachineSchedStrategy {
-  ScheduleDAGMI *DAG;
+
+  const MachineLoopInfo *MLI;
+  const SystemZInstrInfo *TII;
+
+  // A SchedModel is needed before any DAG is built while advancing past
+  // non-scheduled instructions, so it would not always be possible to call
+  // DAG->getSchedClass(SU).
+  TargetSchedModel SchedModel;
   
   /// A candidate during instruction evaluation.
   struct Candidate {
@@ -79,18 +87,45 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
   /// The set of available SUs to schedule next.
   SUSet Available;
 
-  // HazardRecognizer that tracks the scheduler state for the current
-  // region.
-  SystemZHazardRecognizer HazardRec;
-  
+  /// Current MBB
+  MachineBasicBlock *MBB;
+
+  /// Maintain hazard recognizers for all blocks, so that the scheduler state
+  /// can be maintained past BB boundaries when appropriate.
+  typedef std::map<MachineBasicBlock*, SystemZHazardRecognizer*> MBB2HazRec;
+  MBB2HazRec SchedStates;
+
+  /// Pointer to the HazardRecognizer that tracks the scheduler state for
+  /// the current region.
+  SystemZHazardRecognizer *HazardRec;
+
+  /// Update the scheduler state by emitting (non-scheduled) instructions
+  /// up to, but not including, NextBegin.
+  void advanceTo(MachineBasicBlock::iterator NextBegin);
+
 public:
   SystemZPostRASchedStrategy(const MachineSchedContext *C);
+  virtual ~SystemZPostRASchedStrategy();
+
+  /// Called for a region before scheduling.
+  void initPolicy(MachineBasicBlock::iterator Begin,
+                  MachineBasicBlock::iterator End,
+                  unsigned NumRegionInstrs) override;
 
   /// PostRA scheduling does not track pressure.
   bool shouldTrackPressure() const override { return false; }
 
-  /// Initialize the strategy after building the DAG for a new region.
-  void initialize(ScheduleDAGMI *dag) override;
+  // Process scheduling regions top-down so that scheduler states can be
+  // transferred over scheduling boundaries.
+  bool doMBBSchedRegionsTopDown() const override { return true; }
+
+  void initialize(ScheduleDAGMI *dag) override {}
+
+  /// Tell the strategy that MBB is about to be processed.
+  void enterMBB(MachineBasicBlock *NextMBB) override;
+
+  /// Tell the strategy that current MBB is done.
+  void leaveMBB() override;
 
   /// Pick the next node to schedule, or return NULL.
   SUnit *pickNode(bool &IsTopNode) override;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 759a8bb0ce14..d067f331f677 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -55,6 +55,22 @@ def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
                                              SDTCisVT<4, i32>,
                                              SDTCisVT<5, i32>,
                                              SDTCisVT<6, i32>]>;
+def SDT_ZAtomicCmpSwap      : SDTypeProfile<1, 3,
+                                            [SDTCisInt<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>]>;
+def SDT_ZAtomicLoad128      : SDTypeProfile<1, 1,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisPtrTy<1>]>;
+def SDT_ZAtomicStore128     : SDTypeProfile<0, 2,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisPtrTy<1>]>;
+def SDT_ZAtomicCmpSwap128   : SDTypeProfile<1, 3,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, untyped>,
+                                             SDTCisVT<3, untyped>]>;
 def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
@@ -285,7 +301,26 @@ def z_atomic_loadw_min  : AtomicWOp<"ATOMIC_LOADW_MIN">;
 def z_atomic_loadw_max  : AtomicWOp<"ATOMIC_LOADW_MAX">;
 def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
 def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
-def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
+
+def z_atomic_cmp_swap   : SDNode<"SystemZISD::ATOMIC_CMP_SWAP",
+                                 SDT_ZAtomicCmpSwap,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                                  SDNPOutGlue, SDNPMemOperand]>;
+def z_atomic_cmp_swapw  : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW",
+                                 SDT_ZAtomicCmpSwapW,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                                  SDNPOutGlue, SDNPMemOperand]>;
+
+def z_atomic_load_128   : SDNode<"SystemZISD::ATOMIC_LOAD_128",
+                                 SDT_ZAtomicLoad128,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_atomic_store_128  : SDNode<"SystemZISD::ATOMIC_STORE_128",
+                                 SDT_ZAtomicStore128,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
+                                   SDT_ZAtomicCmpSwap128,
+                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                                    SDNPOutGlue, SDNPMemOperand]>;
 
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index d14a0fb0b0b2..856505e00a10 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -10,9 +10,12 @@
 #include "SystemZRegisterInfo.h"
 #include "SystemZInstrInfo.h"
 #include "SystemZSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 
 using namespace llvm;
 
@@ -22,10 +25,91 @@ using namespace llvm;
 SystemZRegisterInfo::SystemZRegisterInfo()
     : SystemZGenRegisterInfo(SystemZ::R14D) {}
 
+// Given that MO is a GRX32 operand, return either GR32 or GRH32 if MO
+// somehow belongs in it. Otherwise, return GRX32.
+static const TargetRegisterClass *getRC32(MachineOperand &MO,
+                                          const VirtRegMap *VRM,
+                                          const MachineRegisterInfo *MRI) {
+  const TargetRegisterClass *RC = MRI->getRegClass(MO.getReg());
+
+  if (SystemZ::GR32BitRegClass.hasSubClassEq(RC) ||
+      MO.getSubReg() == SystemZ::subreg_l32 ||
+      MO.getSubReg() == SystemZ::subreg_hl32)
+    return &SystemZ::GR32BitRegClass;
+  if (SystemZ::GRH32BitRegClass.hasSubClassEq(RC) ||
+      MO.getSubReg() == SystemZ::subreg_h32 ||
+      MO.getSubReg() == SystemZ::subreg_hh32)
+    return &SystemZ::GRH32BitRegClass;
+
+  if (VRM && VRM->hasPhys(MO.getReg())) {
+    unsigned PhysReg = VRM->getPhys(MO.getReg());
+    if (SystemZ::GR32BitRegClass.contains(PhysReg))
+      return &SystemZ::GR32BitRegClass;
+    assert (SystemZ::GRH32BitRegClass.contains(PhysReg) &&
+            "Phys reg not in GR32 or GRH32?");
+    return &SystemZ::GRH32BitRegClass;
+  }
+
+  assert (RC == &SystemZ::GRX32BitRegClass);
+  return RC;
+}
+
+bool
+SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM,
+                                           const LiveRegMatrix *Matrix) const {
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  if (MRI->getRegClass(VirtReg) == &SystemZ::GRX32BitRegClass) {
+    SmallVector<unsigned, 8> Worklist;
+    SmallSet<unsigned, 4> DoneRegs;
+    Worklist.push_back(VirtReg);
+    while (Worklist.size()) {
+      unsigned Reg = Worklist.pop_back_val();
+      if (!DoneRegs.insert(Reg).second)
+        continue;
+
+      for (auto &Use : MRI->use_instructions(Reg))
+        // For LOCRMux, see if the other operand is already a high or low
+        // register, and in that case give the corresponding hints for
+        // VirtReg. LOCR instructions need both operands in either high or
+        // low parts.
+        if (Use.getOpcode() == SystemZ::LOCRMux) {
+          MachineOperand &TrueMO = Use.getOperand(1);
+          MachineOperand &FalseMO = Use.getOperand(2);
+          const TargetRegisterClass *RC =
+            TRI->getCommonSubClass(getRC32(FalseMO, VRM, MRI),
+                                   getRC32(TrueMO, VRM, MRI));
+          if (RC && RC != &SystemZ::GRX32BitRegClass) {
+            for (MCPhysReg Reg : Order)
+              if (RC->contains(Reg) && !MRI->isReserved(Reg))
+                Hints.push_back(Reg);
+            // Return true to make these hints the only regs available to
+            // RA. This may mean extra spilling but since the alternative is
+            // a jump sequence expansion of the LOCRMux, it is preferred.
+            return true;
+          }
+
+          // Add the other operand of the LOCRMux to the worklist.
+          unsigned OtherReg =
+            (TrueMO.getReg() == Reg ? FalseMO.getReg() : TrueMO.getReg());
+          if (MRI->getRegClass(OtherReg) == &SystemZ::GRX32BitRegClass)
+            Worklist.push_back(OtherReg);
+        }
+    }
+  }
+
+  return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+                                                   VRM, Matrix);
+}
+
 const MCPhysReg *
 SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
-      MF->getFunction()->getAttributes().hasAttrSomewhere(
+      MF->getFunction().getAttributes().hasAttrSomewhere(
           Attribute::SwiftError))
     return CSR_SystemZ_SwiftError_SaveList;
   return CSR_SystemZ_SaveList;
@@ -35,7 +119,7 @@ const uint32_t *
 SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                           CallingConv::ID CC) const {
   if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
-      MF.getFunction()->getAttributes().hasAttrSomewhere(
+      MF.getFunction().getAttributes().hasAttrSomewhere(
           Attribute::SwiftError))
     return CSR_SystemZ_SwiftError_RegMask;
   return CSR_SystemZ_RegMask;
@@ -152,6 +236,72 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
 }
 
+bool SystemZRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC,
+                                  LiveIntervals &LIS) const {
+  assert (MI->isCopy() && "Only expecting COPY instructions");
+
+  // Coalesce anything which is not a COPY involving a subreg to/from GR128.
+  if (!(NewRC->hasSuperClassEq(&SystemZ::GR128BitRegClass) &&
+        (getRegSizeInBits(*SrcRC) <= 64 || getRegSizeInBits(*DstRC) <= 64)))
+    return true;
+
+  // Allow coalescing of a GR128 subreg COPY only if the live ranges are small
+  // and local to one MBB with not too many interfering registers. Otherwise
+  // regalloc may run out of registers.
+
+  unsigned WideOpNo = (getRegSizeInBits(*SrcRC) == 128 ? 1 : 0);
+  unsigned GR128Reg = MI->getOperand(WideOpNo).getReg();
+  unsigned GRNarReg = MI->getOperand((WideOpNo == 1) ? 0 : 1).getReg();
+  LiveInterval &IntGR128 = LIS.getInterval(GR128Reg);
+  LiveInterval &IntGRNar = LIS.getInterval(GRNarReg);
+
+  // Check that the two virtual registers are local to MBB.
+  MachineBasicBlock *MBB = MI->getParent();
+  if (LIS.isLiveInToMBB(IntGR128, MBB) || LIS.isLiveOutOfMBB(IntGR128, MBB) ||
+      LIS.isLiveInToMBB(IntGRNar, MBB) || LIS.isLiveOutOfMBB(IntGRNar, MBB))
+    return false;
+
+  // Find the first and last MIs of the registers.
+  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
+  if (WideOpNo == 1) {
+    FirstMI = LIS.getInstructionFromIndex(IntGR128.beginIndex());
+    LastMI  = LIS.getInstructionFromIndex(IntGRNar.endIndex());
+  } else {
+    FirstMI = LIS.getInstructionFromIndex(IntGRNar.beginIndex());
+    LastMI  = LIS.getInstructionFromIndex(IntGR128.endIndex());
+  }
+  assert (FirstMI && LastMI && "No instruction from index?");
+
+  // Check if coalescing seems safe by finding the set of clobbered physreg
+  // pairs in the region.
+  BitVector PhysClobbered(getNumRegs());
+  MachineBasicBlock::iterator MII = FirstMI, MEE = LastMI;
+  MEE++;
+  for (; MII != MEE; ++MII) {
+    for (const MachineOperand &MO : MII->operands())
+      if (MO.isReg() && isPhysicalRegister(MO.getReg())) {
+        for (MCSuperRegIterator SI(MO.getReg(), this, true/*IncludeSelf*/);
+             SI.isValid(); ++SI)
+          if (NewRC->contains(*SI)) {
+            PhysClobbered.set(*SI);
+            break;
+          }
+      }
+  }
+
+  // Demand an arbitrary margin of free regs.
+  unsigned const DemandedFreeGR128 = 3;
+  if (PhysClobbered.count() > (NewRC->getNumRegs() - DemandedFreeGR128))
+    return false;
+
+  return true;
+}
+
 unsigned
 SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const SystemZFrameLowering *TFI = getFrameLowering(MF);
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index e41c06c98af2..8787a90b1e25 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -11,13 +11,15 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
 
 #include "SystemZ.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 
 #define GET_REGINFO_HEADER
 #include "SystemZGenRegisterInfo.inc"
 
 namespace llvm {
 
+class LiveIntervals;
+
 namespace SystemZ {
 // Return the subreg to use for referring to the even and odd registers
 // in a GR128 pair.  Is32Bit says whether we want a GR32 or GR64.
@@ -42,6 +44,15 @@ public:
     return &SystemZ::ADDR64BitRegClass;
   }
 
+  bool getRegAllocationHints(unsigned VirtReg,
+                             ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF,
+                             const VirtRegMap *VRM,
+                             const LiveRegMatrix *Matrix) const override;
+
+  bool enableMultipleCopyHints() const override { return true; }
+
   // Override TargetRegisterInfo.h.
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
@@ -59,6 +70,16 @@ public:
   void eliminateFrameIndex(MachineBasicBlock::iterator MI,
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS) const override;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+ bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC,
+                      LiveIntervals &LIS) const override;
+
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 };
 
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 52ba1a584017..a1cfaf699401 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -65,6 +65,7 @@ class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
  : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
   let SubRegIndices = [subreg_l32, subreg_h32];
+  let CoveredBySubRegs = 1;
 }
 
 // 8 even-odd pairs of GPR64s.
@@ -72,6 +73,7 @@ class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
  : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
   let SubRegIndices = [subreg_l64, subreg_h64];
+  let CoveredBySubRegs = 1;
 }
 
 // General-purpose registers
@@ -194,6 +196,7 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
  : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
   let SubRegIndices = [subreg_l64, subreg_h64];
+  let CoveredBySubRegs = 1;
 }
 
 // Floating-point registers.  Registers 16-31 require the vector facility.
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 13ceb371a425..195fa20a2c90 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 
 using namespace llvm;
 
@@ -309,7 +309,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
 }
 
 bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
-  if (skipFunction(*F.getFunction()))
+  if (skipFunction(F.getFunction()))
     return false;
 
   const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>();
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 4829f73e080e..8285b4277d11 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -20,8 +20,8 @@
 #include "SystemZRegisterInfo.h"
 #include "SystemZSelectionDAGInfo.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
@@ -91,6 +91,11 @@ public:
     return &TSInfo;
   }
 
+  // True if the subtarget should run MachineScheduler after aggressive
+  // coalescing. This currently replaces the SelectionDAG scheduler with the
+  // "source" order scheduler.
+  bool enableMachineScheduler() const override { return true; }
+
   // This is important for reducing register pressure in vector code.
   bool useAA() const override { return true; }
 
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 025bf73d2df0..e74d68182949 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -18,12 +18,12 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Transforms/Scalar.h"
 #include <string>
 
@@ -99,14 +99,54 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
   return *RM;
 }
 
+// For SystemZ we define the models as follows:
+//
+// Small:  BRASL can call any function and will use a stub if necessary.
+//         Locally-binding symbols will always be in range of LARL.
+//
+// Medium: BRASL can call any function and will use a stub if necessary.
+//         GOT slots and locally-defined text will always be in range
+//         of LARL, but other symbols might not be.
+//
+// Large:  Equivalent to Medium for now.
+//
+// Kernel: Equivalent to Medium for now.
+//
+// This means that any PIC module smaller than 4GB meets the
+// requirements of Small, so Small seems like the best default there.
+//
+// All symbols bind locally in a non-PIC module, so the choice is less
+// obvious.  There are two cases:
+//
+// - When creating an executable, PLTs and copy relocations allow
+//   us to treat external symbols as part of the executable.
+//   Any executable smaller than 4GB meets the requirements of Small,
+//   so that seems like the best default.
+//
+// - When creating JIT code, stubs will be in range of BRASL if the
+//   image is less than 4GB in size.  GOT entries will likewise be
+//   in range of LARL.  However, the JIT environment has no equivalent
+//   of copy relocs, so locally-binding data symbols might not be in
+//   the range of LARL.  We need the Medium model in that case.
+static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
+                                              Reloc::Model RM, bool JIT) {
+  if (CM)
+    return *CM;
+  if (JIT)
+    return RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+  return CodeModel::Small;
+}
+
 SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
                                            Optional<Reloc::Model> RM,
-                                           CodeModel::Model CM,
-                                           CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
-                        getEffectiveRelocModel(RM), CM, OL),
+                                           Optional<CodeModel::Model> CM,
+                                           CodeGenOpt::Level OL, bool JIT)
+    : LLVMTargetMachine(
+          T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
+          getEffectiveRelocModel(RM),
+          getEffectiveCodeModel(CM, getEffectiveRelocModel(RM), JIT), OL),
       TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index a10ca64fa632..95ad5e339e0b 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -32,8 +32,8 @@ class SystemZTargetMachine : public LLVMTargetMachine {
 public:
   SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
-                       Optional<Reloc::Model> RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
+                       Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                       CodeGenOpt::Level OL, bool JIT);
   ~SystemZTargetMachine() override;
 
   const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 506dc7427993..37c55c4e3889 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -17,10 +17,10 @@
 #include "SystemZTargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "systemztti"
@@ -292,6 +292,19 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   UP.Force = true;
 }
 
+
+bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                                   TargetTransformInfo::LSRCost &C2) {
+  // SystemZ specific: check instruction count (first), and don't care about
+  // ImmCost, since offsets are checked explicitly.
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.SetupCost) <
+    std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+             C2.NumIVMuls, C2.NumBaseAdds,
+             C2.ScaleCost, C2.SetupCost);
+}
+
 unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
   if (!Vector)
     // Discount the stack pointer.  Also leave out %r0, since it can't
@@ -310,6 +323,11 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
   return 0;
 }
 
+bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+  EVT VT = TLI->getValueType(DL, DataType);
+  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,  
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index a0c6fa94f8c1..4b11a6f0a837 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -48,6 +48,8 @@ public:
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                     TargetTransformInfo::LSRCost &C2);
   /// @}
 
   /// \name Vector TTI Implementations
@@ -60,7 +62,9 @@ public:
   unsigned getPrefetchDistance() { return 2000; }
   unsigned getMinPrefetchStride() { return 2048; }
 
+  bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
+  bool LSRWithInstrQueries() { return true; }
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index d3c53a43b391..e2b9efd35d3e 100644
--- a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -18,6 +18,6 @@ Target &llvm::getTheSystemZTarget() {
 }
 
 extern "C" void LLVMInitializeSystemZTargetInfo() {
-  RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(getTheSystemZTarget(),
-                                                     "systemz", "SystemZ");
+  RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(
+      getTheSystemZTarget(), "systemz", "SystemZ", "SystemZ");
 }