| author | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000 |
| commit | 71d5a2540a98c81f5bcaeb48805e0e2881f530ef (patch) | |
| tree | 5343938942df402b49ec7300a1c25a2d4ccd5821 /lib/Target/SystemZ | |
| parent | 31bbf64f3a4974a2d6c8b3b27ad2f519caf74057 (diff) | |
Diffstat (limited to 'lib/Target/SystemZ'): 21 files changed, 988 insertions(+), 226 deletions(-)
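The bulk of this patch is mechanical cleanup applied across the whole backend: MachineInstrBuilder calls are migrated from `.addOperand(MO)` to the shorter `.add(MO)`, hand-written empty constructors and destructors become `= default`, and member initialization moves in-class. The snippet below is a minimal self-contained mock, not the real LLVM classes, that illustrates both idioms as they appear in the hunks that follow.

```cpp
// Mock sketch of the two cleanups this patch applies (assumed names; the
// real types live in llvm/CodeGen/MachineInstrBuilder.h and the SystemZ files).
#include <iostream>
#include <vector>

struct MockOperand { int Value = 0; };

class MockInstrBuilder {
  std::vector<MockOperand> Ops;
public:
  // Old spelling, shown only as what the patch replaces.
  MockInstrBuilder &addOperand(const MockOperand &MO) { return add(MO); }
  // New, shorter spelling used throughout the updated code.
  MockInstrBuilder &add(const MockOperand &MO) {
    Ops.push_back(MO);
    return *this;
  }
  size_t size() const { return Ops.size(); }
};

// Mirrors the Reference struct change in SystemZElimCompare.cpp:
// default member initializers replace the explicit constructor body.
struct Reference {
  Reference() = default;
  bool Def = false;
  bool Use = false;
};

int main() {
  MockOperand A{1}, B{2};
  MockInstrBuilder MIB;
  MIB.add(A).add(B); // chained, as in MIB.add(MI.getOperand(0)).add(Target)
  Reference R;
  std::cout << MIB.size() << ' ' << R.Def << '\n'; // prints: 2 0
}
```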
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index a94717c93456..3f91ca9035a6 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -8,16 +8,31 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SystemZMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <memory> +#include <string> using namespace llvm; @@ -31,6 +46,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) { } namespace { + enum RegisterKind { GR32Reg, GRH32Reg, @@ -56,7 +72,6 @@ enum MemoryKind { }; class SystemZOperand : public MCParsedAsmOperand { -public: private: enum OperandKind { KindInvalid, @@ -140,12 +155,14 @@ public: SMLoc EndLoc) { return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc); } + static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) { auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc); Op->Token.Data = Str.data(); Op->Token.Length = Str.size(); return Op; } + static std::unique_ptr<SystemZOperand> createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) { auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc); @@ -153,12 +170,14 @@ public: Op->Reg.Num = Num; return Op; } + static std::unique_ptr<SystemZOperand> createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) { auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc); Op->Imm = Expr; return Op; } + static std::unique_ptr<SystemZOperand> createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base, const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm, @@ -175,6 +194,7 @@ public: Op->Mem.Length.Reg = LengthReg; return Op; } + static std::unique_ptr<SystemZOperand> createImmTLS(const MCExpr *Imm, const MCExpr *Sym, SMLoc StartLoc, SMLoc EndLoc) { @@ -503,6 +523,7 @@ public: return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true); } }; + } // end anonymous namespace #define GET_REGISTER_MATCHER diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 1806e015f61e..a281a0aa6bcc 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -7,12 +7,16 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/SystemZMCTargetDesc.h" #include "SystemZ.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include 
<cassert> +#include <cstdint> using namespace llvm; @@ -21,17 +25,19 @@ using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { + class SystemZDisassembler : public MCDisassembler { public: SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - ~SystemZDisassembler() override {} + ~SystemZDisassembler() override = default; DecodeStatus getInstruction(MCInst &instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; }; + } // end anonymous namespace static MCDisassembler *createSystemZDisassembler(const Target &T, diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp index 1207c7b327e8..6cd12e13e220 100644 --- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp @@ -1,4 +1,4 @@ -//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===// +//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===// // // The LLVM Compiler Infrastructure // @@ -10,10 +10,13 @@ #include "SystemZInstPrinter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> using namespace llvm; diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h index 6336f5ee0efa..d65c661545eb 100644 --- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h @@ -15,8 +15,10 @@ #define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" +#include <cstdint> namespace llvm { + class MCOperand; class SystemZInstPrinter : public MCInstPrinter { @@ -70,6 +72,7 @@ private: // This forms part of the instruction name rather than the operand list. 
void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O); }; + } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 9192448afd04..23b7d5b5d501 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -51,7 +51,7 @@ public: } const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsPCRel, MCContext &Ctx) const override; bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -91,7 +91,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, - bool IsPCRel) const { + bool IsPCRel, MCContext &Ctx) const { MCFixupKind Kind = Fixup.getKind(); unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 7082abad716d..092eb4011adc 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -11,20 +11,28 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/SystemZMCTargetDesc.h" #include "MCTargetDesc/SystemZMCFixups.h" +#include "MCTargetDesc/SystemZMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> using namespace llvm; #define DEBUG_TYPE "mccodeemitter" namespace { + class SystemZMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; MCContext &Ctx; @@ -34,7 +42,7 @@ public: : MCII(mcii), Ctx(ctx) { } - ~SystemZMCCodeEmitter() override {} + ~SystemZMCCodeEmitter() override = default; // OVerride MCCodeEmitter. 
void encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -137,13 +145,8 @@ private: void verifyInstructionPredicates(const MCInst &MI, uint64_t AvailableFeatures) const; }; -} // end anonymous namespace -MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new SystemZMCCodeEmitter(MCII, Ctx); -} +} // end anonymous namespace void SystemZMCCodeEmitter:: encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -282,3 +285,9 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, #define ENABLE_INSTR_PREDICATE_VERIFIER #include "SystemZGenMCCodeEmitter.inc" + +MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new SystemZMCCodeEmitter(MCII, Ctx); +} diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp index 43a96e84289c..3de570bf30cc 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp @@ -7,35 +7,38 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/SystemZMCTargetDesc.h" #include "MCTargetDesc/SystemZMCFixups.h" +#include "MCTargetDesc/SystemZMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> using namespace llvm; namespace { + class SystemZObjectWriter : public MCELFObjectTargetWriter { public: SystemZObjectWriter(uint8_t OSABI); - - ~SystemZObjectWriter() override; + ~SystemZObjectWriter() override = default; protected: // Override MCELFObjectTargetWriter. unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; + } // end anonymous namespace SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390, /*HasRelocationAddend=*/ true) {} -SystemZObjectWriter::~SystemZObjectWriter() { -} - // Return the relocation type for an absolute value of MCFixupKind Kind. 
static unsigned getAbsoluteReloc(unsigned Kind) { switch (Kind) { diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index b4c843f658aa..d70f9e90cd3e 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -13,15 +13,23 @@ // //===----------------------------------------------------------------------===// +#include "SystemZ.h" +#include "SystemZInstrInfo.h" #include "SystemZTargetMachine.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <cstdint> using namespace llvm; @@ -33,11 +41,11 @@ STATISTIC(EliminatedComparisons, "Number of eliminated comparisons"); STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions"); namespace { + // Represents the references to a particular register in one or more // instructions. struct Reference { - Reference() - : Def(false), Use(false) {} + Reference() = default; Reference &operator|=(const Reference &Other) { Def |= Other.Def; @@ -49,15 +57,16 @@ struct Reference { // True if the register is defined or used in some form, either directly or // via a sub- or super-register. - bool Def; - bool Use; + bool Def = false; + bool Use = false; }; class SystemZElimCompare : public MachineFunctionPass { public: static char ID; + SystemZElimCompare(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {} + : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "SystemZ Comparison Elimination"; @@ -65,6 +74,7 @@ public: bool processBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -84,16 +94,13 @@ private: bool fuseCompareOperations(MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers); - const SystemZInstrInfo *TII; - const TargetRegisterInfo *TRI; + const SystemZInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; }; char SystemZElimCompare::ID = 0; -} // end anonymous namespace -FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) { - return new SystemZElimCompare(TM); -} +} // end anonymous namespace // Return true if CC is live out of MBB. 
static bool isCCLiveOut(MachineBasicBlock &MBB) { @@ -167,7 +174,7 @@ static unsigned getCompareSourceReg(MachineInstr &Compare) { reg = Compare.getOperand(0).getReg(); else if (isLoadAndTestAsCmp(Compare)) reg = Compare.getOperand(1).getReg(); - assert (reg); + assert(reg); return reg; } @@ -216,9 +223,7 @@ bool SystemZElimCompare::convertToBRCT( Branch->RemoveOperand(0); Branch->setDesc(TII->get(BRCT)); MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch); - MIB.addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(Target); + MIB.add(MI.getOperand(0)).add(MI.getOperand(1)).add(Target); // Add a CC def to BRCT(G), since we may have to split them again if the // branch displacement overflows. BRCTH has a 32-bit displacement, so // this is not necessary there. @@ -261,10 +266,10 @@ bool SystemZElimCompare::convertToLoadAndTrap( Branch->RemoveOperand(0); Branch->setDesc(TII->get(LATOpcode)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addOperand(MI.getOperand(3)); + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)); MI.eraseFromParent(); return true; } @@ -368,10 +373,8 @@ static bool isCompareZero(MachineInstr &Compare) { return true; default: - if (isLoadAndTestAsCmp(Compare)) return true; - return Compare.getNumExplicitOperands() == 2 && Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0; } @@ -502,15 +505,15 @@ bool SystemZElimCompare::fuseCompareOperations( Branch->setDesc(TII->get(FusedOpcode)); MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch); for (unsigned I = 0; I < SrcNOps; I++) - MIB.addOperand(Compare.getOperand(I)); - MIB.addOperand(CCMask); + MIB.add(Compare.getOperand(I)); + MIB.add(CCMask); if (Type == SystemZII::CompareAndBranch) { // Only conditional branches define CC, as they may be converted back // to a non-fused branch because of a long displacement. Conditional // returns don't have that problem. - MIB.addOperand(Target) - .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead); + MIB.add(Target).addReg(SystemZ::CC, + RegState::ImplicitDefine | RegState::Dead); } if (Type == SystemZII::CompareAndSibcall) @@ -573,3 +576,7 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) { return Changed; } + +FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) { + return new SystemZElimCompare(TM); +} diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2d0a06af18ae..84d3c7bed50a 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -194,6 +194,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, VT, Custom); // Only z196 and above have native support for conversions to unsigned. + // On z10, promoting to i64 doesn't generate an inexact condition for + // values that are outside the i32 range but in the i64 range, so use + // the default expansion. if (!Subtarget.hasFPExtension()) setOperationAction(ISD::FP_TO_UINT, VT, Expand); } @@ -344,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // There should be no need to check for float types other than v2f64 // since <2 x f32> isn't a legal type. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal); } // Handle floating-point types. @@ -2789,8 +2796,9 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op, // but we need this case for bitcasts that are created during lowering // and which are then lowered themselves. if (auto *LoadN = dyn_cast<LoadSDNode>(In)) - return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(), - LoadN->getMemOperand()); + if (ISD::isNormalLoad(LoadN)) + return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(), + LoadN->getMemOperand()); if (InVT == MVT::i32 && ResVT == MVT::f32) { SDValue In64; @@ -3802,7 +3810,7 @@ namespace { struct GeneralShuffle { GeneralShuffle(EVT vt) : VT(vt) {} void addUndef(); - void add(SDValue, unsigned); + bool add(SDValue, unsigned); SDValue getNode(SelectionDAG &, const SDLoc &); // The operands of the shuffle. @@ -3828,8 +3836,10 @@ void GeneralShuffle::addUndef() { // Add an extra element to the shuffle, taking it from element Elem of Op. // A null Op indicates a vector input whose value will be calculated later; // there is at most one such input per shuffle and it always has the same -// type as the result. -void GeneralShuffle::add(SDValue Op, unsigned Elem) { +// type as the result. Aborts and returns false if the source vector elements +// of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per +// LLVM they become implicitly extended, but this is rare and not optimized. +bool GeneralShuffle::add(SDValue Op, unsigned Elem) { unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); // The source vector can have wider elements than the result, @@ -3837,8 +3847,12 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) { // We want the least significant part. EVT FromVT = Op.getNode() ? Op.getValueType() : VT; unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize(); - assert(FromBytesPerElement >= BytesPerElement && - "Invalid EXTRACT_VECTOR_ELT"); + + // Return false if the source elements are smaller than their destination + // elements. + if (FromBytesPerElement < BytesPerElement) + return false; + unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes + (FromBytesPerElement - BytesPerElement)); @@ -3856,13 +3870,13 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) { break; if (NewByte < 0) { addUndef(); - return; + return true; } Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes); Byte = unsigned(NewByte) % SystemZ::VectorBytes; } else if (Op.isUndef()) { addUndef(); - return; + return true; } else break; } @@ -3879,6 +3893,8 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) { unsigned Base = OpNo * SystemZ::VectorBytes + Byte; for (unsigned I = 0; I < BytesPerElement; ++I) Bytes.push_back(Base + I); + + return true; } // Return SDNodes for the completed shuffle. 
@@ -4110,12 +4126,14 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op.getOperand(1).getOpcode() == ISD::Constant) { unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - GS.add(Op.getOperand(0), Elem); + if (!GS.add(Op.getOperand(0), Elem)) + return SDValue(); FoundOne = true; } else if (Op.isUndef()) { GS.addUndef(); } else { - GS.add(SDValue(), ResidueOps.size()); + if (!GS.add(SDValue(), ResidueOps.size())) + return SDValue(); ResidueOps.push_back(BVN->getOperand(I)); } } @@ -4354,9 +4372,9 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, int Elt = VSN->getMaskElt(I); if (Elt < 0) GS.addUndef(); - else - GS.add(Op.getOperand(unsigned(Elt) / NumElements), - unsigned(Elt) % NumElements); + else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements), + unsigned(Elt) % NumElements)) + return SDValue(); } return GS.getNode(DAG, SDLoc(VSN)); } @@ -4722,9 +4740,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { } // Return true if VT is a vector whose elements are a whole number of bytes -// in width. -static bool canTreatAsByteVector(EVT VT) { - return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0; +// in width. Also check for presence of vector support. +bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const { + if (!Subtarget.hasVector()) + return false; + + return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple(); } // Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT @@ -4986,6 +5007,10 @@ SDValue SystemZTargetLowering::combineSTORE( SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT( SDNode *N, DAGCombinerInfo &DCI) const { + + if (!Subtarget.hasVector()) + return SDValue(); + // Try to simplify a vector extraction. if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) { SDValue Op0 = N->getOperand(0); @@ -5233,7 +5258,7 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base, unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg) - .addOperand(Base) + .add(Base) .addImm(0) .addReg(0); return Reg; @@ -5322,8 +5347,11 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, if (Invert) CCMask ^= CCValid; BuildMI(*MBB, MI, DL, TII->get(STOCOpcode)) - .addReg(SrcReg).addOperand(Base).addImm(Disp) - .addImm(CCValid).addImm(CCMask); + .addReg(SrcReg) + .add(Base) + .addImm(Disp) + .addImm(CCValid) + .addImm(CCMask); MI.eraseFromParent(); return MBB; } @@ -5350,7 +5378,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, // # fallthrough to JoinMBB MBB = FalseMBB; BuildMI(MBB, DL, TII->get(StoreOpcode)) - .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg); + .addReg(SrcReg) + .add(Base) + .addImm(Disp) + .addReg(IndexReg); MBB->addSuccessor(JoinMBB); MI.eraseFromParent(); @@ -5415,8 +5446,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( // %OrigVal = L Disp(%Base) // # fall through to LoopMMB MBB = StartMBB; - BuildMI(MBB, DL, TII->get(LOpcode), OrigVal) - .addOperand(Base).addImm(Disp).addReg(0); + BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0); MBB->addSuccessor(LoopMBB); // LoopMBB: @@ -5437,8 +5467,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( if (Invert) { // Perform the operation normally and then invert every bit of the field. 
unsigned Tmp = MRI.createVirtualRegister(RC); - BuildMI(MBB, DL, TII->get(BinOpcode), Tmp) - .addReg(RotatedOldVal).addOperand(Src2); + BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2); if (BitSize <= 32) // XILF with the upper BitSize bits set. BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal) @@ -5454,7 +5483,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( } else if (BinOpcode) // A simply binary operation. BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal) - .addReg(RotatedOldVal).addOperand(Src2); + .addReg(RotatedOldVal) + .add(Src2); else if (IsSubWord) // Use RISBG to rotate Src2 into position and use it to replace the // field in RotatedOldVal. @@ -5465,7 +5495,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); BuildMI(MBB, DL, TII->get(CSOpcode), Dest) - .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp); + .addReg(OldVal) + .addReg(NewVal) + .add(Base) + .addImm(Disp); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB); MBB->addSuccessor(LoopMBB); @@ -5533,8 +5566,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( // %OrigVal = L Disp(%Base) // # fall through to LoopMMB MBB = StartMBB; - BuildMI(MBB, DL, TII->get(LOpcode), OrigVal) - .addOperand(Base).addImm(Disp).addReg(0); + BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0); MBB->addSuccessor(LoopMBB); // LoopMBB: @@ -5581,7 +5613,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); BuildMI(MBB, DL, TII->get(CSOpcode), Dest) - .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp); + .addReg(OldVal) + .addReg(NewVal) + .add(Base) + .addImm(Disp); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB); MBB->addSuccessor(LoopMBB); @@ -5642,7 +5677,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, // # fall through to LoopMMB MBB = StartMBB; BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal) - .addOperand(Base).addImm(Disp).addReg(0); + .add(Base) + .addImm(Disp) + .addReg(0); MBB->addSuccessor(LoopMBB); // LoopMBB: @@ -5696,7 +5733,10 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal) .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize); BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal) - .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp); + .addReg(OldVal) + .addReg(StoreVal) + .add(Base) + .addImm(Disp); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB); MBB->addSuccessor(LoopMBB); @@ -5869,7 +5909,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( if (!isUInt<12>(DestDisp)) { unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .addOperand(DestBase) + .add(DestBase) .addImm(DestDisp) .addReg(0); DestBase = MachineOperand::CreateReg(Reg, false); @@ -5878,15 +5918,18 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( if (!isUInt<12>(SrcDisp)) { unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - 
.addOperand(SrcBase) + .add(SrcBase) .addImm(SrcDisp) .addReg(0); SrcBase = MachineOperand::CreateReg(Reg, false); SrcDisp = 0; } BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength) - .addOperand(SrcBase).addImm(SrcDisp); + .add(DestBase) + .addImm(DestDisp) + .addImm(ThisLength) + .add(SrcBase) + .addImm(SrcDisp); DestDisp += ThisLength; SrcDisp += ThisLength; Length -= ThisLength; diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 7a21a474c119..7d92a7355877 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -537,6 +537,7 @@ private: unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; + bool canTreatAsByteVector(EVT VT) const; SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, unsigned Index, DAGCombinerInfo &DCI, bool Force) const; diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 3565d5f2c49c..c8ff9558cc88 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -11,12 +11,33 @@ // //===----------------------------------------------------------------------===// -#include "SystemZInstrInfo.h" +#include "MCTargetDesc/SystemZMCTargetDesc.h" +#include "SystemZ.h" #include "SystemZInstrBuilder.h" -#include "SystemZTargetMachine.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "SystemZInstrInfo.h" +#include "SystemZSubtarget.h" +#include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; @@ -58,12 +79,25 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI); MBB->insert(MI, EarlierMI); - // Set up the two 64-bit registers. + // Set up the two 64-bit registers and remember super reg and its flags. MachineOperand &HighRegOp = EarlierMI->getOperand(0); MachineOperand &LowRegOp = MI->getOperand(0); + unsigned Reg128 = LowRegOp.getReg(); + unsigned Reg128Killed = getKillRegState(LowRegOp.isKill()); + unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef()); HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64)); LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64)); + if (MI->mayStore()) { + // Add implicit uses of the super register in case one of the subregs is + // undefined. We could track liveness and skip storing an undefined + // subreg, but this is hopefully rare (discovered with llvm-stress). + // If Reg128 was killed, set kill flag on MI. 
+ unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit); + MachineInstrBuilder(MF, EarlierMI).addReg(Reg128, Reg128UndefImpl); + MachineInstrBuilder(MF, MI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed)); + } + // The address in the first (high) instruction is already correct. // Adjust the offset in the second (low) instruction. MachineOperand &HighOffsetOp = EarlierMI->getOperand(2); @@ -131,7 +165,8 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode, MI.setDesc(get(LowOpcodeK)); else { emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg, - SystemZ::LR, 32, MI.getOperand(1).isKill()); + SystemZ::LR, 32, MI.getOperand(1).isKill(), + MI.getOperand(1).isUndef()); MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode)); MI.getOperand(1).setReg(DestReg); MI.tieOperands(0, 1); @@ -185,9 +220,15 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode, // are low registers, otherwise use RISB[LH]G. void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned Size) const { - emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), - MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode, - Size, MI.getOperand(1).isKill()); + MachineInstrBuilder MIB = + emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode, + Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef()); + + // Keep the remaining operands as-is. + for (unsigned I = 2; I < MI.getNumOperands(); ++I) + MIB.add(MI.getOperand(I)); + MI.eraseFromParent(); } @@ -227,11 +268,13 @@ void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { // are low registers, otherwise use RISB[LH]G. Size is the number of bits // taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR). // KillSrc is true if this move is the last use of SrcReg. -void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, unsigned LowLowOpcode, - unsigned Size, bool KillSrc) const { +MachineInstrBuilder +SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, unsigned LowLowOpcode, + unsigned Size, bool KillSrc, + bool UndefSrc) const { unsigned Opcode; bool DestIsHigh = isHighReg(DestReg); bool SrcIsHigh = isHighReg(SrcReg); @@ -242,18 +285,16 @@ void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB, else if (!DestIsHigh && SrcIsHigh) Opcode = SystemZ::RISBLH; else { - BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; + return BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc)); } unsigned Rotate = (DestIsHigh != SrcIsHigh ? 
32 : 0); - BuildMI(MBB, MBBI, DL, get(Opcode), DestReg) + return BuildMI(MBB, MBBI, DL, get(Opcode), DestReg) .addReg(DestReg, RegState::Undef) - .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc)) .addImm(32 - Size).addImm(128 + 31).addImm(Rotate); } - MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, @@ -282,7 +323,6 @@ MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI, } } - // If MI is a simple load or store for a frame object, return the register // it loads or stores and set FrameIndex to the index of the frame object. // Return 0 otherwise. @@ -586,7 +626,6 @@ bool SystemZInstrInfo::optimizeCompareInstr( removeIPMBasedCompare(Compare, SrcReg, MRI, &RI); } - bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Pred, unsigned TrueReg, unsigned FalseReg, @@ -640,6 +679,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB, else { Opc = SystemZ::LOCR; MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass); + unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg); + BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg); + TrueReg = TReg; + FalseReg = FReg; } } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC)) Opc = SystemZ::LOCGR; @@ -706,7 +751,7 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } -bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const { +bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); if (Opcode == SystemZ::Return || Opcode == SystemZ::Trap || @@ -780,10 +825,11 @@ bool SystemZInstrInfo::PredicateInstruction( MI.RemoveOperand(0); MI.setDesc(get(SystemZ::CallBRCL)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(CCValid).addImm(CCMask) - .addOperand(FirstOp) - .addRegMask(RegMask) - .addReg(SystemZ::CC, RegState::Implicit); + .addImm(CCValid) + .addImm(CCMask) + .add(FirstOp) + .addRegMask(RegMask) + .addReg(SystemZ::CC, RegState::Implicit); return true; } if (Opcode == SystemZ::CallBR) { @@ -813,7 +859,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) { - emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc); + emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc, + false); return; } @@ -888,15 +935,19 @@ static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) { } namespace { + struct LogicOp { - LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {} + LogicOp() = default; LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize) : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {} explicit operator bool() const { return RegSize; } - unsigned RegSize, ImmLSB, ImmSize; + unsigned RegSize = 0; + unsigned ImmLSB = 0; + unsigned ImmSize = 0; }; + } // end anonymous namespace static LogicOp interpretAndImmediate(unsigned Opcode) { @@ -976,12 +1027,12 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress( MachineInstrBuilder MIB( *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(), /*NoImplicit=*/true)); - MIB.addOperand(Dest); + MIB.add(Dest); // Keep the kill state, but drop the tied flag. 
MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) - MIB.addOperand(MI.getOperand(I)); + MIB.add(MI.getOperand(I)); MBB->insert(MI, MIB); return finishConvertToThreeAddress(&MI, MIB, LV); } @@ -1009,7 +1060,7 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress( MachineOperand &Src = MI.getOperand(1); MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode)) - .addOperand(Dest) + .add(Dest) .addReg(0) .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()) @@ -1040,7 +1091,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MCRegUnitIterator CCUnit(SystemZ::CC, TRI); LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit); ++CCUnit; - assert (!CCUnit.isValid() && "CC only has one reg unit."); + assert(!CCUnit.isValid() && "CC only has one reg unit."); SlotIndex MISlot = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot(); if (!CCLiveRange.liveAt(MISlot)) { @@ -1091,7 +1142,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD; return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(StoreOpcode)) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addFrameIndex(FrameIndex) .addImm(0) .addReg(0); @@ -1100,12 +1151,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( // destination register instead. if (OpNum == 1) { unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD; - unsigned Dest = MI.getOperand(0).getReg(); return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), - get(LoadOpcode), Dest) - .addFrameIndex(FrameIndex) - .addImm(0) - .addReg(0); + get(LoadOpcode)) + .add(MI.getOperand(0)) + .addFrameIndex(FrameIndex) + .addImm(0) + .addReg(0); } } @@ -1132,7 +1183,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( .addFrameIndex(FrameIndex) .addImm(0) .addImm(Size) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(MI.getOperand(2).getImm()) .addMemOperand(MMO); } @@ -1140,7 +1191,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) { return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(SystemZ::MVC)) - .addOperand(MI.getOperand(1)) + .add(MI.getOperand(1)) .addImm(MI.getOperand(2).getImm()) .addImm(Size) .addFrameIndex(FrameIndex) @@ -1164,7 +1215,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(MemOpcode)); for (unsigned I = 0; I < OpNum; ++I) - MIB.addOperand(MI.getOperand(I)); + MIB.add(MI.getOperand(I)); MIB.addFrameIndex(FrameIndex).addImm(Offset); if (MemDesc.TSFlags & SystemZII::HasIndex) MIB.addReg(0); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 794b193a501e..b8be1f5f3921 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -16,16 +16,22 @@ #include "SystemZ.h" #include "SystemZRegisterInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Target/TargetInstrInfo.h" +#include <cstdint> #define GET_INSTRINFO_HEADER #include "SystemZGenInstrInfo.inc" namespace llvm { -class SystemZTargetMachine; +class SystemZSubtarget; namespace SystemZII { + enum { // See comments in SystemZInstrFormats.td. 
SimpleBDXLoad = (1 << 0), @@ -43,12 +49,15 @@ enum { CCMaskLast = (1 << 19), IsLogical = (1 << 20) }; + static inline unsigned getAccessSize(unsigned int Flags) { return (Flags & AccessSizeMask) >> AccessSizeShift; } + static inline unsigned getCCValues(unsigned int Flags) { return (Flags & CCValuesMask) >> CCValuesShift; } + static inline unsigned getCompareZeroCCMask(unsigned int Flags) { return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift; } @@ -64,6 +73,7 @@ enum { // @INDNTPOFF MO_INDNTPOFF = (2 << 0) }; + // Classifies a branch. enum BranchType { // An instruction that branches on the current value of CC. @@ -93,6 +103,7 @@ enum BranchType { // the result is nonzero. BranchCTG }; + // Information about a branch instruction. struct Branch { // The type of the branch. @@ -111,6 +122,7 @@ struct Branch { const MachineOperand *target) : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {} }; + // Kinds of fused compares in compare-and-* instructions. Together with type // of the converted compare, this identifies the compare-and-* // instruction. @@ -127,9 +139,9 @@ enum FusedCompareType { // Trap CompareAndTrap }; + } // end namespace SystemZII -class SystemZSubtarget; class SystemZInstrInfo : public SystemZGenInstrInfo { const SystemZRegisterInfo RI; SystemZSubtarget &STI; @@ -149,9 +161,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned Size) const; void expandLoadStackGuard(MachineInstr *MI) const; - void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, - unsigned LowLowOpcode, unsigned Size, bool KillSrc) const; + + MachineInstrBuilder + emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + unsigned LowLowOpcode, unsigned Size, bool KillSrc, + bool UndefSrc) const; + virtual void anchor(); protected: @@ -203,7 +219,7 @@ public: unsigned FalseReg) const override; bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const override; - bool isPredicable(MachineInstr &MI) const override; + bool isPredicable(const MachineInstr &MI) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, BranchProbability Probability) const override; @@ -304,6 +320,7 @@ public: areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; }; + } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 738ea7a33729..0158fe6aec08 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -56,17 +56,28 @@ def : VectorExtractSubreg<v4i32, VLGVF>; //===----------------------------------------------------------------------===// let Predicates = [FeatureVector] in { - // Generate byte mask. - def VZERO : InherentVRIa<"vzero", 0xE744, 0>; - def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; - def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; - - // Generate mask. 
- def VGM : BinaryVRIbGeneric<"vgm", 0xE746>; - def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>; - def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>; - def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>; - def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>; + let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, + isReMaterializable = 1 in { + + // Generate byte mask. + def VZERO : InherentVRIa<"vzero", 0xE744, 0>; + def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; + def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; + + // Generate mask. + def VGM : BinaryVRIbGeneric<"vgm", 0xE746>; + def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>; + def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>; + def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>; + def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>; + + // Replicate immediate. + def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>; + def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; + def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; + def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; + def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; + } // Load element immediate. // @@ -86,13 +97,6 @@ let Predicates = [FeatureVector] in { def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert, v128g, v128g, imm64sx16, imm32zx1>; } - - // Replicate immediate. - def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>; - def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; - def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; - def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; - def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index 14ff6afbd4ae..791f0334e0f1 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -53,15 +53,21 @@ // //===----------------------------------------------------------------------===// +#include "SystemZ.h" +#include "SystemZInstrInfo.h" #include "SystemZTargetMachine.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> using namespace llvm; @@ -70,72 +76,72 @@ using namespace llvm; STATISTIC(LongBranches, "Number of long branches."); namespace { + // Represents positional information about a basic block. struct MBBInfo { // The address that we currently assume the block has. - uint64_t Address; + uint64_t Address = 0; // The size of the block in bytes, excluding terminators. // This value never changes. - uint64_t Size; + uint64_t Size = 0; // The minimum alignment of the block, as a log2 value. 
// This value never changes. - unsigned Alignment; + unsigned Alignment = 0; // The number of terminators in this block. This value never changes. - unsigned NumTerminators; + unsigned NumTerminators = 0; - MBBInfo() - : Address(0), Size(0), Alignment(0), NumTerminators(0) {} + MBBInfo() = default; }; // Represents the state of a block terminator. struct TerminatorInfo { // If this terminator is a relaxable branch, this points to the branch // instruction, otherwise it is null. - MachineInstr *Branch; + MachineInstr *Branch = nullptr; // The address that we currently assume the terminator has. - uint64_t Address; + uint64_t Address = 0; // The current size of the terminator in bytes. - uint64_t Size; + uint64_t Size = 0; // If Branch is nonnull, this is the number of the target block, // otherwise it is unused. - unsigned TargetBlock; + unsigned TargetBlock = 0; // If Branch is nonnull, this is the length of the longest relaxed form, // otherwise it is zero. - unsigned ExtraRelaxSize; + unsigned ExtraRelaxSize = 0; - TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0), - ExtraRelaxSize(0) {} + TerminatorInfo() = default; }; // Used to keep track of the current position while iterating over the blocks. struct BlockPosition { // The address that we assume this position has. - uint64_t Address; + uint64_t Address = 0; // The number of low bits in Address that are known to be the same // as the runtime address. unsigned KnownBits; - BlockPosition(unsigned InitialAlignment) - : Address(0), KnownBits(InitialAlignment) {} + BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {} }; class SystemZLongBranch : public MachineFunctionPass { public: static char ID; + SystemZLongBranch(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr) {} + : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "SystemZ Long Branch"; } bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -155,7 +161,7 @@ private: void relaxBranch(TerminatorInfo &Terminator); void relaxBranches(); - const SystemZInstrInfo *TII; + const SystemZInstrInfo *TII = nullptr; MachineFunction *MF; SmallVector<MBBInfo, 16> MBBs; SmallVector<TerminatorInfo, 16> Terminators; @@ -165,11 +171,8 @@ char SystemZLongBranch::ID = 0; const uint64_t MaxBackwardRange = 0x10000; const uint64_t MaxForwardRange = 0xfffe; -} // end anonymous namespace -FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) { - return new SystemZLongBranch(TM); -} +} // end anonymous namespace // Position describes the state immediately before Block. Update Block // accordingly and move Position to the end of the block's non-terminator @@ -354,13 +357,13 @@ void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI, MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(AddOpcode)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(-1); + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(-1); MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL)) - .addImm(SystemZ::CCMASK_ICMP) - .addImm(SystemZ::CCMASK_CMP_NE) - .addOperand(MI->getOperand(2)); + .addImm(SystemZ::CCMASK_ICMP) + .addImm(SystemZ::CCMASK_CMP_NE) + .add(MI->getOperand(2)); // The implicit use of CC is a killing use. 
BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo()); MI->eraseFromParent(); @@ -373,12 +376,12 @@ void SystemZLongBranch::splitCompareBranch(MachineInstr *MI, MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); BuildMI(*MBB, MI, DL, TII->get(CompareOpcode)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)); + .add(MI->getOperand(0)) + .add(MI->getOperand(1)); MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL)) - .addImm(SystemZ::CCMASK_ICMP) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)); + .addImm(SystemZ::CCMASK_ICMP) + .add(MI->getOperand(2)) + .add(MI->getOperand(3)); // The implicit use of CC is a killing use. BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo()); MI->eraseFromParent(); @@ -463,3 +466,7 @@ bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) { relaxBranches(); return true; } + +FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) { + return new SystemZLongBranch(TM); +} diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.h b/lib/Target/SystemZ/SystemZMachineScheduler.h index b919758b70e7..12357e0348a9 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -1,4 +1,4 @@ -//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==// +//==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -14,10 +14,10 @@ // usage of processor resources. //===----------------------------------------------------------------------===// -#include "SystemZInstrInfo.h" #include "SystemZHazardRecognizer.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include <set> #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H @@ -28,29 +28,29 @@ namespace llvm { /// A MachineSchedStrategy implementation for SystemZ post RA scheduling. class SystemZPostRASchedStrategy : public MachineSchedStrategy { - ScheduleDAGMI *DAG; + ScheduleDAGMI *DAG; /// A candidate during instruction evaluation. struct Candidate { - SUnit *SU; + SUnit *SU = nullptr; /// The decoding cost. - int GroupingCost; + int GroupingCost = 0; /// The processor resources cost. - int ResourcesCost; + int ResourcesCost = 0; - Candidate() : SU(nullptr), GroupingCost(0), ResourcesCost(0) {} + Candidate() = default; Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec); // Compare two candidates. bool operator<(const Candidate &other); // Check if this node is free of cost ("as good as any"). - bool inline noCost() { + bool noCost() const { return (GroupingCost <= 0 && !ResourcesCost); } - }; + }; // A sorter for the Available set that makes sure that SUs are considered // in the best order. @@ -83,7 +83,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { // region. SystemZHazardRecognizer HazardRec; - public: +public: SystemZPostRASchedStrategy(const MachineSchedContext *C); /// PostRA scheduling does not track pressure. 
@@ -107,6 +107,6 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { void releaseBottomNode(SUnit *SU) override {}; }; -} // namespace llvm +} // end namespace llvm -#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */ +#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index e97d61d8355d..7aee6f52e9a7 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -855,8 +855,8 @@ def : InstRW<[VecXsPm], (instregex "VZERO$")>; def : InstRW<[VecXsPm], (instregex "VONE$")>; def : InstRW<[VecXsPm], (instregex "VGBM$")>; def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; -def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; //===----------------------------------------------------------------------===// // Vector: Loads diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 83882fc0310a..263aff8b7bfb 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -167,10 +167,10 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { MI.RemoveOperand(0); MI.setDesc(TII->get(Opcode)); MachineInstrBuilder(*MI.getParent()->getParent(), &MI) - .addOperand(Dest) - .addOperand(Mode) - .addOperand(Src) - .addOperand(Suppress); + .add(Dest) + .add(Mode) + .add(Src) + .add(Suppress); return true; } return false; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index 33fdb8f90825..ede5005fa491 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -7,14 +7,25 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/SystemZMCTargetDesc.h" +#include "SystemZ.h" +#include "SystemZMachineScheduler.h" #include "SystemZTargetMachine.h" #include "SystemZTargetTransformInfo.h" -#include "SystemZMachineScheduler.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include <string> using namespace llvm; @@ -48,7 +59,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) { static std::string computeDataLayout(const Triple &TT, StringRef CPU, StringRef FS) { bool VectorABI = UsesVectorABI(CPU, FS); - std::string Ret = ""; + std::string Ret; // Big endian. 
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 33fdb8f90825..ede5005fa491 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -7,14 +7,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZ.h"
+#include "SystemZMachineScheduler.h"
 #include "SystemZTargetMachine.h"
 #include "SystemZTargetTransformInfo.h"
-#include "SystemZMachineScheduler.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include <string>
 
 using namespace llvm;
 
@@ -48,7 +59,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
                                      StringRef FS) {
   bool VectorABI = UsesVectorABI(CPU, FS);
-  std::string Ret = "";
+  std::string Ret;
 
   // Big endian.
   Ret += "E";
@@ -96,14 +107,15 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
                                            CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
                         getEffectiveRelocModel(RM), CM, OL),
-      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
 
-SystemZTargetMachine::~SystemZTargetMachine() {}
+SystemZTargetMachine::~SystemZTargetMachine() = default;
 
 namespace {
+
 /// SystemZ Code Generator Pass Configuration Options.
 class SystemZPassConfig : public TargetPassConfig {
 public:
@@ -116,7 +128,8 @@ public:
 
   ScheduleDAGInstrs *
   createPostMachineScheduler(MachineSchedContext *C) const override {
-    return new ScheduleDAGMI(C, make_unique<SystemZPostRASchedStrategy>(C),
+    return new ScheduleDAGMI(C,
+                             llvm::make_unique<SystemZPostRASchedStrategy>(C),
                              /*RemoveKillFlags=*/true);
   }
 
@@ -126,6 +139,7 @@ public:
   void addPreSched2() override;
   void addPreEmitPass() override;
 };
+
 } // end anonymous namespace
 
 void SystemZPassConfig::addIRPasses() {
@@ -157,7 +171,6 @@ void SystemZPassConfig::addPreSched2() {
 }
 
 void SystemZPassConfig::addPreEmitPass() {
-
   // Do instruction shortening before compare elimination because some
   // vector instructions will be shortened into opcodes that compare
   // elimination recognizes.
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 69cf9bc6e525..a10ca64fa632 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -1,4 +1,4 @@
-//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//=- SystemZTargetMachine.h - Define TargetMachine for SystemZ ----*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,15 +16,18 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
 
 #include "SystemZSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Target/TargetMachine.h"
+#include <memory>
 
 namespace llvm {
 
-class TargetFrameLowering;
-
 class SystemZTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
-  SystemZSubtarget Subtarget;
+  SystemZSubtarget Subtarget;
 
 public:
   SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -34,20 +37,22 @@ public:
   ~SystemZTargetMachine() override;
 
   const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
   const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
     return &Subtarget;
   }
+
   // Override LLVMTargetMachine
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
   TargetIRAnalysis getTargetIRAnalysis() override;
+
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
  }
 
   bool targetSchedulesPostRAScheduling() const override { return true; };
-
 };
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
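The two overrides above are the whole hook-up for a target-specific post-RA scheduler: targetSchedulesPostRAScheduling() returning true tells the generic pipeline to run the machine scheduler after register allocation, and createPostMachineScheduler() supplies the strategy. A minimal sketch of the same wiring for a hypothetical target (MyStrategy and MyPassConfig are illustrative names, not in-tree classes):

// Sketch under the usual TargetPassConfig assumptions.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetPassConfig.h"

using namespace llvm;

namespace {

// A do-nothing stand-in for SystemZPostRASchedStrategy.
struct MyStrategy : MachineSchedStrategy {
  void initialize(ScheduleDAGMI *DAG) override {}
  SUnit *pickNode(bool &IsTopNode) override { return nullptr; } // "done"
  void schedNode(SUnit *SU, bool IsTopNode) override {}
  void releaseTopNode(SUnit *SU) override {}
  void releaseBottomNode(SUnit *SU) override {}
};

class MyPassConfig : public TargetPassConfig {
public:
  using TargetPassConfig::TargetPassConfig;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    // RemoveKillFlags=true because post-RA reordering invalidates kill flags.
    return new ScheduleDAGMI(C, llvm::make_unique<MyStrategy>(C ? nullptr : nullptr, false ? nullptr : new MyStrategy() ? nullptr : nullptr) ? nullptr : llvm::make_unique<MyStrategy>(),
                             /*RemoveKillFlags=*/true);
  }
};

} // end anonymous namespace

(The ScheduleDAGMI constructor takes the context, a unique_ptr to the strategy, and the kill-flag policy; the target machine's targetSchedulesPostRAScheduling() override is what makes the pipeline call createPostMachineScheduler at all.)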
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b10c0e09a0d4..e74c9a80515d 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
       }
     }
     if (isa<StoreInst>(&I)) {
-      NumStores++;
       Type *MemAccessTy = I.getOperand(0)->getType();
-      if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
-         (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
-        NumStores++; // 128 bit fp/int stores get split.
+      NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
     }
   }
 
@@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
   return 0;
 }
+
+int SystemZTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty,
+    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+    TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo,
+    ArrayRef<const Value *> Args) {
+
+  // TODO: Return a good value for BB-VECTORIZER that includes the
+  // immediate loads, which we do not want to count for the loop
+  // vectorizer, since they are hopefully hoisted out of the loop. This
+  // would require a new parameter 'InLoop', but it is not clear that
+  // constant args are common enough to motivate it.
+
+  unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+  if (Ty->isVectorTy()) {
+    assert(ST->hasVector() &&
+           "getArithmeticInstrCost() called with vector type.");
+    unsigned VF = Ty->getVectorNumElements();
+    unsigned NumVectors = getNumberOfParts(Ty);
+
+    // These vector operations are custom handled, but are still supported
+    // with one instruction per vector, regardless of element size.
+    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
+        Opcode == Instruction::AShr) {
+      return NumVectors;
+    }
+
+    // These FP operations are supported with a single vector instruction for
+    // double (the base implementation assumes float generally costs 2). For
+    // FP128, the scalar cost is 1, and there is no overhead since the values
+    // are already in scalar registers.
+    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
+      switch (ScalarBits) {
+      case 32: {
+        // Return the cost of multiple scalar invocations plus the cost of
+        // inserting and extracting the values.
+        unsigned ScalarCost =
+          getArithmeticInstrCost(Opcode, Ty->getScalarType());
+        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+        // FIXME: VF 2 for these FP operations is currently just as
+        // expensive as for VF 4.
+        if (VF == 2)
+          Cost *= 2;
+        return Cost;
+      }
+      case 64:
+      case 128:
+        return NumVectors;
+      default:
+        break;
+      }
+    }
+
+    // There is no native support for FRem.
+    if (Opcode == Instruction::FRem) {
+      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+      if (VF == 2 && ScalarBits == 32)
+        Cost *= 2;
+      return Cost;
+    }
+  }
+  else {  // Scalar:
+    // These FP operations are supported with a dedicated instruction for
+    // float, double and fp128 (the base implementation assumes float
+    // generally costs 2).
+    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+      return 1;
+
+    // There is no native support for FRem.
+    if (Opcode == Instruction::FRem)
+      return LIBCALL_COST;
+
+    if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+      return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
+
+    // Or requires one instruction, although it has custom handling for i64.
+    if (Opcode == Instruction::Or)
+      return 1;
+
+    if (Opcode == Instruction::Xor && ScalarBits == 1)
+      // 2 * ipm sequences ; xor ; shift ; compare
+      return 7;
+
+    // An extra extension for narrow types is needed.
+    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+      // sext of op(s) for narrow types
+      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+
+    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+      // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+      return (ScalarBits < 32 ? 4 : 2);
+  }
+
+  // Fallback to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args);
+}
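The scalar division rules above encode how many machine instructions the operation really expands to: 64-bit signed division is a single native divide, 32-bit needs one extend of the dividend, and narrower types need sign extensions of both operands first. A standalone re-derivation of just that rule, useful for sanity-checking the table (the function name is ours, not LLVM's):

#include <cassert>

// Hypothetical mirror of the SDiv/SRem rule in getArithmeticInstrCost():
// cost by element width in bits.
static int scalarSDivCost(unsigned ScalarBits) {
  return ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1);
}

int main() {
  assert(scalarSDivCost(8) == 4);   // sext both operands + divide
  assert(scalarSDivCost(16) == 4);  // ditto
  assert(scalarSDivCost(32) == 2);  // one extend + divide
  assert(scalarSDivCost(64) == 1);  // native divide only
}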
+
+
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                   Type *SubTp) {
+  assert (Tp->isVectorTy());
+  assert (ST->hasVector() && "getShuffleCost() called.");
+  unsigned NumVectors = getNumberOfParts(Tp);
+
+  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+  // FP128 values are always in scalar registers, so there is no work
+  // involved with a shuffle, except for broadcast. In that case register
+  // moves are done with a single instruction per element.
+  if (Tp->getScalarType()->isFP128Ty())
+    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+  switch (Kind) {
+  case TargetTransformInfo::SK_ExtractSubvector:
+    // ExtractSubvector Index indicates start offset.
+
+    // Extracting a subvector from first index is a noop.
+    return (Index == 0 ? 0 : NumVectors);
+
+  case TargetTransformInfo::SK_Broadcast:
+    // Loop vectorizer calls here to figure out the extra cost of
+    // broadcasting a loaded value to all elements of a vector. Since vlrep
+    // loads and replicates with a single instruction, adjust the returned
+    // value.
+    return NumVectors - 1;
+
+  default:
+
+    // SystemZ supports single instruction permutation / replication.
+    return NumVectors;
+  }
+
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+// Return the log2 difference of the element sizes of the two vector types.
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+  unsigned Bits0 = Ty0->getScalarSizeInBits();
+  unsigned Bits1 = Ty1->getScalarSizeInBits();
+
+  if (Bits1 > Bits0)
+    return (Log2_32(Bits1) - Log2_32(Bits0));
+
+  return (Log2_32(Bits0) - Log2_32(Bits1));
+}
+
+// Return the number of instructions needed to truncate SrcTy to DstTy.
+unsigned SystemZTTIImpl::
+getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
+  assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+          "Packing must reduce size of vector type.");
+  assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+          "Packing should not change number of elements.");
+
+  // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+  unsigned NumParts = getNumberOfParts(SrcTy);
+  if (NumParts <= 2)
+    // Up to 2 vector registers can be truncated efficiently with pack or
+    // permute. The latter requires an immediate mask to be loaded, which
+    // typically gets hoisted out of a loop.  TODO: return a good value for
+    // BB-VECTORIZER that includes the immediate loads, which we do not want
+    // to count for the loop vectorizer.
+    return 1;
+
+  unsigned Cost = 0;
+  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+  unsigned VF = SrcTy->getVectorNumElements();
+  for (unsigned P = 0; P < Log2Diff; ++P) {
+    if (NumParts > 1)
+      NumParts /= 2;
+    Cost += NumParts;
+  }
+
+  // Currently, a general mix of permutes and pack instructions is output by
+  // isel, which follow the cost computation above except for this case which
+  // is one instruction less:
+  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
+      DstTy->getScalarSizeInBits() == 8)
+    Cost--;
+
+  return Cost;
+}
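To see the truncation loop in action: packing <8 x i64> down to <8 x i8> starts from four 128-bit vector registers, Log2Diff is 3, and halving the register count each round gives 2 + 1 + 1 = 4 packs, minus one for the special-cased v8i64-to-v8i8 pattern. A standalone replica of just the arithmetic (our naming, not the in-tree function):

#include <cassert>

// Hypothetical mirror of getVectorTruncCost()'s arithmetic. NumParts is how
// many 128-bit vector registers the source occupies; Log2Diff is the log2
// ratio of source to destination element width.
static unsigned truncCost(unsigned NumParts, unsigned Log2Diff, unsigned VF,
                          unsigned SrcBits, unsigned DstBits) {
  if (NumParts <= 2)
    return 1; // One pack or permute suffices.
  unsigned Cost = 0;
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts; // One pack per surviving register each round.
  }
  if (VF == 8 && SrcBits == 64 && DstBits == 8)
    --Cost; // isel emits one instruction fewer for this pattern.
  return Cost;
}

int main() {
  // <8 x i64> -> <8 x i8>: 4 parts, Log2Diff 3: 2 + 1 + 1 = 4, minus 1.
  assert(truncCost(4, 3, 8, 64, 8) == 3);
  // <16 x i32> -> <16 x i8>: 4 parts, Log2Diff 2: 2 + 1 = 3.
  assert(truncCost(4, 2, 16, 32, 8) == 3);
}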
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy).
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+          "Should only be called with vector types.");
+
+  unsigned PackCost = 0;
+  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
+  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
+  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+  if (SrcScalarBits > DstScalarBits)
+    // The bitmask will be truncated.
+    PackCost = getVectorTruncCost(SrcTy, DstTy);
+  else if (SrcScalarBits < DstScalarBits) {
+    unsigned DstNumParts = getNumberOfParts(DstTy);
+    // Each vector select needs its part of the bitmask unpacked.
+    PackCost = Log2Diff * DstNumParts;
+    // Extra cost for moving part of the mask before unpacking.
+    PackCost += DstNumParts - 1;
+  }
+
+  return PackCost;
+}
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction.
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+  Type *OpTy = nullptr;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+    OpTy = CI->getOperand(0)->getType();
+  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+    if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+      if (isa<CmpInst>(LogicI->getOperand(1)))
+        OpTy = CI0->getOperand(0)->getType();
+
+  if (OpTy != nullptr) {
+    if (VF == 1) {
+      assert (!OpTy->isVectorTy() && "Expected scalar type");
+      return OpTy;
+    }
+    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
+    // be either scalar or already vectorized with the same or a lesser VF.
+    Type *ElTy = OpTy->getScalarType();
+    return VectorType::get(ElTy, VF);
+  }
+
+  return nullptr;
+}
+
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                     const Instruction *I) {
+  unsigned DstScalarBits = Dst->getScalarSizeInBits();
+  unsigned SrcScalarBits = Src->getScalarSizeInBits();
+
+  if (Src->isVectorTy()) {
+    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+    assert (Dst->isVectorTy());
+    unsigned VF = Src->getVectorNumElements();
+    unsigned NumDstVectors = getNumberOfParts(Dst);
+    unsigned NumSrcVectors = getNumberOfParts(Src);
+
+    if (Opcode == Instruction::Trunc) {
+      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+        return 0; // Check for NOOP conversions.
+      return getVectorTruncCost(Src, Dst);
+    }
+
+    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+      if (SrcScalarBits >= 8) {
+        // ZExt/SExt will be handled with one unpack per doubling of width.
+        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+        // For types that span multiple vector registers, some additional
+        // instructions are used to set up the unpacking.
+        unsigned NumSrcVectorOps =
+          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+                          : (NumDstVectors / 2));
+
+        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+      }
+      else if (SrcScalarBits == 1) {
+        // This should be an extension of a compare i1 result.
+        // If we know the widths of the compared operands, get the cost of
+        // converting the bitmask to Dst. Otherwise assume the same widths.
+        unsigned Cost = 0;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+        if (CmpOpTy != nullptr)
+          Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+        if (Opcode == Instruction::ZExt)
+          // One 'vn' per dst vector with an immediate mask.
+          Cost += NumDstVectors;
+        return Cost;
+      }
+    }
+
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+      // TODO: Fix the base implementation, which could simplify things a bit
+      // here (it seems to miss differentiating between scalar/vector types).
+
+      // Only 64 bit vector conversions are natively supported.
+      if (SrcScalarBits == 64 && DstScalarBits == 64)
+        return NumDstVectors;
+
+      // Return the cost of multiple scalar invocations plus the cost of
+      // inserting and extracting the values. The base implementation does
+      // not realize that float->int gets scalarized.
+      unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+                                             Src->getScalarType());
+      unsigned TotCost = VF * ScalarCost;
+      bool NeedsInserts = true, NeedsExtracts = true;
+      // FP128 registers do not get inserted or extracted.
+      if (DstScalarBits == 128 &&
+          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+        NeedsInserts = false;
+      if (SrcScalarBits == 128 &&
+          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+        NeedsExtracts = false;
+
+      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+        TotCost *= 2;
+
+      return TotCost;
+    }
+
+    if (Opcode == Instruction::FPTrunc) {
+      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
+        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+      else // double -> float
+        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+    }
+
+    if (Opcode == Instruction::FPExt) {
+      if (SrcScalarBits == 32 && DstScalarBits == 64) {
+        // float -> double is very rare and currently unoptimized. Instead of
+        // using vldeb, which can do two at a time, all conversions are
+        // scalarized.
+        return VF * 2;
+      }
+      // -> fp128. VF * lxdb/lxeb + extraction of elements.
+      return VF + getScalarizationOverhead(Src, false, true);
+    }
+  }
+  else { // Scalar
+    assert (!Dst->isVectorTy());
+
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        Src->isIntegerTy(1)) {
+      // This should be an extension of a compare i1 result, which is done
+      // with ipm and a varying sequence of instructions.
+      unsigned Cost = 0;
+      if (Opcode == Instruction::SExt)
+        Cost = (DstScalarBits < 64 ? 3 : 4);
+      if (Opcode == Instruction::ZExt)
+        Cost = 3;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+        // If operands of an fp-type were compared, this costs +1.
+        Cost++;
+
+      return Cost;
+    }
+  }
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
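A concrete instance of the unpack formula: zext <16 x i8> to <16 x i32> needs NumUnpacks = 2, occupies one source register and four destination registers, so the setup term is 4 - 1 = 3 and the total is 2*4 + 3 = 11 instructions. The check below re-runs that arithmetic standalone (names are ours, not LLVM's):

#include <cassert>

// Hypothetical mirror of the ZExt/SExt branch of getCastInstrCost() for
// element widths >= 8: one vector unpack per doubling of element width,
// plus setup when the result spans several vector registers.
static unsigned vecExtCost(unsigned NumUnpacks, unsigned NumSrcVectors,
                           unsigned NumDstVectors) {
  unsigned NumSrcVectorOps = NumUnpacks > 1 ? NumDstVectors - NumSrcVectors
                                            : NumDstVectors / 2;
  return NumUnpacks * NumDstVectors + NumSrcVectorOps;
}

int main() {
  // zext <16 x i8> -> <16 x i32>: 2 unpacks, 1 src reg, 4 dst regs.
  assert(vecExtCost(2, 1, 4) == 11);
  // zext <4 x i32> -> <4 x i64>: 1 unpack, 1 src reg, 2 dst regs.
  assert(vecExtCost(1, 1, 2) == 3);
}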
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy, const Instruction *I) {
+  if (ValTy->isVectorTy()) {
+    assert (ST->hasVector() &&
+            "getCmpSelInstrCost() called with vector type.");
+    assert (CondTy == nullptr || CondTy->isVectorTy());
+    unsigned VF = ValTy->getVectorNumElements();
+
+    // Called with a compare instruction.
+    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+      unsigned PredicateExtraCost = 0;
+      if (I != nullptr) {
+        // Some predicates cost one or two extra instructions.
+        switch (cast<CmpInst>(I)->getPredicate()) {
+        case CmpInst::Predicate::ICMP_NE:
+        case CmpInst::Predicate::ICMP_UGE:
+        case CmpInst::Predicate::ICMP_ULE:
+        case CmpInst::Predicate::ICMP_SGE:
+        case CmpInst::Predicate::ICMP_SLE:
+          PredicateExtraCost = 1;
+          break;
+        case CmpInst::Predicate::FCMP_ONE:
+        case CmpInst::Predicate::FCMP_ORD:
+        case CmpInst::Predicate::FCMP_UEQ:
+        case CmpInst::Predicate::FCMP_UNO:
+          PredicateExtraCost = 2;
+          break;
+        default:
+          break;
+        }
+      }
+
+      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+      // floats.  FIXME: <2 x float> generates the same code as <4 x float>.
+      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+      unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+
+      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
+      return Cost;
+    }
+    else { // Called with a select instruction.
+      assert (Opcode == Instruction::Select);
+
+      // We can figure out the extra cost of packing / unpacking if the
+      // instruction was passed and the compare instruction is found.
+      unsigned PackCost = 0;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+      if (CmpOpTy != nullptr)
+        PackCost =
+          getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+      return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+    }
+  }
+  else { // Scalar
+    switch (Opcode) {
+    case Instruction::ICmp: {
+      unsigned Cost = 1;
+      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+        Cost += 2; // extend both operands
+      return Cost;
+    }
+    case Instruction::Select:
+      if (ValTy->isFloatingPointTy())
+        return 4; // No load-on-condition for FP, so this costs a conditional jump.
+      return 1; // Load On Condition.
+    }
+  }
+
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
+
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  // vlvgp will insert two grs into a vector register, so only count half the
+  // number of instructions.
+  if (Opcode == Instruction::InsertElement &&
+      Val->getScalarType()->isIntegerTy(64))
+    return ((Index % 2 == 0) ? 1 : 0);
+
+  if (Opcode == Instruction::ExtractElement) {
+    int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+
+    // Give a slight penalty for moving out of vector pipeline to FXU unit.
+    if (Index == 0 && Val->getScalarType()->isIntegerTy())
+      Cost += 1;
+
+    return Cost;
+  }
+
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace,
+                                    const Instruction *I) {
+  assert(!Src->isVoidTy() && "Invalid type");
+
+  if (!Src->isVectorTy() && Opcode == Instruction::Load &&
+      I != nullptr && I->hasOneUse()) {
+    const Instruction *UserI = cast<Instruction>(*I->user_begin());
+    unsigned Bits = Src->getScalarSizeInBits();
+    bool FoldsLoad = false;
+    switch (UserI->getOpcode()) {
+    case Instruction::ICmp:
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      // This also makes sense for float operations, but disabled for now due
+      // to regressions.
+      // case Instruction::FCmp:
+      // case Instruction::FAdd:
+      // case Instruction::FSub:
+      // case Instruction::FMul:
+      // case Instruction::FDiv:
+      FoldsLoad = (Bits == 32 || Bits == 64);
+      break;
+    }
+
+    if (FoldsLoad) {
+      assert (UserI->getNumOperands() == 2 &&
+              "Expected to only handle binops.");
+
+      // UserI can't fold two loads, so in that case return 0 cost only
+      // half of the time.
+      for (unsigned i = 0; i < 2; ++i) {
+        if (UserI->getOperand(i) == I)
+          continue;
+        if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
+          if (LI->hasOneUse())
+            return i == 0;
+        }
+      }
+
+      return 0;
+    }
+  }
+
+  unsigned NumOps = getNumberOfParts(Src);
+
+  if (Src->getScalarSizeInBits() == 128)
+    // 128 bit scalars are held in a pair of two 64 bit registers.
+    NumOps *= 2;
+
+  return NumOps;
+}
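The load-folding logic above reflects that SystemZ arithmetic can take one memory operand directly (an RX-form instruction such as ag, add from storage), so a load whose single user is such a binop is essentially free; when both operands are single-use loads only one can fold, which the "return i == 0" coin flip approximates. In C terms, roughly this kind of source is expected to lower to one combined instruction rather than a separate load plus register add (a sketch of the motivation, not LLVM code):

// On SystemZ, the load of *p can fold into the add itself, so the cost
// model charges the load 0:
long addFromMemory(long a, const long *p) {
  return a + *p; // one load-and-add, not load then add
}

// With two single-use loads, only one side can fold; the model returns
// 0 for one of them and 1 for the other, averaging the real cost:
long addTwoLoads(const long *p, const long *q) {
  return *p + *q;
}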
+
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
+     (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
+  assert (WideBits > 0 && "Could not compute size of vector");
+  int NumWideParts =
+    ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+
+  // How many source vectors are handled to produce a vectorized operand?
+  int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
+  int NumSrcParts =
+    ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+
+  // A Load group may have gaps.
+  unsigned NumOperands =
+    ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+
+  // Each needed permute takes two vectors as input.
+  if (NumSrcParts > 1)
+    NumSrcParts--;
+  int NumPermutes = NumSrcParts * NumOperands;
+
+  // Cost of load/store operations and the permutations needed.
+  return NumWideParts + NumPermutes;
+}
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f7d2d827f11b..3766ed45b8c4 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
   const SystemZSubtarget *getST() const { return ST; }
   const SystemZTargetLowering *getTLI() const { return TLI; }
 
+  unsigned const LIBCALL_COST = 30;
+
 public:
   explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -53,6 +55,32 @@ public:
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
+  bool supportsEfficientVectorElementLoadStore() { return true; }
+  bool enableInterleavedAccessVectorization() { return true; }
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+  unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace, const Instruction *I = nullptr);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
+
   /// @}
 };
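LIBCALL_COST = 30 in the header is the flat charge used above for operations with no machine instruction at all: a scalar frem, for instance, becomes a call to fmod or fmodf, and the cost model only needs a number large enough to keep the vectorizers from treating such calls as cheap. A small illustration of the kind of source that hits this path (ours, not from the patch):

#include <cmath>

// "x % y" on floating point is LLVM's frem; SystemZ has no instruction for
// it, so codegen emits a libcall (fmodf here) and the TTI charges
// LIBCALL_COST per scalar element instead of a per-instruction cost.
float fremExample(float x, float y) {
  return std::fmod(x, y);
}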